forked from D-Net/dnet-hadoop
WIP stable ids: IdentifierFactory & unit test
This commit is contained in:
parent
642b459552
commit
1abcabb6e6
|
@ -5,39 +5,20 @@ import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Factory class for OpenAIRE identifiers in the Graph
|
||||
*/
|
||||
public class IdentifierFactory implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(IdentifierFactory.class);
|
||||
|
||||
public static final String ID_SEPARATOR = "::";
|
||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||
public final static String ID_REGEX = "^[0-9][0-9]\\"+ID_PREFIX_SEPARATOR+".{12}"+ID_SEPARATOR+"[a-zA-Z0-9]{32}$";
|
||||
public static final int ID_PREFIX_LEN = 12;
|
||||
|
||||
public static Set<String> acceptedPidTypes = new HashSet<>();
|
||||
|
||||
static {
|
||||
acceptedPidTypes.add("doi");
|
||||
acceptedPidTypes.add("doi");
|
||||
acceptedPidTypes.add("doi");
|
||||
acceptedPidTypes.add("doi");
|
||||
acceptedPidTypes.add("doi");
|
||||
acceptedPidTypes.add("doi");
|
||||
|
||||
}
|
||||
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
||||
|
@ -48,14 +29,14 @@ public class IdentifierFactory implements Serializable {
|
|||
.getPid()
|
||||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> acceptedPidTypes.contains(s.getQualifier().getClassid()))
|
||||
.max(new PidComparator<T>(entity))
|
||||
.filter(s -> PidType.isValid(s.getQualifier().getClassid()))
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.map(IdentifierFactory::verifyIdSyntax)
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
protected static String verifyIdSyntax(String s) {
|
||||
private static String verifyIdSyntax(String s) {
|
||||
if(StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||
} else {
|
||||
|
@ -74,7 +55,7 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
|
||||
private static String normalizePidValue(String value) {
|
||||
//TODO more aggressive cleaning? keep only alphanum and punctation?
|
||||
//TODO more aggressive cleaning? keep only alphanum and punctuation?
|
||||
return value.toLowerCase().replaceAll(" ", "");
|
||||
}
|
||||
|
||||
|
|
|
@ -27,8 +27,8 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
|
|||
if (right == null)
|
||||
return -1;
|
||||
|
||||
String lClass = left.getQualifier().getClassid();
|
||||
String rClass = right.getQualifier().getClassid();
|
||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
@ -44,39 +44,69 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
|
|||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
private int compareResultPids(String lClass, String rClass) {
|
||||
if (lClass.equals("doi"))
|
||||
private int compareResultPids(PidType lClass, PidType rClass) {
|
||||
if (lClass.equals(PidType.doi))
|
||||
return -1;
|
||||
if (rClass.equals("doi"))
|
||||
if (rClass.equals(PidType.doi))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("pmid"))
|
||||
if (lClass.equals(PidType.pmid))
|
||||
return -1;
|
||||
if (rClass.equals("pmid"))
|
||||
if (rClass.equals(PidType.pmid))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("pmc"))
|
||||
if (lClass.equals(PidType.pmc))
|
||||
return -1;
|
||||
if (rClass.equals("pmc"))
|
||||
if (rClass.equals(PidType.pmc))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.handle))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.handle))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.arXiv))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.arXiv))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.NCID))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.NCID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.GBIF))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.GBIF))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.nct))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.nct))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.urn))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private int compareOrganizationtPids(String lClass, String rClass) {
|
||||
if (lClass.equals("GRID"))
|
||||
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
||||
if (lClass.equals(PidType.GRID))
|
||||
return -1;
|
||||
if (rClass.equals("GRID"))
|
||||
if (rClass.equals(PidType.GRID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("mag_id"))
|
||||
if (lClass.equals(PidType.mag_id))
|
||||
return -1;
|
||||
if (rClass.equals("mag_id"))
|
||||
if (rClass.equals(PidType.mag_id))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("urn"))
|
||||
if (lClass.equals(PidType.urn))
|
||||
return -1;
|
||||
if (rClass.equals("urn"))
|
||||
if (rClass.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import org.apache.commons.lang3.EnumUtils;
|
||||
|
||||
public enum PidType {
|
||||
|
||||
// Result
|
||||
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
||||
|
||||
// Organization
|
||||
GRID, mag_id, urn;
|
||||
|
||||
public static boolean isValid(String type) {
|
||||
return EnumUtils.isValidEnum(PidType.class, type);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class IdentifierFactoryTest {
|
||||
|
||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
public void testCreateIdentifierForPublication() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||
verifyIdentifier("publication_urn.json", "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID);
|
||||
verifyIdentifier("publication_4.json", defaultID);
|
||||
verifyIdentifier("publication_5.json", defaultID);
|
||||
}
|
||||
|
||||
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||
|
||||
String id = IdentifierFactory.createIdentifier(pub);
|
||||
|
||||
assertNotNull(id);
|
||||
assertEquals(expectedID, id);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]}
|
Loading…
Reference in New Issue