WIP stable ids: IdentifierFactory & unit test

This commit is contained in:
Claudio Atzori 2020-10-06 18:55:23 +02:00
parent 642b459552
commit 1abcabb6e6
10 changed files with 116 additions and 39 deletions

View File

@ -5,39 +5,20 @@ import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashSet;
import java.util.Objects; import java.util.Objects;
import java.util.Set;
/** /**
* Factory class for OpenAIRE identifiers in the Graph * Factory class for OpenAIRE identifiers in the Graph
*/ */
public class IdentifierFactory implements Serializable { public class IdentifierFactory implements Serializable {
private static final Logger log = LoggerFactory.getLogger(IdentifierFactory.class);
public static final String ID_SEPARATOR = "::"; public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|"; public static final String ID_PREFIX_SEPARATOR = "|";
public final static String ID_REGEX = "^[0-9][0-9]\\"+ID_PREFIX_SEPARATOR+".{12}"+ID_SEPARATOR+"[a-zA-Z0-9]{32}$"; public final static String ID_REGEX = "^[0-9][0-9]\\"+ID_PREFIX_SEPARATOR+".{12}"+ID_SEPARATOR+"[a-zA-Z0-9]{32}$";
public static final int ID_PREFIX_LEN = 12; public static final int ID_PREFIX_LEN = 12;
public static Set<String> acceptedPidTypes = new HashSet<>();
static {
acceptedPidTypes.add("doi");
acceptedPidTypes.add("doi");
acceptedPidTypes.add("doi");
acceptedPidTypes.add("doi");
acceptedPidTypes.add("doi");
acceptedPidTypes.add("doi");
}
public static <T extends OafEntity> String createIdentifier(T entity) { public static <T extends OafEntity> String createIdentifier(T entity) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
@ -48,14 +29,14 @@ public class IdentifierFactory implements Serializable {
.getPid() .getPid()
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> acceptedPidTypes.contains(s.getQualifier().getClassid())) .filter(s -> PidType.isValid(s.getQualifier().getClassid()))
.max(new PidComparator<T>(entity)) .min(new PidComparator<>(entity))
.map(s -> idFromPid(entity, s)) .map(s -> idFromPid(entity, s))
.map(IdentifierFactory::verifyIdSyntax) .map(IdentifierFactory::verifyIdSyntax)
.orElseGet(entity::getId); .orElseGet(entity::getId);
} }
protected static String verifyIdSyntax(String s) { private static String verifyIdSyntax(String s) {
if(StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { if(StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
throw new RuntimeException(String.format("malformed id: '%s'", s)); throw new RuntimeException(String.format("malformed id: '%s'", s));
} else { } else {
@ -74,7 +55,7 @@ public class IdentifierFactory implements Serializable {
} }
private static String normalizePidValue(String value) { private static String normalizePidValue(String value) {
//TODO more aggressive cleaning? keep only alphanum and punctation? //TODO more aggressive cleaning? keep only alphanum and punctuation?
return value.toLowerCase().replaceAll(" ", ""); return value.toLowerCase().replaceAll(" ", "");
} }

View File

@ -27,8 +27,8 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
if (right == null) if (right == null)
return -1; return -1;
String lClass = left.getQualifier().getClassid(); PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
String rClass = right.getQualifier().getClassid(); PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass)) if (lClass.equals(rClass))
return 0; return 0;
@ -44,39 +44,69 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
return lClass.compareTo(rClass); return lClass.compareTo(rClass);
} }
private int compareResultPids(String lClass, String rClass) { private int compareResultPids(PidType lClass, PidType rClass) {
if (lClass.equals("doi")) if (lClass.equals(PidType.doi))
return -1; return -1;
if (rClass.equals("doi")) if (rClass.equals(PidType.doi))
return 1; return 1;
if (lClass.equals("pmid")) if (lClass.equals(PidType.pmid))
return -1; return -1;
if (rClass.equals("pmid")) if (rClass.equals(PidType.pmid))
return 1; return 1;
if (lClass.equals("pmc")) if (lClass.equals(PidType.pmc))
return -1; return -1;
if (rClass.equals("pmc")) if (rClass.equals(PidType.pmc))
return 1;
if (lClass.equals(PidType.handle))
return -1;
if (rClass.equals(PidType.handle))
return 1;
if (lClass.equals(PidType.arXiv))
return -1;
if (rClass.equals(PidType.arXiv))
return 1;
if (lClass.equals(PidType.NCID))
return -1;
if (rClass.equals(PidType.NCID))
return 1;
if (lClass.equals(PidType.GBIF))
return -1;
if (rClass.equals(PidType.GBIF))
return 1;
if (lClass.equals(PidType.nct))
return -1;
if (rClass.equals(PidType.nct))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1; return 1;
return 0; return 0;
} }
private int compareOrganizationtPids(String lClass, String rClass) { private int compareOrganizationtPids(PidType lClass, PidType rClass) {
if (lClass.equals("GRID")) if (lClass.equals(PidType.GRID))
return -1; return -1;
if (rClass.equals("GRID")) if (rClass.equals(PidType.GRID))
return 1; return 1;
if (lClass.equals("mag_id")) if (lClass.equals(PidType.mag_id))
return -1; return -1;
if (rClass.equals("mag_id")) if (rClass.equals(PidType.mag_id))
return 1; return 1;
if (lClass.equals("urn")) if (lClass.equals(PidType.urn))
return -1; return -1;
if (rClass.equals("urn")) if (rClass.equals(PidType.urn))
return 1; return 1;
return 0; return 0;

View File

@ -0,0 +1,17 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import org.apache.commons.lang3.EnumUtils;
/**
 * Persistent identifier (PID) types known to the identifier utilities.
 * Constant names are compared verbatim (case-sensitively) with the type string
 * handed to {@link #isValid(String)}.
 */
public enum PidType {

    // Result
    doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,

    // Organization
    GRID, mag_id, urn;

    /**
     * Tells whether the given string is the exact name of one of the constants.
     *
     * @param type the candidate type name, may be {@code null}
     * @return {@code true} when {@code type} equals the name of a constant,
     *         {@code false} otherwise (including for {@code null})
     */
    public static boolean isValid(String type) {
        if (type == null) {
            return false;
        }
        // Plain scan over the constants: an unknown type is rejected without
        // going through valueOf() and its IllegalArgumentException.
        for (PidType candidate : values()) {
            if (candidate.name().equals(type)) {
                return true;
            }
        }
        return false;
    }
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Checks the identifiers produced by {@code IdentifierFactory.createIdentifier}
 * for publication records loaded from JSON fixtures on the test classpath.
 */
public class IdentifierFactoryTest {

    /** Shared mapper; unknown JSON properties are ignored while reading the fixtures. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
        .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    @Test
    public void testCreateIdentifierForPublication() throws IOException {
        // Fixtures expected to yield an identifier derived from one of their pids.
        verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
        verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
        verifyIdentifier("publication_urn.json", "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));

        // Fixtures expected to keep this one fixed identifier.
        final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
        verifyIdentifier("publication_3.json", defaultID);
        verifyIdentifier("publication_4.json", defaultID);
        verifyIdentifier("publication_5.json", defaultID);
    }

    /**
     * Loads a publication fixture and asserts the identifier created for it.
     *
     * @param filename   resource name, resolved relative to this test class
     * @param expectedID identifier the factory is expected to return
     * @throws IOException if the fixture cannot be read or parsed
     */
    protected void verifyIdentifier(String filename, String expectedID) throws IOException {
        // try-with-resources closes the stream; a null resource is tolerated by the
        // construct, so the explicit assertion below reports a missing fixture by name.
        try (InputStream in = getClass().getResourceAsStream(filename)) {
            assertNotNull(in, "missing test resource: " + filename);

            // Decode explicitly as UTF-8 instead of the platform default charset.
            final String json = IOUtils.toString(in, StandardCharsets.UTF_8);
            final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);

            final String id = IdentifierFactory.createIdentifier(pub);
            assertNotNull(id);
            assertEquals(expectedID, id);
        }
    }
}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]}