IdentifierFactory implementation based on the list of datasources authoritative for a given pid type

This commit is contained in:
Claudio Atzori 2021-03-09 17:11:50 +01:00
parent b3f3b895e5
commit 01630f638d
13 changed files with 289 additions and 58 deletions

View File

@ -14,6 +14,8 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"; public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10.)";
public static final String DOI_PREFIX = "10.";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
@ -263,6 +265,29 @@ public class CleaningFunctions {
classid, classname, scheme, scheme); classid, classname, scheme, scheme);
} }
/**
 * Utility method that filters PID values on a per-type basis.
 *
 * The value is checked after removing every whitespace character. A PID is rejected when it is null,
 * when it carries no qualifier / classid (its type cannot be determined), or when its value is blank.
 * For the "doi" type the whitespace-free value must start with {@link #DOI_PREFIX}; all other types
 * are accepted as long as the value is not blank.
 *
 * @param pid the PID whose value will be checked.
 * @return true when the PID value is acceptable for its type, false otherwise.
 */
private static boolean filterPid(StructuredProperty pid) {
	// a PID without a type cannot be validated: reject it instead of failing with a NullPointerException
	if (pid == null || pid.getQualifier() == null) {
		return false;
	}
	final String classid = pid.getQualifier().getClassid();
	if (classid == null) {
		// switching on a null String would throw
		return false;
	}
	// String#replaceAll is equivalent to the deprecated StringUtils#replaceAll for non-null input
	final String value = Optional
		.ofNullable(pid.getValue())
		.map(s -> s.replaceAll("\\s", ""))
		.orElse("");
	if (StringUtils.isBlank(value)) {
		return false;
	}
	switch (classid) {
		// TODO add cleaning for more PID types as needed
		case "doi":
			return value.startsWith(DOI_PREFIX);
		default:
			return true;
	}
}
/** /**
* Utility method that normalises PID values on a per-type basis. * Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised. * @param pid the PID whose value will be normalised.
@ -277,7 +302,7 @@ public class CleaningFunctions {
// TODO add cleaning for more PID types as needed // TODO add cleaning for more PID types as needed
case "doi": case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10.")); pid.setValue(value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX));
break; break;
} }
return pid; return pid;

View File

@ -9,9 +9,9 @@ import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;

View File

@ -1,23 +1,23 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.*;
import java.util.Map; import java.util.function.Function;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.HashBiMap; import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
/** /**
@ -35,44 +35,37 @@ public class IdentifierFactory implements Serializable {
public static final int ID_PREFIX_LEN = 12; public static final int ID_PREFIX_LEN = 12;
public static final HashBiMap<String, String> PID_AUTHORITY = HashBiMap.create(2); /**
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID
*/
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
static { static {
PID_AUTHORITY.put(ModelConstants.CROSSREF_ID, "Crossref"); PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
PID_AUTHORITY.put(ModelConstants.DATACITE_ID, "Datacite"); PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central");
PID_AUTHORITY.put(PidType.pmid, HashBiMap.create());
PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central");
} }
/** /**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
* when no PID is available * entity T. Returns entity.id when none of the PIDs meets the selection criteria.
*
* @param entity the entity providing PIDs and a default ID. * @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported. * @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @param md5 indicates whether should hash the PID value or not. * @param md5 indicates whether should hash the PID value or not.
* @return an identifier from the most relevant PID, entity.id otherwise * @return an identifier from the most relevant PID, entity.id otherwise
*/ */
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) { public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
return entity.getId();
}
if (Optional.ofNullable( final Map<String, List<StructuredProperty>> pids = extractPids(entity);
entity.getCollectedfrom())
.map(c -> c.stream()
.noneMatch(cf -> PID_AUTHORITY.containsKey(cf.getKey()) || PID_AUTHORITY.containsValue(cf.getValue())))
.orElse(true)) {
return entity.getId();
}
Map<String, List<StructuredProperty>> pids = entity
.getPid()
.stream()
.map(CleaningFunctions::normalizePidValue)
.filter(IdentifierFactory::pidFilter)
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toList())));
return pids return pids
.values() .values()
@ -93,6 +86,57 @@ public class IdentifierFactory implements Serializable {
.orElseGet(entity::getId); .orElseGet(entity::getId);
} }
/**
 * Collects the PIDs usable to build the identifier of the given entity, grouped by PID type (qualifier classid).
 *
 * For Result entities the PIDs are read from the instances, keeping only those provided by a datasource
 * declared in PID_AUTHORITY as authoritative for the PID type. For any other entity the PIDs are read
 * from the entity itself. In both cases the surviving PIDs are normalised and filtered with pidFilter.
 * Missing PID lists, null elements and PIDs without a qualifier classid are skipped, so that the caller
 * can fall back on the entity id instead of failing with a NullPointerException.
 *
 * @param entity the entity providing the PIDs.
 * @param <T> the specific entity type.
 * @return the selected PIDs grouped by type, an empty map when none is available.
 */
private static <T extends OafEntity> Map<String, List<StructuredProperty>> extractPids(T entity) {
	final Stream<StructuredProperty> candidates;
	if (entity instanceof Result) {
		candidates = Optional
			.ofNullable(((Result) entity).getInstance())
			.map(Collection::stream)
			.orElseGet(Stream::empty)
			.filter(Objects::nonNull)
			.flatMap(
				i -> Optional
					.ofNullable(i.getPid())
					.map(Collection::stream)
					.orElseGet(Stream::empty)
					.filter(IdentifierFactory::hasPidType)
					// filter away PIDs provided by a DS that is not considered an authority for the
					// given PID Type
					.filter(p -> isFromPidAuthority(i.getCollectedfrom(), p)));
	} else {
		// entity.getPid() may be null: treat it as "no PIDs available"
		candidates = Optional
			.ofNullable(entity.getPid())
			.map(Collection::stream)
			.orElseGet(Stream::empty)
			.filter(IdentifierFactory::hasPidType);
	}
	return candidates
		.map(CleaningFunctions::normalizePidValue)
		.filter(IdentifierFactory::pidFilter)
		.collect(Collectors.groupingBy(p -> p.getQualifier().getClassid()));
}

/**
 * Tells whether the given PID is non-null and declares a type (qualifier classid), needed as grouping key.
 */
private static boolean hasPidType(StructuredProperty pid) {
	return Objects.nonNull(pid)
		&& Objects.nonNull(pid.getQualifier())
		&& Objects.nonNull(pid.getQualifier().getClassid());
}

/**
 * Tells whether the datasource identified by collectedFrom (matched by id or by name) is declared in
 * PID_AUTHORITY as authoritative for the type of the given PID.
 */
private static boolean isFromPidAuthority(KeyValue collectedFrom, StructuredProperty pid) {
	if (Objects.isNull(collectedFrom)) {
		return false;
	}
	final PidType pidType = PidType.tryValueOf(pid.getQualifier().getClassid());
	final Map<String, String> authorities = PID_AUTHORITY.get(pidType);
	return Objects.nonNull(authorities)
		&& (authorities.containsKey(collectedFrom.getKey())
			|| authorities.containsValue(collectedFrom.getValue()));
}
/** /**
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
*/ */

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Set;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -12,6 +14,8 @@ public class BlackListProviderTest {
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist()); Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi")); Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0); Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
Assertions.assertNull(PidBlacklistProvider.getBlacklist("xxx")); final Set<String> xxx = PidBlacklistProvider.getBlacklist("xxx");
Assertions.assertNotNull(xxx);
Assertions.assertEquals(0, xxx.size());
} }
} }

View File

@ -23,28 +23,35 @@ public class IdentifierFactoryTest {
public void testCreateIdentifierForPublication() throws IOException { public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier( verifyIdentifier(
"publication_doi1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false); "publication_doi1.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier( verifyIdentifier(
"publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"), true); "publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"), true);
verifyIdentifier( verifyIdentifier(
"publication_urn1.json", "publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"), true);
verifyIdentifier(
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
verifyIdentifier(
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, true); verifyIdentifier("publication_3.json", defaultID, true);
verifyIdentifier("publication_4.json", defaultID, true); verifyIdentifier("publication_4.json", defaultID, true);
verifyIdentifier("publication_5.json", defaultID, true); verifyIdentifier("publication_5.json", defaultID, true);
} }
@Test @Test
public void testCreateIdentifierForPublicationNoHash() throws IOException { public void testCreateIdentifierForPublicationNoHash() throws IOException {
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2011.03.013", false); verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false); verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
verifyIdentifier("publication_pmc1.json", "50|pmc_________::21459329", false); verifyIdentifier("publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
verifyIdentifier( verifyIdentifier(
"publication_urn1.json", "50|urn_________::urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2", false); "publication_urn1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, false); verifyIdentifier("publication_3.json", defaultID, false);

View File

@ -1 +1,33 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[ {"qualifier":{"classid":"doi"},"value":"10.12739/10.12739"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]} {
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
"value": "Crossref"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -1 +1,37 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}],"collectedfrom":[{"key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2","value":"Crossref"}]} {
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
"value": "Crossref"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -0,0 +1,37 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|openaire____::1234",
"value": "Zenodo"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
}
]
},
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
]
}
]
}

View File

@ -0,0 +1,21 @@
{
"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"instance": [
{
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central"
},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2010.03.013"
},
{
"qualifier":{"classid":"pmc"},
"value":"21459329"
}
]
}
]
}

View File

@ -1 +1,23 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]} {
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
"pid": [
{
"qualifier": {
"classid": "urn"
},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {
"classid": "scp-number"
},
"value": "79953761260"
},
{
"qualifier": {
"classid": "pmcid"
},
"value": "21459329"
}
]
}

View File

@ -14,6 +14,9 @@ public class ModelConstants {
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"; public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"; public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String EUROPE_PUBMED_CENTRAL_ID = "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c";
public static String PUBMED_CENTRAL_ID = "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357";
public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource"; public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";

View File

@ -236,11 +236,11 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected Relation getRelation(final String source, protected Relation getRelation(final String source,
final String target, final String target,
final String relType, final String relType,
final String subRelType, final String subRelType,
final String relClass, final String relClass,
final OafEntity entity) { final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null); return getRelation(source, target, relType, subRelType, relClass, entity, null);
} }
@ -250,7 +250,7 @@ public abstract class AbstractMdRecordToOafMapper {
final String subRelType, final String subRelType,
final String relClass, final String relClass,
final OafEntity entity, final OafEntity entity,
final String validationDate) { final String validationDate) {
final Relation rel = new Relation(); final Relation rel = new Relation();
rel.setRelType(relType); rel.setRelType(relType);
rel.setSubRelType(subRelType); rel.setSubRelType(subRelType);

View File

@ -71,7 +71,7 @@ public class MappersTest {
assertValidId(p.getId()); assertValidId(p.getId());
assertEquals(2, p.getOriginalId().size()); assertEquals(1, p.getOriginalId().size());
assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718")); assertTrue(p.getOriginalId().contains("10.3897/oneeco.2.e13718"));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
@ -186,7 +186,7 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2); final Relation r2 = (Relation) list.get(2);
assertValidId(d.getId()); assertValidId(d.getId());
assertEquals(2, d.getOriginalId().size()); assertEquals(1, d.getOriginalId().size());
assertTrue(d.getOriginalId().contains("oai:zenodo.org:3234526")); assertTrue(d.getOriginalId().contains("oai:zenodo.org:3234526"));
assertValidId(d.getCollectedfrom().get(0).getKey()); assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));