IdentifierFactory: in case a record provides more than one pid of the same type, the lexicographically lowest value is chosen as the best pick

This commit is contained in:
Claudio Atzori 2020-11-23 19:16:40 +01:00
parent fcbb05eb21
commit c016cc050a
10 changed files with 114 additions and 52 deletions

View File

@ -2,7 +2,12 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils;
@ -42,13 +47,27 @@ public class IdentifierFactory implements Serializable {
return entity.getId();
}
return entity
Map<String, List<StructuredProperty>> pids = entity
.getPid()
.stream()
.filter(s -> pidFilter(s))
.collect(
Collectors.groupingBy(p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toList()))
);
return pids
.values()
.stream()
.flatMap(s -> s.stream())
.min(new PidComparator<>(entity))
.map(min -> Optional.ofNullable(pids.get(min.getQualifier().getClassid()))
.map(p -> p.stream()
.sorted(new PidValueComparator())
.findFirst()
.map(s -> idFromPid(entity, s))
.map(IdentifierFactory::verifyIdSyntax)
.orElseGet(entity::getId))
.orElseGet(entity::getId))
.orElseGet(entity::getId);
}

View File

@ -1,25 +1,31 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
public class OrganizationPidComparator implements Comparator<PidType> {
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.GRID))
public int compare(StructuredProperty left, StructuredProperty right) {
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.GRID))
return -1;
if (pRight.equals(PidType.GRID))
if (rClass.equals(PidType.GRID))
return 1;
if (pLeft.equals(PidType.mag_id))
if (lClass.equals(PidType.mag_id))
return -1;
if (pRight.equals(PidType.mag_id))
if (rClass.equals(PidType.mag_id))
return 1;
if (pLeft.equals(PidType.urn))
if (lClass.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
if (rClass.equals(PidType.urn))
return 1;
return 0;

View File

@ -27,28 +27,22 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
if (right == null)
return -1;
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (ModelSupport.isSubClass(entity, Result.class)) {
return compareResultPids(lClass, rClass);
return compareResultPids(left, right);
}
if (ModelSupport.isSubClass(entity, Organization.class)) {
return compareOrganizationtPids(lClass, rClass);
return compareOrganizationtPids(left, right);
}
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
}
private int compareResultPids(PidType lClass, PidType rClass) {
return new ResultPidComparator().compare(lClass, rClass);
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
return new ResultPidComparator().compare(left, right);
}
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
return new OrganizationPidComparator().compare(lClass, rClass);
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
return new OrganizationPidComparator().compare(left, right);
}
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
import java.util.Optional;
/**
 * Orders {@link StructuredProperty} pids by their value, after passing each pid through
 * {@link CleaningFunctions#normalizePidValue}.
 *
 * <p>Ordering rules:
 * <ul>
 * <li>a {@code null} pid sorts after any non-null pid; two {@code null} pids are equal;</li>
 * <li>a pid whose (normalized) value is {@code null} sorts after any pid with a non-null value;
 * two pids with {@code null} values are equal;</li>
 * <li>otherwise the values are compared with {@link String#compareTo(String)}, i.e. the
 * lexicographically lower value comes first.</li>
 * </ul>
 */
public class PidValueComparator implements Comparator<StructuredProperty> {

	/** Natural (lexicographic) ordering of pid values, with {@code null} values placed last. */
	private static final Comparator<String> VALUE_ORDER = Comparator.nullsLast(Comparator.<String> naturalOrder());

	/**
	 * Compares two pids by their normalized value.
	 *
	 * @param left the first pid, may be {@code null}
	 * @param right the second pid, may be {@code null}
	 * @return a negative integer, zero, or a positive integer as {@code left} sorts before, equal to,
	 *         or after {@code right}
	 */
	@Override
	public int compare(StructuredProperty left, StructuredProperty right) {

		if (left == null && right == null)
			return 0;
		if (left == null)
			return 1;
		if (right == null)
			return -1;

		StructuredProperty l = CleaningFunctions.normalizePidValue(left);
		StructuredProperty r = CleaningFunctions.normalizePidValue(right);

		// nullsLast yields 0 when both values are null; the previous Optional chain returned 1 in
		// both directions for that case, which violated the Comparator contract.
		return VALUE_ORDER.compare(l.getValue(), r.getValue());
	}
}

View File

@ -1,55 +1,61 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
public class ResultPidComparator implements Comparator<PidType> {
public class ResultPidComparator implements Comparator<StructuredProperty> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.doi))
public int compare(StructuredProperty left, StructuredProperty right) {
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.doi))
return -1;
if (pRight.equals(PidType.doi))
if (rClass.equals(PidType.doi))
return 1;
if (pLeft.equals(PidType.pmid))
if (lClass.equals(PidType.pmid))
return -1;
if (pRight.equals(PidType.pmid))
if (rClass.equals(PidType.pmid))
return 1;
if (pLeft.equals(PidType.pmc))
if (lClass.equals(PidType.pmc))
return -1;
if (pRight.equals(PidType.pmc))
if (rClass.equals(PidType.pmc))
return 1;
if (pLeft.equals(PidType.handle))
if (lClass.equals(PidType.handle))
return -1;
if (pRight.equals(PidType.handle))
if (rClass.equals(PidType.handle))
return 1;
if (pLeft.equals(PidType.arXiv))
if (lClass.equals(PidType.arXiv))
return -1;
if (pRight.equals(PidType.arXiv))
if (rClass.equals(PidType.arXiv))
return 1;
if (pLeft.equals(PidType.NCID))
if (lClass.equals(PidType.NCID))
return -1;
if (pRight.equals(PidType.NCID))
if (rClass.equals(PidType.NCID))
return 1;
if (pLeft.equals(PidType.GBIF))
if (lClass.equals(PidType.GBIF))
return -1;
if (pRight.equals(PidType.GBIF))
if (rClass.equals(PidType.GBIF))
return 1;
if (pLeft.equals(PidType.nct))
if (lClass.equals(PidType.nct))
return -1;
if (pRight.equals(PidType.nct))
if (rClass.equals(PidType.nct))
return 1;
if (pLeft.equals(PidType.urn))
if (lClass.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
if (rClass.equals(PidType.urn))
return 1;
return 0;

View File

@ -22,10 +22,11 @@ public class IdentifierFactoryTest {
@Test
public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier(
"publication_urn.json",
"publication_urn1.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}