IdentifierFactory: in case a record provides more than one pid of the same type, the lexicographically lowest value is chosen as the best pick

This commit is contained in:
Claudio Atzori 2020-11-23 19:16:40 +01:00
parent fcbb05eb21
commit c016cc050a
10 changed files with 114 additions and 52 deletions

View File

@ -2,7 +2,12 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable; import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -42,14 +47,28 @@ public class IdentifierFactory implements Serializable {
return entity.getId(); return entity.getId();
} }
return entity Map<String, List<StructuredProperty>> pids = entity
.getPid() .getPid()
.stream() .stream()
.filter(s -> pidFilter(s)) .filter(s -> pidFilter(s))
.min(new PidComparator<>(entity)) .collect(
.map(s -> idFromPid(entity, s)) Collectors.groupingBy(p -> p.getQualifier().getClassid(),
.map(IdentifierFactory::verifyIdSyntax) Collectors.mapping(p -> p, Collectors.toList()))
.orElseGet(entity::getId); );
return pids
.values()
.stream()
.flatMap(s -> s.stream())
.min(new PidComparator<>(entity))
.map(min -> Optional.ofNullable(pids.get(min.getQualifier().getClassid()))
.map(p -> p.stream()
.sorted(new PidValueComparator())
.findFirst()
.map(s -> idFromPid(entity, s))
.orElseGet(entity::getId))
.orElseGet(entity::getId))
.orElseGet(entity::getId);
} }
protected static boolean pidFilter(StructuredProperty s) { protected static boolean pidFilter(StructuredProperty s) {

View File

@ -1,25 +1,31 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator; import java.util.Comparator;
public class OrganizationPidComparator implements Comparator<PidType> { public class OrganizationPidComparator implements Comparator<StructuredProperty> {
@Override @Override
public int compare(PidType pLeft, PidType pRight) { public int compare(StructuredProperty left, StructuredProperty right) {
if (pLeft.equals(PidType.GRID))
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.GRID))
return -1; return -1;
if (pRight.equals(PidType.GRID)) if (rClass.equals(PidType.GRID))
return 1; return 1;
if (pLeft.equals(PidType.mag_id)) if (lClass.equals(PidType.mag_id))
return -1; return -1;
if (pRight.equals(PidType.mag_id)) if (rClass.equals(PidType.mag_id))
return 1; return 1;
if (pLeft.equals(PidType.urn)) if (lClass.equals(PidType.urn))
return -1; return -1;
if (pRight.equals(PidType.urn)) if (rClass.equals(PidType.urn))
return 1; return 1;
return 0; return 0;

View File

@ -27,28 +27,22 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
if (right == null) if (right == null)
return -1; return -1;
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (ModelSupport.isSubClass(entity, Result.class)) { if (ModelSupport.isSubClass(entity, Result.class)) {
return compareResultPids(lClass, rClass); return compareResultPids(left, right);
} }
if (ModelSupport.isSubClass(entity, Organization.class)) { if (ModelSupport.isSubClass(entity, Organization.class)) {
return compareOrganizationtPids(lClass, rClass); return compareOrganizationtPids(left, right);
} }
// Else (but unlikely), lexicographical ordering will do. // Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass); return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
} }
private int compareResultPids(PidType lClass, PidType rClass) { private int compareResultPids(StructuredProperty left, StructuredProperty right) {
return new ResultPidComparator().compare(lClass, rClass); return new ResultPidComparator().compare(left, right);
} }
private int compareOrganizationtPids(PidType lClass, PidType rClass) { private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
return new OrganizationPidComparator().compare(lClass, rClass); return new OrganizationPidComparator().compare(left, right);
} }
} }

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
import java.util.Optional;
/**
 * Orders {@link StructuredProperty} PIDs by their value, lexicographically ascending, after both
 * operands have been passed through {@link CleaningFunctions#normalizePidValue}.
 *
 * <p>Null handling: a {@code null} property sorts after a non-null one, and a property whose
 * normalized value is {@code null} sorts after one with a non-null value. Two {@code null}
 * properties, or two properties that both have a {@code null} value, compare as equal so that the
 * ordering is symmetric, as required by the {@link Comparator} contract.
 */
public class PidValueComparator implements Comparator<StructuredProperty> {

    /**
     * Compares two PIDs by normalized value.
     *
     * @param left the first PID, may be {@code null}
     * @param right the second PID, may be {@code null}
     * @return a negative number, zero, or a positive number when {@code left} sorts before, equal
     *     to, or after {@code right}
     */
    @Override
    public int compare(StructuredProperty left, StructuredProperty right) {
        if (left == null && right == null)
            return 0;
        if (left == null)
            return 1;
        if (right == null)
            return -1;

        // NOTE(review): normalizePidValue is defined elsewhere; whether it mutates its argument or
        // rejects null values cannot be told from this class - confirm against its implementation.
        StructuredProperty l = CleaningFunctions.normalizePidValue(left);
        StructuredProperty r = CleaningFunctions.normalizePidValue(right);

        // nullsLast keeps "missing value sorts last" and returns 0 when both values are missing;
        // the previous Optional chain returned 1 in that case for either argument order.
        return Comparator.nullsLast(Comparator.<String> naturalOrder())
            .compare(l.getValue(), r.getValue());
    }
}

View File

@ -1,55 +1,61 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator; import java.util.Comparator;
public class ResultPidComparator implements Comparator<PidType> { public class ResultPidComparator implements Comparator<StructuredProperty> {
@Override @Override
public int compare(PidType pLeft, PidType pRight) { public int compare(StructuredProperty left, StructuredProperty right) {
if (pLeft.equals(PidType.doi))
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(PidType.doi))
return -1; return -1;
if (pRight.equals(PidType.doi)) if (rClass.equals(PidType.doi))
return 1; return 1;
if (pLeft.equals(PidType.pmid)) if (lClass.equals(PidType.pmid))
return -1; return -1;
if (pRight.equals(PidType.pmid)) if (rClass.equals(PidType.pmid))
return 1; return 1;
if (pLeft.equals(PidType.pmc)) if (lClass.equals(PidType.pmc))
return -1; return -1;
if (pRight.equals(PidType.pmc)) if (rClass.equals(PidType.pmc))
return 1; return 1;
if (pLeft.equals(PidType.handle)) if (lClass.equals(PidType.handle))
return -1; return -1;
if (pRight.equals(PidType.handle)) if (rClass.equals(PidType.handle))
return 1; return 1;
if (pLeft.equals(PidType.arXiv)) if (lClass.equals(PidType.arXiv))
return -1; return -1;
if (pRight.equals(PidType.arXiv)) if (rClass.equals(PidType.arXiv))
return 1; return 1;
if (pLeft.equals(PidType.NCID)) if (lClass.equals(PidType.NCID))
return -1; return -1;
if (pRight.equals(PidType.NCID)) if (rClass.equals(PidType.NCID))
return 1; return 1;
if (pLeft.equals(PidType.GBIF)) if (lClass.equals(PidType.GBIF))
return -1; return -1;
if (pRight.equals(PidType.GBIF)) if (rClass.equals(PidType.GBIF))
return 1; return 1;
if (pLeft.equals(PidType.nct)) if (lClass.equals(PidType.nct))
return -1; return -1;
if (pRight.equals(PidType.nct)) if (rClass.equals(PidType.nct))
return 1; return 1;
if (pLeft.equals(PidType.urn)) if (lClass.equals(PidType.urn))
return -1; return -1;
if (pRight.equals(PidType.urn)) if (rClass.equals(PidType.urn))
return 1; return 1;
return 0; return 0;

View File

@ -22,10 +22,11 @@ public class IdentifierFactoryTest {
@Test @Test
public void testCreateIdentifierForPublication() throws IOException { public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329")); verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier( verifyIdentifier(
"publication_urn.json", "publication_urn1.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2")); "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}