forked from D-Net/dnet-hadoop
IdentifierFactory: in case a record provides more than one pid of the same type, the lexicographically lowest value is chosen as the best pick
This commit is contained in:
parent
fcbb05eb21
commit
c016cc050a
|
@ -2,7 +2,12 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -42,14 +47,28 @@ public class IdentifierFactory implements Serializable {
|
|||
return entity.getId();
|
||||
}
|
||||
|
||||
return entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(s -> pidFilter(s))
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.map(IdentifierFactory::verifyIdSyntax)
|
||||
.orElseGet(entity::getId);
|
||||
Map<String, List<StructuredProperty>> pids = entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(s -> pidFilter(s))
|
||||
.collect(
|
||||
Collectors.groupingBy(p -> p.getQualifier().getClassid(),
|
||||
Collectors.mapping(p -> p, Collectors.toList()))
|
||||
);
|
||||
|
||||
return pids
|
||||
.values()
|
||||
.stream()
|
||||
.flatMap(s -> s.stream())
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(min -> Optional.ofNullable(pids.get(min.getQualifier().getClassid()))
|
||||
.map(p -> p.stream()
|
||||
.sorted(new PidValueComparator())
|
||||
.findFirst()
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
protected static boolean pidFilter(StructuredProperty s) {
|
||||
|
|
|
@ -1,25 +1,31 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class OrganizationPidComparator implements Comparator<PidType> {
|
||||
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(PidType pLeft, PidType pRight) {
|
||||
if (pLeft.equals(PidType.GRID))
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(PidType.GRID))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.GRID))
|
||||
if (rClass.equals(PidType.GRID))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.mag_id))
|
||||
if (lClass.equals(PidType.mag_id))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.mag_id))
|
||||
if (rClass.equals(PidType.mag_id))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.urn))
|
||||
if (lClass.equals(PidType.urn))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.urn))
|
||||
if (rClass.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -27,28 +27,22 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
|
|||
if (right == null)
|
||||
return -1;
|
||||
|
||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||
return compareResultPids(lClass, rClass);
|
||||
return compareResultPids(left, right);
|
||||
}
|
||||
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
||||
return compareOrganizationtPids(lClass, rClass);
|
||||
return compareOrganizationtPids(left, right);
|
||||
}
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
|
||||
}
|
||||
|
||||
private int compareResultPids(PidType lClass, PidType rClass) {
|
||||
return new ResultPidComparator().compare(lClass, rClass);
|
||||
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
|
||||
return new ResultPidComparator().compare(left, right);
|
||||
}
|
||||
|
||||
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
||||
return new OrganizationPidComparator().compare(lClass, rClass);
|
||||
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
|
||||
return new OrganizationPidComparator().compare(left, right);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
||||
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
||||
|
||||
return Optional.ofNullable(l.getValue())
|
||||
.map(lv -> Optional.ofNullable(r.getValue())
|
||||
.map(rv -> lv.compareTo(rv))
|
||||
.orElse(-1))
|
||||
.orElse(1);
|
||||
}
|
||||
}
|
|
@ -1,55 +1,61 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class ResultPidComparator implements Comparator<PidType> {
|
||||
public class ResultPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(PidType pLeft, PidType pRight) {
|
||||
if (pLeft.equals(PidType.doi))
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(PidType.doi))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.doi))
|
||||
if (rClass.equals(PidType.doi))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.pmid))
|
||||
if (lClass.equals(PidType.pmid))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.pmid))
|
||||
if (rClass.equals(PidType.pmid))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.pmc))
|
||||
if (lClass.equals(PidType.pmc))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.pmc))
|
||||
if (rClass.equals(PidType.pmc))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.handle))
|
||||
if (lClass.equals(PidType.handle))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.handle))
|
||||
if (rClass.equals(PidType.handle))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.arXiv))
|
||||
if (lClass.equals(PidType.arXiv))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.arXiv))
|
||||
if (rClass.equals(PidType.arXiv))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.NCID))
|
||||
if (lClass.equals(PidType.NCID))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.NCID))
|
||||
if (rClass.equals(PidType.NCID))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.GBIF))
|
||||
if (lClass.equals(PidType.GBIF))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.GBIF))
|
||||
if (rClass.equals(PidType.GBIF))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.nct))
|
||||
if (lClass.equals(PidType.nct))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.nct))
|
||||
if (rClass.equals(PidType.nct))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.urn))
|
||||
if (lClass.equals(PidType.urn))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.urn))
|
||||
if (rClass.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -22,10 +22,11 @@ public class IdentifierFactoryTest {
|
|||
@Test
|
||||
public void testCreateIdentifierForPublication() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
|
||||
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||
verifyIdentifier(
|
||||
"publication_urn.json",
|
||||
"publication_urn1.json",
|
||||
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
Loading…
Reference in New Issue