forked from D-Net/dnet-hadoop
IdentifierFactory: in case a record provides more than one pid of the same type, the lexicographically lowest value is chosen as the best pick
This commit is contained in:
parent
fcbb05eb21
commit
c016cc050a
|
@ -2,7 +2,12 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -42,13 +47,27 @@ public class IdentifierFactory implements Serializable {
|
||||||
return entity.getId();
|
return entity.getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
return entity
|
Map<String, List<StructuredProperty>> pids = entity
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> pidFilter(s))
|
.filter(s -> pidFilter(s))
|
||||||
|
.collect(
|
||||||
|
Collectors.groupingBy(p -> p.getQualifier().getClassid(),
|
||||||
|
Collectors.mapping(p -> p, Collectors.toList()))
|
||||||
|
);
|
||||||
|
|
||||||
|
return pids
|
||||||
|
.values()
|
||||||
|
.stream()
|
||||||
|
.flatMap(s -> s.stream())
|
||||||
.min(new PidComparator<>(entity))
|
.min(new PidComparator<>(entity))
|
||||||
|
.map(min -> Optional.ofNullable(pids.get(min.getQualifier().getClassid()))
|
||||||
|
.map(p -> p.stream()
|
||||||
|
.sorted(new PidValueComparator())
|
||||||
|
.findFirst()
|
||||||
.map(s -> idFromPid(entity, s))
|
.map(s -> idFromPid(entity, s))
|
||||||
.map(IdentifierFactory::verifyIdSyntax)
|
.orElseGet(entity::getId))
|
||||||
|
.orElseGet(entity::getId))
|
||||||
.orElseGet(entity::getId);
|
.orElseGet(entity::getId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,25 +1,31 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
public class OrganizationPidComparator implements Comparator<PidType> {
|
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(PidType pLeft, PidType pRight) {
|
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||||
if (pLeft.equals(PidType.GRID))
|
|
||||||
|
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||||
|
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||||
|
|
||||||
|
if (lClass.equals(PidType.GRID))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.GRID))
|
if (rClass.equals(PidType.GRID))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.mag_id))
|
if (lClass.equals(PidType.mag_id))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.mag_id))
|
if (rClass.equals(PidType.mag_id))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.urn))
|
if (lClass.equals(PidType.urn))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.urn))
|
if (rClass.equals(PidType.urn))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -27,28 +27,22 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
|
||||||
if (right == null)
|
if (right == null)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
|
||||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
|
||||||
|
|
||||||
if (lClass.equals(rClass))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||||
return compareResultPids(lClass, rClass);
|
return compareResultPids(left, right);
|
||||||
}
|
}
|
||||||
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
||||||
return compareOrganizationtPids(lClass, rClass);
|
return compareOrganizationtPids(left, right);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Else (but unlikely), lexicographical ordering will do.
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
return lClass.compareTo(rClass);
|
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
|
||||||
}
|
}
|
||||||
|
|
||||||
private int compareResultPids(PidType lClass, PidType rClass) {
|
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
|
||||||
return new ResultPidComparator().compare(lClass, rClass);
|
return new ResultPidComparator().compare(left, right);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
|
||||||
return new OrganizationPidComparator().compare(lClass, rClass);
|
return new OrganizationPidComparator().compare(left, right);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||||
|
|
||||||
|
if (left == null && right == null)
|
||||||
|
return 0;
|
||||||
|
if (left == null)
|
||||||
|
return 1;
|
||||||
|
if (right == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
||||||
|
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
||||||
|
|
||||||
|
return Optional.ofNullable(l.getValue())
|
||||||
|
.map(lv -> Optional.ofNullable(r.getValue())
|
||||||
|
.map(rv -> lv.compareTo(rv))
|
||||||
|
.orElse(-1))
|
||||||
|
.orElse(1);
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,55 +1,61 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
public class ResultPidComparator implements Comparator<PidType> {
|
public class ResultPidComparator implements Comparator<StructuredProperty> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(PidType pLeft, PidType pRight) {
|
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||||
if (pLeft.equals(PidType.doi))
|
|
||||||
|
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||||
|
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||||
|
|
||||||
|
if (lClass.equals(PidType.doi))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.doi))
|
if (rClass.equals(PidType.doi))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.pmid))
|
if (lClass.equals(PidType.pmid))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.pmid))
|
if (rClass.equals(PidType.pmid))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.pmc))
|
if (lClass.equals(PidType.pmc))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.pmc))
|
if (rClass.equals(PidType.pmc))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.handle))
|
if (lClass.equals(PidType.handle))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.handle))
|
if (rClass.equals(PidType.handle))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.arXiv))
|
if (lClass.equals(PidType.arXiv))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.arXiv))
|
if (rClass.equals(PidType.arXiv))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.NCID))
|
if (lClass.equals(PidType.NCID))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.NCID))
|
if (rClass.equals(PidType.NCID))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.GBIF))
|
if (lClass.equals(PidType.GBIF))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.GBIF))
|
if (rClass.equals(PidType.GBIF))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.nct))
|
if (lClass.equals(PidType.nct))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.nct))
|
if (rClass.equals(PidType.nct))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (pLeft.equals(PidType.urn))
|
if (lClass.equals(PidType.urn))
|
||||||
return -1;
|
return -1;
|
||||||
if (pRight.equals(PidType.urn))
|
if (rClass.equals(PidType.urn))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -22,10 +22,11 @@ public class IdentifierFactoryTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCreateIdentifierForPublication() throws IOException {
|
public void testCreateIdentifierForPublication() throws IOException {
|
||||||
|
|
||||||
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
verifyIdentifier("publication_doi1.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||||
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
verifyIdentifier("publication_doi2.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2010.03.013"));
|
||||||
|
verifyIdentifier("publication_pmc1.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||||
verifyIdentifier(
|
verifyIdentifier(
|
||||||
"publication_urn.json",
|
"publication_urn1.json",
|
||||||
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||||
|
|
||||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2010.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
Loading…
Reference in New Issue