dedup workflow using the common PidComparator

This commit is contained in:
Claudio Atzori 2020-11-04 15:02:02 +01:00
parent ea2a0ea949
commit e5da4ee9b1
17 changed files with 304 additions and 405 deletions

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
/**
 * Orders organization {@link PidType}s by preference: {@code GRID} first, then {@code mag_id},
 * then {@code urn}. Any other type sorts after these three and ties with every other unranked type.
 */
public class OrganizationPidComparator implements Comparator<PidType> {

	/**
	 * Compares two pid types according to the organization preference order.
	 *
	 * @param pLeft the left-hand pid type
	 * @param pRight the right-hand pid type
	 * @return a negative value when {@code pLeft} is preferred, a positive value when {@code pRight}
	 *         is preferred, {@code 0} when both are the same type or neither of them is ranked
	 */
	@Override
	public int compare(PidType pLeft, PidType pRight) {
		// Identical types must tie. Without this guard compare(GRID, GRID) answered -1, which breaks
		// the Comparator contract (compare(x, x) == 0 and sign symmetry) and can make sorting fail.
		if (pLeft.equals(pRight))
			return 0;

		// From here on the two types differ: the first ranked type found on either side decides.
		if (pLeft.equals(PidType.GRID))
			return -1;
		if (pRight.equals(PidType.GRID))
			return 1;

		if (pLeft.equals(PidType.mag_id))
			return -1;
		if (pRight.equals(PidType.mag_id))
			return 1;

		if (pLeft.equals(PidType.urn))
			return -1;
		if (pRight.equals(PidType.urn))
			return 1;

		// neither side is a ranked organization pid type
		return 0;
	}
}

View File

@ -45,70 +45,10 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
} }
private int compareResultPids(PidType lClass, PidType rClass) { private int compareResultPids(PidType lClass, PidType rClass) {
if (lClass.equals(PidType.doi)) return new ResultPidComparator().compare(lClass, rClass);
return -1;
if (rClass.equals(PidType.doi))
return 1;
if (lClass.equals(PidType.pmid))
return -1;
if (rClass.equals(PidType.pmid))
return 1;
if (lClass.equals(PidType.pmc))
return -1;
if (rClass.equals(PidType.pmc))
return 1;
if (lClass.equals(PidType.handle))
return -1;
if (rClass.equals(PidType.handle))
return 1;
if (lClass.equals(PidType.arXiv))
return -1;
if (rClass.equals(PidType.arXiv))
return 1;
if (lClass.equals(PidType.NCID))
return -1;
if (rClass.equals(PidType.NCID))
return 1;
if (lClass.equals(PidType.GBIF))
return -1;
if (rClass.equals(PidType.GBIF))
return 1;
if (lClass.equals(PidType.nct))
return -1;
if (rClass.equals(PidType.nct))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1;
return 0;
} }
private int compareOrganizationtPids(PidType lClass, PidType rClass) { private int compareOrganizationtPids(PidType lClass, PidType rClass) {
if (lClass.equals(PidType.GRID)) return new OrganizationPidComparator().compare(lClass, rClass);
return -1;
if (rClass.equals(PidType.GRID))
return 1;
if (lClass.equals(PidType.mag_id))
return -1;
if (rClass.equals(PidType.mag_id))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1;
return 0;
} }
} }

View File

@ -9,10 +9,21 @@ public enum PidType {
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb, doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
// Organization // Organization
GRID, mag_id, urn; GRID, mag_id, urn,
// Used by dedup
undefined, original;
public static boolean isValid(String type) { public static boolean isValid(String type) {
return EnumUtils.isValidEnum(PidType.class, type); return EnumUtils.isValidEnum(PidType.class, type);
} }
/**
 * Looks up the constant whose name is exactly {@code s}.
 *
 * @param s the candidate constant name, may be {@code null}
 * @return the matching constant, or {@link PidType#original} when {@code s} is {@code null} or
 *         does not name any constant
 */
public static PidType tryValueOf(String s) {
	for (PidType candidate : PidType.values()) {
		// name().equals(null) is false, so a null input falls through to the default below
		if (candidate.name().equals(s)) {
			return candidate;
		}
	}
	return PidType.original;
}
} }

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
/**
 * Orders result {@link PidType}s by preference: {@code doi}, {@code pmid}, {@code pmc},
 * {@code handle}, {@code arXiv}, {@code NCID}, {@code GBIF}, {@code nct}, {@code urn}.
 * Any other type sorts after these and ties with every other unranked type.
 */
public class ResultPidComparator implements Comparator<PidType> {

	/**
	 * Compares two pid types according to the result preference order.
	 *
	 * @param pLeft the left-hand pid type
	 * @param pRight the right-hand pid type
	 * @return a negative value when {@code pLeft} is preferred, a positive value when {@code pRight}
	 *         is preferred, {@code 0} when both are the same type or neither of them is ranked
	 */
	@Override
	public int compare(PidType pLeft, PidType pRight) {
		// Identical types must tie. Without this guard compare(doi, doi) answered -1, which breaks
		// the Comparator contract (compare(x, x) == 0 and sign symmetry) and can make sorting fail.
		if (pLeft.equals(pRight))
			return 0;

		// From here on the two types differ: the first ranked type found on either side decides.
		if (pLeft.equals(PidType.doi))
			return -1;
		if (pRight.equals(PidType.doi))
			return 1;

		if (pLeft.equals(PidType.pmid))
			return -1;
		if (pRight.equals(PidType.pmid))
			return 1;

		if (pLeft.equals(PidType.pmc))
			return -1;
		if (pRight.equals(PidType.pmc))
			return 1;

		if (pLeft.equals(PidType.handle))
			return -1;
		if (pRight.equals(PidType.handle))
			return 1;

		if (pLeft.equals(PidType.arXiv))
			return -1;
		if (pRight.equals(PidType.arXiv))
			return 1;

		if (pLeft.equals(PidType.NCID))
			return -1;
		if (pRight.equals(PidType.NCID))
			return 1;

		if (pLeft.equals(PidType.GBIF))
			return -1;
		if (pRight.equals(PidType.GBIF))
			return 1;

		if (pLeft.equals(PidType.nct))
			return -1;
		if (pRight.equals(PidType.nct))
			return 1;

		if (pLeft.equals(PidType.urn))
			return -1;
		if (pRight.equals(PidType.urn))
			return 1;

		// neither side is a ranked result pid type
		return 0;
	}
}

View File

@ -82,7 +82,7 @@ public class DedupRecordFactory {
final Collection<String> dates = Lists.newArrayList(); final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList(); final List<List<Author>> authors = Lists.newArrayList();
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list
entities entities
.forEachRemaining( .forEachRemaining(
@ -90,7 +90,7 @@ public class DedupRecordFactory {
T duplicate = t._2(); T duplicate = t._2();
// prepare the list of pids to use for the id generation // prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate)); bestPids.add(Identifier.newInstance(duplicate));
entity.mergeFrom(duplicate); entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {

View File

@ -1,124 +1,46 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException; import java.util.List;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class IdGenerator implements Serializable { public class IdGenerator implements Serializable {
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String BASE_DATE = "2000-01-01";
// pick the best pid from the list (consider date and pidtype) // pick the best pid from the list (consider date and pidtype)
public static String generate(List<Identifier> pids, String defaultID) { public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
if (pids == null || pids.size() == 0) if (pids == null || pids.size() == 0)
return defaultID; return defaultID;
Optional<Identifier> bp = pids Identifier<T> bp = pids
.stream() .stream()
.max(Identifier::compareTo); .min(Identifier::compareTo)
.get();
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) { String prefix = substringBefore(bp.getOriginalID(), "|");
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
+ DHPUtils.md5(bp.get().getOriginalID()); String suffix = substringAfter(bp.getOriginalID(), "::");
final String pidType = substringBefore(ns, "_");
if (PidType.isValid(pidType)) {
return prefix + "|" + dedupify(ns) + "::" + suffix;
} else { } else {
return bp.get().getOriginalID().split("\\|")[0] + "|" return prefix + "|dedup_wf_001::" + suffix;
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" }
+ DHPUtils.md5(bp.get().getPid().getValue());
} }
} private static String dedupify(String ns) {
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
Date date;
try {
date = sdf.parse(BASE_DATE);
} catch (ParseException e) {
date = new Date();
}
return Lists
.newArrayList(
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
EntityType.fromClass(entity.getClass()), entity.getId()));
}
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
if (entity.getPid() == null || entity.getPid().size() == 0)
return createBasePid(entity, sdf);
Optional<StructuredProperty> bp = entity
.getPid()
.stream()
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
return bp
.map(
structuredProperty -> Lists
.newArrayList(
new Identifier(structuredProperty, extractDate(entity, sdf),
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
.orElseGet(() -> createBasePid(entity, sdf));
}
// create the prefix (length = 12): dedup_+ pidType
public static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
while (prefix.length() < 12) { while (prefix.length() < 12) {
prefix.append("_"); prefix.append("_");
} }
return prefix.toString().substring(0, 12); return prefix.substring(0, 12);
} }
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
// 00-01-01
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
String date = BASE_DATE;
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result result = (Result) duplicate;
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
}
public static boolean isWellformed(Field<String> date) {
return date != null && StringUtils.isNotBlank(date.getValue())
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
}
} }

View File

@ -2,94 +2,85 @@
package eu.dnetlib.dhp.oa.dedup.model; package eu.dnetlib.dhp.oa.dedup.model;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.IdGenerator; import eu.dnetlib.dhp.oa.dedup.DatePicker;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class Identifier implements Serializable, Comparable<Identifier> { public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
StructuredProperty pid; public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
Date date; public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
PidType type; public static String BASE_DATE = "2000-01-01";
List<KeyValue> collectedFrom;
EntityType entityType;
String originalID;
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
// ID
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, private T entity;
EntityType entityType, String originalID) {
this.pid = pid; public static <T extends OafEntity> Identifier newInstance(T entity) {
this.date = date; return new Identifier(entity);
this.type = type;
this.collectedFrom = collectedFrom;
this.entityType = entityType;
this.originalID = originalID;
} }
public StructuredProperty getPid() { public Identifier(T entity) {
return pid; this.entity = entity;
} }
public void setPid(StructuredProperty pid) { public T getEntity() {
this.pid = pid; return entity;
}
public void setEntity(T entity) {
this.entity = entity;
} }
public Date getDate() { public Date getDate() {
return date; String date = BASE_DATE;
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
Result result = (Result) getEntity();
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
} }
public void setDate(Date date) { private static boolean isWellformed(Field<String> date) {
this.date = date; return date != null && StringUtils.isNotBlank(date.getValue())
} && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
public PidType getType() {
return type;
}
public void setType(PidType type) {
this.type = type;
} }
public List<KeyValue> getCollectedFrom() { public List<KeyValue> getCollectedFrom() {
return collectedFrom; return entity.getCollectedfrom();
}
public void setCollectedFrom(List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
} }
public EntityType getEntityType() { public EntityType getEntityType() {
return entityType; return EntityType.fromClass(entity.getClass());
}
public void setEntityType(EntityType entityType) {
this.entityType = entityType;
} }
public String getOriginalID() { public String getOriginalID() {
return originalID; return entity.getId();
} }
public void setOriginalID(String originalID) { private PidType getPidType() {
this.originalID = originalID; return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
}
public boolean isUseOriginal() {
return useOriginal;
}
public void setUseOriginal(boolean useOriginal) {
this.useOriginal = useOriginal;
} }
@Override @Override
@ -97,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) // priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID // alphabetical order of the originalID
Set<String> lKeys = Sets.newHashSet(); Set<String> lKeys = Optional
if (this.collectedFrom != null) .ofNullable(getCollectedFrom())
lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet()); .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
Set<String> rKeys = Sets.newHashSet(); final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
if (i.getCollectedFrom() != null) Set<String> rKeys = cf
rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet()); .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
if (this.getType().compareTo(i.getType()) == 0) { // same type if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
if (entityType == EntityType.publication) { if (getEntityType() == EntityType.publication) {
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID) if (isFromDatasourceID(lKeys, CROSSREF_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)) && !isFromDatasourceID(rKeys, CROSSREF_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
return -1; return -1;
if (isFromDatasourceID(rKeys, CROSSREF_ID)
&& !isFromDatasourceID(lKeys, CROSSREF_ID))
return 1;
} }
if (entityType == EntityType.dataset) { if (getEntityType() == EntityType.dataset) {
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID) if (isFromDatasourceID(lKeys, DATACITE_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)) && !isFromDatasourceID(rKeys, DATACITE_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
return -1; return -1;
if (isFromDatasourceID(rKeys, DATACITE_ID)
&& !isFromDatasourceID(lKeys, DATACITE_ID))
return 1;
} }
if (this.getDate().compareTo(i.getDate()) == 0) {// same date if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) < 0)
this.useOriginal = true;
else
i.setUseOriginal(true);
// the minus because we need to take the alphabetically lower id // the minus because we need to take the alphabetically lower id
return -this.originalID.compareTo(i.originalID); return this.getOriginalID().compareTo(i.getOriginalID());
} else } else
// the minus is because we need to take the elder date // the minus is because we need to take the elder date
return -this.getDate().compareTo(i.getDate()); return this.getDate().compareTo(i.getDate());
} else { } else {
return this.getType().compareTo(i.getType()); return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
} }
} }
/**
 * Wraps a pid type into a {@link StructuredProperty} so that {@code compareTo} can hand it to
 * {@code PidComparator}. The type's string form is passed twice to
 * {@code OafMapperUtils.structuredProperty}; every other string argument is empty and a fresh
 * {@code DataInfo} is supplied.
 * NOTE(review): the two type arguments are presumably the qualifier classid and classname -
 * confirm against OafMapperUtils.
 */
private StructuredProperty toSP(PidType pidType) {
	final String typeName = pidType.toString();
	return OafMapperUtils.structuredProperty("", typeName, typeName, "", "", new DataInfo());
}
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) { public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId); return collectedFrom.contains(dsId);
} }

View File

@ -1,17 +0,0 @@
package eu.dnetlib.dhp.oa.dedup.model;
/**
 * Pid types known to the dedup model, with a lenient lookup by classid.
 */
public enum PidType {

	// listed from the less to the more important
	undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;

	/**
	 * Resolves a classid to the constant carrying exactly that name.
	 *
	 * @param s the classid to resolve, may be {@code null}
	 * @return the matching constant, or {@link #undefined} when {@code s} is {@code null} or does
	 *         not name any constant
	 */
	public static PidType classidValueOf(String s) {
		for (PidType candidate : values()) {
			// name().equals(null) is false, so a null classid ends up as undefined
			if (candidate.name().equals(s)) {
				return candidate;
			}
		}
		return undefined;
	}

}

View File

@ -9,6 +9,7 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
@ -21,16 +22,16 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable { public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications; private List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2; private List<Tuple2<String, Publication>> publications2;
List<Tuple2<String, Publication>> publications3; private List<Tuple2<String, Publication>> publications3;
List<Tuple2<String, Publication>> publications4; private List<Tuple2<String, Publication>> publications4;
List<Tuple2<String, Publication>> publications5; private List<Tuple2<String, Publication>> publications5;
String testEntityBasePath; private String testEntityBasePath;
DataInfo dataInfo; private DataInfo dataInfo;
String dedupId = "00|dedup_id::1"; private String dedupId = "00|dedup_id::1";
Publication pub_top; private Publication pub_top;
@BeforeEach @BeforeEach
public void setUp() throws Exception { public void setUp() throws Exception {
@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
Software merged = DedupRecordFactory Software merged = DedupRecordFactory
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE"); assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340"); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());
} }
@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340"); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());
assertEquals(pub_merged.getJournal(), pub_top.getJournal()); assertEquals(pub_top.getJournal(), pub_merged.getJournal());
assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN"); assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype()); assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage()); assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher()); assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate()); assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
assertEquals(pub_merged.getResourcetype().getClassid(), "0004"); assertEquals(pub_top.getResourcetype().getClassid(), "");
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation()); assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance()); assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection()); assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
assertEquals(pub_merged.getInstance().size(), 3); assertEquals(3, pub_merged.getInstance().size());
assertEquals(pub_merged.getCountry().size(), 2); assertEquals(2, pub_merged.getCountry().size());
assertEquals(pub_merged.getSubject().size(), 0); assertEquals(0, pub_merged.getSubject().size());
assertEquals(pub_merged.getTitle().size(), 2); assertEquals(2, pub_merged.getTitle().size());
assertEquals(pub_merged.getRelevantdate().size(), 0); assertEquals(0, pub_merged.getRelevantdate().size());
assertEquals(pub_merged.getDescription().size(), 0); assertEquals(0, pub_merged.getDescription().size());
assertEquals(pub_merged.getSource().size(), 0); assertEquals(0, pub_merged.getSource().size());
assertEquals(pub_merged.getFulltext().size(), 0); assertEquals(0, pub_merged.getFulltext().size());
assertEquals(pub_merged.getFormat().size(), 0); assertEquals(0, pub_merged.getFormat().size());
assertEquals(pub_merged.getContributor().size(), 0); assertEquals(0, pub_merged.getContributor().size());
assertEquals(pub_merged.getCoverage().size(), 0); assertEquals(0, pub_merged.getCoverage().size());
assertEquals(pub_merged.getContext().size(), 0); assertEquals(0, pub_merged.getContext().size());
assertEquals(pub_merged.getExternalReference().size(), 0); assertEquals(0, pub_merged.getExternalReference().size());
assertEquals(pub_merged.getOriginalId().size(), 3); assertEquals(3, pub_merged.getOriginalId().size());
assertEquals(pub_merged.getCollectedfrom().size(), 3); assertEquals(3, pub_merged.getCollectedfrom().size());
assertEquals(pub_merged.getPid().size(), 1); assertEquals(1, pub_merged.getPid().size());
assertEquals(pub_merged.getExtraInfo().size(), 0); assertEquals(0, pub_merged.getExtraInfo().size());
// verify datainfo // verify datainfo
assertEquals(pub_merged.getDataInfo(), dataInfo); assertEquals(dataInfo, pub_merged.getDataInfo());
// verify datepicker // verify datepicker
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30"); assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
// verify authors // verify authors
assertEquals(pub_merged.getAuthor().size(), 9); assertEquals(9, pub_merged.getAuthor().size());
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
// verify title // verify title
int count = 0; int count = 0;
@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
if (title.getQualifier().getClassid().equals("main title")) if (title.getQualifier().getClassid().equals("main title"))
count++; count++;
} }
assertEquals(count, 1); assertEquals(1, count);
} }
@Test @Test
@ -130,9 +131,9 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId()); assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
assertEquals(pub_merged.getAuthor().size(), 27); assertEquals(27, pub_merged.getAuthor().size());
} }
@Test @Test
@ -142,7 +143,7 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId()); assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
@Test @Test
@ -152,17 +153,24 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId()); assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
@Test @Test
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException { public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
System.out
.println(
publications5
.stream()
.map(p -> p._2().getId())
.collect(Collectors.toList()));
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId()); assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
public DataInfo setDI() { public DataInfo setDI() {

View File

@ -7,97 +7,57 @@ import java.io.BufferedReader;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.MethodOrderer;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestMethodOrder;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2; import scala.Tuple2;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class IdGeneratorTest { public class IdGeneratorTest {
private static List<Identifier> bestIds; private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
private static List<Tuple2<String, Publication>> pubs; .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
private static List<Identifier> bestIds2; private static List<Identifier<Publication>> bestIds;
private static List<Identifier> bestIds3; private static List<Identifier<Publication>> bestIds2;
private static List<Identifier<Publication>> bestIds3;
private static String testEntityBasePath; private static String testEntityBasePath;
private static SimpleDateFormat sdf;
private static Date baseDate;
@BeforeAll @BeforeAll
public static void setUp() throws Exception { public static void setUp() throws Exception {
sdf = new SimpleDateFormat("yyyy-MM-dd");
baseDate = sdf.parse("2000-01-01");
bestIds = new ArrayList<>();
bestIds2 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
bestIds3 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
testEntityBasePath = Paths testEntityBasePath = Paths
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
.toFile() .toFile()
.getAbsolutePath(); .getAbsolutePath();
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class); bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
} }
@Test @Test
@Order(1)
public void bestPidToIdentifierTest() {
List<String> typesForAssertions = Lists
.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
for (Tuple2<String, Publication> pub : pubs) {
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
bestIds.addAll(ids);
}
}
@Test
@Order(2)
public void generateIdTest1() { public void generateIdTest1() {
String id1 = IdGenerator.generate(bestIds, "50|defaultID"); String id1 = IdGenerator.generate(bestIds, "50|defaultID");
System.out System.out
.println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); .println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
} }
@Test @Test
@ -106,14 +66,22 @@ public class IdGeneratorTest {
String id2 = IdGenerator.generate(bestIds3, "50|defaultID"); String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
System.out System.out
.println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); .println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 2 = " + id1); System.out.println("winner 2 = " + id1);
System.out System.out
.println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); .println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 3 = " + id2); System.out.println("winner 3 = " + id2);
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1); assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2); assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
}
/**
 * Builds the list of identifiers for the entities stored in a sample file.
 *
 * Reads the sample with {@code readSample(path, clazz)}, keeps the second element of each
 * tuple (the parsed entity) and converts it with {@code Identifier.newInstance}.
 *
 * @param path  location of the sample file, passed through to readSample
 * @param clazz entity class, passed through to readSample
 * @return one Identifier per tuple returned by readSample, in stream order
 */
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
final Stream<Identifier<T>> ids = readSample(path, clazz)
.stream()
.map(Tuple2::_2)
.map(Identifier::newInstance);
return ids.collect(Collectors.toList());
// NOTE(review): the doubled brace below presumably comes from the two diff columns
} }
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) { public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
@ -127,7 +95,7 @@ public class IdGeneratorTest {
.add( .add(
new Tuple2<>( new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line), MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz))); OBJECT_MAPPER.readValue(line, clazz)));
// read next line // read next line
line = reader.readLine(); line = reader.readLine();
} }
@ -140,23 +108,10 @@ public class IdGeneratorTest {
} }
/**
 * Test helper that builds a StructuredProperty for a pid value.
 *
 * NOTE(review): two variants of the body are interleaved here, presumably the two columns
 * of a side-by-side diff. One delegates to
 * {@code OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo())};
 * the other fills the object by hand.
 *
 * @param pid       value stored in the property
 * @param classid   used as qualifier classid (and as schemeid in the hand-built variant)
 * @param classname used as qualifier classname (and as schemename in the hand-built variant)
 * @return the populated StructuredProperty
 */
public static StructuredProperty pid(String pid, String classid, String classname) { public static StructuredProperty pid(String pid, String classid, String classname) {
return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
StructuredProperty sp = new StructuredProperty();
sp.setValue(pid);
// the qualifier reuses classid/classname for both the scheme and the class fields
Qualifier q = new Qualifier();
q.setSchemeid(classid);
q.setSchemename(classname);
q.setClassname(classname);
q.setClassid(classid);
sp.setQualifier(q);
return sp;
} }
/**
 * Test helper that wraps one key/value pair in a list created with Lists.newArrayList.
 *
 * NOTE(review): two variants of the body are interleaved here, presumably the two columns
 * of a side-by-side diff. One obtains the KeyValue from
 * {@code OafMapperUtils.keyValue(key, value)}; the other creates it and calls
 * setKey/setValue directly.
 *
 * @param key   key of the single KeyValue
 * @param value value of the single KeyValue
 * @return a list holding that one KeyValue
 */
public static List<KeyValue> keyValue(String key, String value) { public static List<KeyValue> keyValue(String key, String value) {
return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
KeyValue kv = new KeyValue();
kv.setKey(key);
kv.setValue(value);
return Lists.newArrayList(kv);
} }
} }

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }