dedup workflow using the common PidComparator

This commit is contained in:
Claudio Atzori 2020-11-04 15:02:02 +01:00
parent ea2a0ea949
commit e5da4ee9b1
17 changed files with 304 additions and 405 deletions

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class OrganizationPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.GRID))
return -1;
if (pRight.equals(PidType.GRID))
return 1;
if (pLeft.equals(PidType.mag_id))
return -1;
if (pRight.equals(PidType.mag_id))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

View File

@ -45,70 +45,10 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
}
private int compareResultPids(PidType lClass, PidType rClass) {
if (lClass.equals(PidType.doi))
return -1;
if (rClass.equals(PidType.doi))
return 1;
if (lClass.equals(PidType.pmid))
return -1;
if (rClass.equals(PidType.pmid))
return 1;
if (lClass.equals(PidType.pmc))
return -1;
if (rClass.equals(PidType.pmc))
return 1;
if (lClass.equals(PidType.handle))
return -1;
if (rClass.equals(PidType.handle))
return 1;
if (lClass.equals(PidType.arXiv))
return -1;
if (rClass.equals(PidType.arXiv))
return 1;
if (lClass.equals(PidType.NCID))
return -1;
if (rClass.equals(PidType.NCID))
return 1;
if (lClass.equals(PidType.GBIF))
return -1;
if (rClass.equals(PidType.GBIF))
return 1;
if (lClass.equals(PidType.nct))
return -1;
if (rClass.equals(PidType.nct))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1;
return 0;
return new ResultPidComparator().compare(lClass, rClass);
}
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
if (lClass.equals(PidType.GRID))
return -1;
if (rClass.equals(PidType.GRID))
return 1;
if (lClass.equals(PidType.mag_id))
return -1;
if (rClass.equals(PidType.mag_id))
return 1;
if (lClass.equals(PidType.urn))
return -1;
if (rClass.equals(PidType.urn))
return 1;
return 0;
return new OrganizationPidComparator().compare(lClass, rClass);
}
}

View File

@ -9,10 +9,21 @@ public enum PidType {
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
// Organization
GRID, mag_id, urn;
GRID, mag_id, urn,
// Used by dedup
undefined, original;
public static boolean isValid(String type) {
return EnumUtils.isValidEnum(PidType.class, type);
}
public static PidType tryValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.original;
}
}
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class ResultPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.doi))
return -1;
if (pRight.equals(PidType.doi))
return 1;
if (pLeft.equals(PidType.pmid))
return -1;
if (pRight.equals(PidType.pmid))
return 1;
if (pLeft.equals(PidType.pmc))
return -1;
if (pRight.equals(PidType.pmc))
return 1;
if (pLeft.equals(PidType.handle))
return -1;
if (pRight.equals(PidType.handle))
return 1;
if (pLeft.equals(PidType.arXiv))
return -1;
if (pRight.equals(PidType.arXiv))
return 1;
if (pLeft.equals(PidType.NCID))
return -1;
if (pRight.equals(PidType.NCID))
return 1;
if (pLeft.equals(PidType.GBIF))
return -1;
if (pRight.equals(PidType.GBIF))
return 1;
if (pLeft.equals(PidType.nct))
return -1;
if (pRight.equals(PidType.nct))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

View File

@ -82,7 +82,7 @@ public class DedupRecordFactory {
final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList();
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list
entities
.forEachRemaining(
@ -90,7 +90,7 @@ public class DedupRecordFactory {
T duplicate = t._2();
// prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
bestPids.add(Identifier.newInstance(duplicate));
entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) {

View File

@ -1,124 +1,46 @@
package eu.dnetlib.dhp.oa.dedup;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists;
import java.util.List;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class IdGenerator implements Serializable {
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String BASE_DATE = "2000-01-01";
// pick the best pid from the list (consider date and pidtype)
public static String generate(List<Identifier> pids, String defaultID) {
public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
if (pids == null || pids.size() == 0)
return defaultID;
Optional<Identifier> bp = pids
Identifier<T> bp = pids
.stream()
.max(Identifier::compareTo);
.min(Identifier::compareTo)
.get();
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
+ DHPUtils.md5(bp.get().getOriginalID());
String prefix = substringBefore(bp.getOriginalID(), "|");
String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
String suffix = substringAfter(bp.getOriginalID(), "::");
final String pidType = substringBefore(ns, "_");
if (PidType.isValid(pidType)) {
return prefix + "|" + dedupify(ns) + "::" + suffix;
} else {
return bp.get().getOriginalID().split("\\|")[0] + "|"
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
+ DHPUtils.md5(bp.get().getPid().getValue());
return prefix + "|dedup_wf_001::" + suffix;
}
}
}
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
Date date;
try {
date = sdf.parse(BASE_DATE);
} catch (ParseException e) {
date = new Date();
}
return Lists
.newArrayList(
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
EntityType.fromClass(entity.getClass()), entity.getId()));
}
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
if (entity.getPid() == null || entity.getPid().size() == 0)
return createBasePid(entity, sdf);
Optional<StructuredProperty> bp = entity
.getPid()
.stream()
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
return bp
.map(
structuredProperty -> Lists
.newArrayList(
new Identifier(structuredProperty, extractDate(entity, sdf),
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
.orElseGet(() -> createBasePid(entity, sdf));
}
// create the prefix (length = 12): dedup_+ pidType
public static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
private static String dedupify(String ns) {
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
while (prefix.length() < 12) {
prefix.append("_");
}
return prefix.toString().substring(0, 12);
return prefix.substring(0, 12);
}
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
// 00-01-01
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
String date = BASE_DATE;
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result result = (Result) duplicate;
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
}
public static boolean isWellformed(Field<String> date) {
return date != null && StringUtils.isNotBlank(date.getValue())
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
}
}

View File

@ -2,94 +2,85 @@
package eu.dnetlib.dhp.oa.dedup.model;
import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
import eu.dnetlib.dhp.oa.dedup.DatePicker;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class Identifier implements Serializable, Comparable<Identifier> {
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
StructuredProperty pid;
Date date;
PidType type;
List<KeyValue> collectedFrom;
EntityType entityType;
String originalID;
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String BASE_DATE = "2000-01-01";
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
// ID
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
EntityType entityType, String originalID) {
this.pid = pid;
this.date = date;
this.type = type;
this.collectedFrom = collectedFrom;
this.entityType = entityType;
this.originalID = originalID;
private T entity;
public static <T extends OafEntity> Identifier newInstance(T entity) {
return new Identifier(entity);
}
public StructuredProperty getPid() {
return pid;
public Identifier(T entity) {
this.entity = entity;
}
public void setPid(StructuredProperty pid) {
this.pid = pid;
public T getEntity() {
return entity;
}
public void setEntity(T entity) {
this.entity = entity;
}
public Date getDate() {
return date;
String date = BASE_DATE;
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
Result result = (Result) getEntity();
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
}
public void setDate(Date date) {
this.date = date;
}
public PidType getType() {
return type;
}
public void setType(PidType type) {
this.type = type;
private static boolean isWellformed(Field<String> date) {
return date != null && StringUtils.isNotBlank(date.getValue())
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
}
public List<KeyValue> getCollectedFrom() {
return collectedFrom;
}
public void setCollectedFrom(List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
return entity.getCollectedfrom();
}
public EntityType getEntityType() {
return entityType;
}
public void setEntityType(EntityType entityType) {
this.entityType = entityType;
return EntityType.fromClass(entity.getClass());
}
public String getOriginalID() {
return originalID;
return entity.getId();
}
public void setOriginalID(String originalID) {
this.originalID = originalID;
}
public boolean isUseOriginal() {
return useOriginal;
}
public void setUseOriginal(boolean useOriginal) {
this.useOriginal = useOriginal;
private PidType getPidType() {
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
}
@Override
@ -97,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID
Set<String> lKeys = Sets.newHashSet();
if (this.collectedFrom != null)
lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
Set<String> lKeys = Optional
.ofNullable(getCollectedFrom())
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
Set<String> rKeys = Sets.newHashSet();
if (i.getCollectedFrom() != null)
rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
Set<String> rKeys = cf
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
if (this.getType().compareTo(i.getType()) == 0) { // same type
if (entityType == EntityType.publication) {
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
if (getEntityType() == EntityType.publication) {
if (isFromDatasourceID(lKeys, CROSSREF_ID)
&& !isFromDatasourceID(rKeys, CROSSREF_ID))
return -1;
if (isFromDatasourceID(rKeys, CROSSREF_ID)
&& !isFromDatasourceID(lKeys, CROSSREF_ID))
return 1;
}
if (entityType == EntityType.dataset) {
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
if (getEntityType() == EntityType.dataset) {
if (isFromDatasourceID(lKeys, DATACITE_ID)
&& !isFromDatasourceID(rKeys, DATACITE_ID))
return -1;
if (isFromDatasourceID(rKeys, DATACITE_ID)
&& !isFromDatasourceID(lKeys, DATACITE_ID))
return 1;
}
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) < 0)
this.useOriginal = true;
else
i.setUseOriginal(true);
// the minus because we need to take the alphabetically lower id
return -this.originalID.compareTo(i.originalID);
return this.getOriginalID().compareTo(i.getOriginalID());
} else
// the minus is because we need to take the elder date
return -this.getDate().compareTo(i.getDate());
return this.getDate().compareTo(i.getDate());
} else {
return this.getType().compareTo(i.getType());
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
}
}
private StructuredProperty toSP(PidType pidType) {
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
}
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId);
}

View File

@ -1,17 +0,0 @@
package eu.dnetlib.dhp.oa.dedup.model;
public enum PidType {
// from the less to the more important
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
public static PidType classidValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.undefined;
}
}
}

View File

@ -9,6 +9,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
@ -21,16 +22,16 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2;
List<Tuple2<String, Publication>> publications3;
List<Tuple2<String, Publication>> publications4;
List<Tuple2<String, Publication>> publications5;
private List<Tuple2<String, Publication>> publications;
private List<Tuple2<String, Publication>> publications2;
private List<Tuple2<String, Publication>> publications3;
private List<Tuple2<String, Publication>> publications4;
private List<Tuple2<String, Publication>> publications5;
String testEntityBasePath;
DataInfo dataInfo;
String dedupId = "00|dedup_id::1";
Publication pub_top;
private String testEntityBasePath;
private DataInfo dataInfo;
private String dedupId = "00|dedup_id::1";
private Publication pub_top;
@BeforeEach
public void setUp() throws Exception {
@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
Software merged = DedupRecordFactory
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());
}
@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN");
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
assertEquals(pub_merged.getInstance().size(), 3);
assertEquals(pub_merged.getCountry().size(), 2);
assertEquals(pub_merged.getSubject().size(), 0);
assertEquals(pub_merged.getTitle().size(), 2);
assertEquals(pub_merged.getRelevantdate().size(), 0);
assertEquals(pub_merged.getDescription().size(), 0);
assertEquals(pub_merged.getSource().size(), 0);
assertEquals(pub_merged.getFulltext().size(), 0);
assertEquals(pub_merged.getFormat().size(), 0);
assertEquals(pub_merged.getContributor().size(), 0);
assertEquals(pub_merged.getCoverage().size(), 0);
assertEquals(pub_merged.getContext().size(), 0);
assertEquals(pub_merged.getExternalReference().size(), 0);
assertEquals(pub_merged.getOriginalId().size(), 3);
assertEquals(pub_merged.getCollectedfrom().size(), 3);
assertEquals(pub_merged.getPid().size(), 1);
assertEquals(pub_merged.getExtraInfo().size(), 0);
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
assertEquals(pub_top.getResourcetype().getClassid(), "");
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
assertEquals(3, pub_merged.getInstance().size());
assertEquals(2, pub_merged.getCountry().size());
assertEquals(0, pub_merged.getSubject().size());
assertEquals(2, pub_merged.getTitle().size());
assertEquals(0, pub_merged.getRelevantdate().size());
assertEquals(0, pub_merged.getDescription().size());
assertEquals(0, pub_merged.getSource().size());
assertEquals(0, pub_merged.getFulltext().size());
assertEquals(0, pub_merged.getFormat().size());
assertEquals(0, pub_merged.getContributor().size());
assertEquals(0, pub_merged.getCoverage().size());
assertEquals(0, pub_merged.getContext().size());
assertEquals(0, pub_merged.getExternalReference().size());
assertEquals(3, pub_merged.getOriginalId().size());
assertEquals(3, pub_merged.getCollectedfrom().size());
assertEquals(1, pub_merged.getPid().size());
assertEquals(0, pub_merged.getExtraInfo().size());
// verify datainfo
assertEquals(pub_merged.getDataInfo(), dataInfo);
assertEquals(dataInfo, pub_merged.getDataInfo());
// verify datepicker
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
// verify authors
assertEquals(pub_merged.getAuthor().size(), 9);
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
assertEquals(9, pub_merged.getAuthor().size());
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
// verify title
int count = 0;
@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
if (title.getQualifier().getClassid().equals("main title"))
count++;
}
assertEquals(count, 1);
assertEquals(1, count);
}
@Test
@ -130,9 +131,9 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
assertEquals(pub_merged.getAuthor().size(), 27);
assertEquals(27, pub_merged.getAuthor().size());
}
@Test
@ -142,7 +143,7 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
}
@Test
@ -152,17 +153,24 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
}
@Test
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
System.out
.println(
publications5
.stream()
.map(p -> p._2().getId())
.collect(Collectors.toList()));
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
}
public DataInfo setDI() {

View File

@ -7,97 +7,57 @@ import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.MethodOrderer;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestMethodOrder;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class IdGeneratorTest {
private static List<Identifier> bestIds;
private static List<Tuple2<String, Publication>> pubs;
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
private static List<Identifier> bestIds2;
private static List<Identifier> bestIds3;
private static List<Identifier<Publication>> bestIds;
private static List<Identifier<Publication>> bestIds2;
private static List<Identifier<Publication>> bestIds3;
private static String testEntityBasePath;
private static SimpleDateFormat sdf;
private static Date baseDate;
@BeforeAll
public static void setUp() throws Exception {
sdf = new SimpleDateFormat("yyyy-MM-dd");
baseDate = sdf.parse("2000-01-01");
bestIds = new ArrayList<>();
bestIds2 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
bestIds3 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
testEntityBasePath = Paths
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
.toFile()
.getAbsolutePath();
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
}
@Test
@Order(1)
public void bestPidToIdentifierTest() {
List<String> typesForAssertions = Lists
.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
for (Tuple2<String, Publication> pub : pubs) {
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
bestIds.addAll(ids);
}
}
@Test
@Order(2)
public void generateIdTest1() {
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
System.out
.println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
.println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1);
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
}
@Test
@ -106,14 +66,22 @@ public class IdGeneratorTest {
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
System.out
.println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
.println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 2 = " + id1);
System.out
.println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
.println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 3 = " + id2);
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
}
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
final Stream<Identifier<T>> ids = readSample(path, clazz)
.stream()
.map(Tuple2::_2)
.map(Identifier::newInstance);
return ids.collect(Collectors.toList());
}
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
@ -127,7 +95,7 @@ public class IdGeneratorTest {
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
OBJECT_MAPPER.readValue(line, clazz)));
// read next line
line = reader.readLine();
}
@ -140,23 +108,10 @@ public class IdGeneratorTest {
}
public static StructuredProperty pid(String pid, String classid, String classname) {
StructuredProperty sp = new StructuredProperty();
sp.setValue(pid);
Qualifier q = new Qualifier();
q.setSchemeid(classid);
q.setSchemename(classname);
q.setClassname(classname);
q.setClassid(classid);
sp.setQualifier(q);
return sp;
return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
}
public static List<KeyValue> keyValue(String key, String value) {
KeyValue kv = new KeyValue();
kv.setKey(key);
kv.setValue(value);
return Lists.newArrayList(kv);
return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
}
}

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }