forked from antonis.lempesis/dnet-hadoop

dedup workflow using the common PidComparator

commit e5da4ee9b1 (parent ea2a0ea949)
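The change moves the pid ranking out of the dedup module and into the shared eu.dnetlib.dhp.schema.oaf.utils comparators: PidComparator now delegates to ResultPidComparator and OrganizationPidComparator, and the dedup Identifier/IdGenerator classes reuse the shared PidType enum. As a rough, illustrative sketch of how the new comparators are meant to be used (the class names come from the hunks below, the sample values are arbitrary):

    import java.util.Arrays;
    import java.util.List;

    import eu.dnetlib.dhp.schema.oaf.utils.PidType;
    import eu.dnetlib.dhp.schema.oaf.utils.ResultPidComparator;

    public class PidRankingSketch {
        public static void main(String[] args) {
            // ResultPidComparator (added in this commit) ranks doi first, then pmid, pmc, handle, ...
            List<PidType> pids = Arrays.asList(PidType.handle, PidType.pmid, PidType.doi);
            pids.sort(new ResultPidComparator());
            System.out.println(pids); // [doi, pmid, handle]
        }
    }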
@@ -0,0 +1,27 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.Comparator;
+
+public class OrganizationPidComparator implements Comparator<PidType> {
+
+    @Override
+    public int compare(PidType pLeft, PidType pRight) {
+        if (pLeft.equals(PidType.GRID))
+            return -1;
+        if (pRight.equals(PidType.GRID))
+            return 1;
+
+        if (pLeft.equals(PidType.mag_id))
+            return -1;
+        if (pRight.equals(PidType.mag_id))
+            return 1;
+
+        if (pLeft.equals(PidType.urn))
+            return -1;
+        if (pRight.equals(PidType.urn))
+            return 1;
+
+        return 0;
+    }
+}
@@ -45,70 +45,10 @@ public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
     }

     private int compareResultPids(PidType lClass, PidType rClass) {
-        if (lClass.equals(PidType.doi))
-            return -1;
-        if (rClass.equals(PidType.doi))
-            return 1;
-
-        if (lClass.equals(PidType.pmid))
-            return -1;
-        if (rClass.equals(PidType.pmid))
-            return 1;
-
-        if (lClass.equals(PidType.pmc))
-            return -1;
-        if (rClass.equals(PidType.pmc))
-            return 1;
-
-        if (lClass.equals(PidType.handle))
-            return -1;
-        if (rClass.equals(PidType.handle))
-            return 1;
-
-        if (lClass.equals(PidType.arXiv))
-            return -1;
-        if (rClass.equals(PidType.arXiv))
-            return 1;
-
-        if (lClass.equals(PidType.NCID))
-            return -1;
-        if (rClass.equals(PidType.NCID))
-            return 1;
-
-        if (lClass.equals(PidType.GBIF))
-            return -1;
-        if (rClass.equals(PidType.GBIF))
-            return 1;
-
-        if (lClass.equals(PidType.nct))
-            return -1;
-        if (rClass.equals(PidType.nct))
-            return 1;
-
-        if (lClass.equals(PidType.urn))
-            return -1;
-        if (rClass.equals(PidType.urn))
-            return 1;
-
-        return 0;
+        return new ResultPidComparator().compare(lClass, rClass);
     }

     private int compareOrganizationtPids(PidType lClass, PidType rClass) {
-        if (lClass.equals(PidType.GRID))
-            return -1;
-        if (rClass.equals(PidType.GRID))
-            return 1;
-
-        if (lClass.equals(PidType.mag_id))
-            return -1;
-        if (rClass.equals(PidType.mag_id))
-            return 1;
-
-        if (lClass.equals(PidType.urn))
-            return -1;
-        if (rClass.equals(PidType.urn))
-            return 1;
-
-        return 0;
+        return new OrganizationPidComparator().compare(lClass, rClass);
     }
 }
@@ -9,10 +9,21 @@ public enum PidType {
     doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,

     // Organization
-    GRID, mag_id, urn;
+    GRID, mag_id, urn,
+
+    // Used by dedup
+    undefined, original;

     public static boolean isValid(String type) {
         return EnumUtils.isValidEnum(PidType.class, type);
     }

+    public static PidType tryValueOf(String s) {
+        try {
+            return PidType.valueOf(s);
+        } catch (Exception e) {
+            return PidType.original;
+        }
+    }
+
 }
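The tryValueOf helper added above is what lets the dedup workflow map the namespace prefix of an OpenAIRE identifier to a pid type without throwing on unknown values. A minimal sketch of its behaviour (the DansKnawCris value is taken from the test fixtures later in this diff):

    import eu.dnetlib.dhp.schema.oaf.utils.PidType;

    public class PidTypeSketch {
        public static void main(String[] args) {
            System.out.println(PidType.isValid("doi"));              // true
            System.out.println(PidType.tryValueOf("doi"));           // doi
            // unknown class ids fall back to PidType.original instead of throwing
            System.out.println(PidType.tryValueOf("DansKnawCris"));  // original
        }
    }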
@@ -0,0 +1,57 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.Comparator;
+
+public class ResultPidComparator implements Comparator<PidType> {
+
+    @Override
+    public int compare(PidType pLeft, PidType pRight) {
+        if (pLeft.equals(PidType.doi))
+            return -1;
+        if (pRight.equals(PidType.doi))
+            return 1;
+
+        if (pLeft.equals(PidType.pmid))
+            return -1;
+        if (pRight.equals(PidType.pmid))
+            return 1;
+
+        if (pLeft.equals(PidType.pmc))
+            return -1;
+        if (pRight.equals(PidType.pmc))
+            return 1;
+
+        if (pLeft.equals(PidType.handle))
+            return -1;
+        if (pRight.equals(PidType.handle))
+            return 1;
+
+        if (pLeft.equals(PidType.arXiv))
+            return -1;
+        if (pRight.equals(PidType.arXiv))
+            return 1;
+
+        if (pLeft.equals(PidType.NCID))
+            return -1;
+        if (pRight.equals(PidType.NCID))
+            return 1;
+
+        if (pLeft.equals(PidType.GBIF))
+            return -1;
+        if (pRight.equals(PidType.GBIF))
+            return 1;
+
+        if (pLeft.equals(PidType.nct))
+            return -1;
+        if (pRight.equals(PidType.nct))
+            return 1;
+
+        if (pLeft.equals(PidType.urn))
+            return -1;
+        if (pRight.equals(PidType.urn))
+            return 1;
+
+        return 0;
+    }
+}
@@ -82,7 +82,7 @@ public class DedupRecordFactory {

         final Collection<String> dates = Lists.newArrayList();
         final List<List<Author>> authors = Lists.newArrayList();
-        final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
+        final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list

         entities
             .forEachRemaining(
@@ -90,7 +90,7 @@ public class DedupRecordFactory {
                     T duplicate = t._2();

                     // prepare the list of pids to use for the id generation
-                    bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
+                    bestPids.add(Identifier.newInstance(duplicate));

                     entity.mergeFrom(duplicate);
                     if (ModelSupport.isSubClass(duplicate, Result.class)) {
@@ -1,124 +1,46 @@
 
 package eu.dnetlib.dhp.oa.dedup;

+import static org.apache.commons.lang3.StringUtils.substringAfter;
+import static org.apache.commons.lang3.StringUtils.substringBefore;
+
 import java.io.Serializable;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.*;
-
-import org.apache.commons.lang.StringUtils;
-
-import com.google.common.collect.Lists;
+import java.util.List;

 import eu.dnetlib.dhp.oa.dedup.model.Identifier;
-import eu.dnetlib.dhp.oa.dedup.model.PidType;
-import eu.dnetlib.dhp.schema.common.EntityType;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Result;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;

 public class IdGenerator implements Serializable {

-    public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
-    public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
-    public static String BASE_DATE = "2000-01-01";
-
     // pick the best pid from the list (consider date and pidtype)
-    public static String generate(List<Identifier> pids, String defaultID) {
+    public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
         if (pids == null || pids.size() == 0)
             return defaultID;

-        Optional<Identifier> bp = pids
+        Identifier<T> bp = pids
             .stream()
-            .max(Identifier::compareTo);
+            .min(Identifier::compareTo)
+            .get();

-        if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
-            return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
-                + DHPUtils.md5(bp.get().getOriginalID());
+        String prefix = substringBefore(bp.getOriginalID(), "|");
+        String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
+        String suffix = substringAfter(bp.getOriginalID(), "::");
+
+        final String pidType = substringBefore(ns, "_");
+        if (PidType.isValid(pidType)) {
+            return prefix + "|" + dedupify(ns) + "::" + suffix;
         } else {
-            return bp.get().getOriginalID().split("\\|")[0] + "|"
-                + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
-                + DHPUtils.md5(bp.get().getPid().getValue());
+            return prefix + "|dedup_wf_001::" + suffix;
         }
     }

-    public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
-
-        Date date;
-        try {
-            date = sdf.parse(BASE_DATE);
-        } catch (ParseException e) {
-            date = new Date();
-        }
-        return Lists
-            .newArrayList(
-                new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
-                    EntityType.fromClass(entity.getClass()), entity.getId()));
-    }
-
-    // pick the best pid from the entity. Returns a list (length 1) to save time in the call
-    public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
-
-        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
-
-        if (entity.getPid() == null || entity.getPid().size() == 0)
-            return createBasePid(entity, sdf);
-
-        Optional<StructuredProperty> bp = entity
-            .getPid()
-            .stream()
-            .filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
-            .max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
-
-        return bp
-            .map(
-                structuredProperty -> Lists
-                    .newArrayList(
-                        new Identifier(structuredProperty, extractDate(entity, sdf),
-                            PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
-                            entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
-            .orElseGet(() -> createBasePid(entity, sdf));
-
-    }
-
-    // create the prefix (length = 12): dedup_+ pidType
-    public static String createPrefix(String pidType) {
-
-        StringBuilder prefix = new StringBuilder("dedup_" + pidType);
-
+    private static String dedupify(String ns) {
+        StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
         while (prefix.length() < 12) {
             prefix.append("_");
         }
-        return prefix.toString().substring(0, 12);
-
-    }
-
-    // extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
-    // 00-01-01
-    public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
-
-        String date = BASE_DATE;
-        if (ModelSupport.isSubClass(duplicate, Result.class)) {
-            Result result = (Result) duplicate;
-            if (isWellformed(result.getDateofacceptance())) {
-                date = result.getDateofacceptance().getValue();
-            }
-        }
-
-        try {
-            return sdf.parse(date);
-        } catch (ParseException e) {
-            return new Date();
-        }
-    }
-
-    public static boolean isWellformed(Field<String> date) {
-        return date != null && StringUtils.isNotBlank(date.getValue())
-            && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
+        return prefix.substring(0, 12);
     }
+
 }
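With this change the dedup record id is no longer an md5 of the best pid value: generate() splits the best record's original id into prefix|namespace::suffix and, when the namespace starts with a valid pid type, only rewrites the namespace through dedupify(); otherwise it falls back to the dedup_wf_001 namespace. A small illustrative sketch of the namespace rewrite, mirroring dedupify() with the same commons-lang3 helper and an input taken from the test fixtures in this commit:

    import static org.apache.commons.lang3.StringUtils.substringBefore;

    public class DedupifySketch {

        // mirrors IdGenerator.dedupify(): keep the pid type, append "_dedup", pad to 12 chars
        static String dedupify(String ns) {
            StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
            while (prefix.length() < 12) {
                prefix.append("_");
            }
            return prefix.substring(0, 12);
        }

        public static void main(String[] args) {
            // "50|doi_________::1a77..." becomes "50|doi_dedup___::1a77..." (see IdGeneratorTest below)
            System.out.println("50|" + dedupify("doi_________") + "::1a77a3bba737f8b669dcf330ad3b37e2");
        }
    }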
@@ -2,94 +2,85 @@
 package eu.dnetlib.dhp.oa.dedup.model;

 import java.io.Serializable;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;
 import java.util.stream.Collectors;

+import org.apache.commons.lang3.StringUtils;
+
 import com.google.common.collect.Sets;

-import eu.dnetlib.dhp.oa.dedup.IdGenerator;
+import eu.dnetlib.dhp.oa.dedup.DatePicker;
 import eu.dnetlib.dhp.schema.common.EntityType;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
+import eu.dnetlib.dhp.schema.oaf.utils.PidType;

-public class Identifier implements Serializable, Comparable<Identifier> {
+public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {

-    StructuredProperty pid;
-    Date date;
-    PidType type;
-    List<KeyValue> collectedFrom;
-    EntityType entityType;
-    String originalID;
+    public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
+    public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
+    public static String BASE_DATE = "2000-01-01";

-    boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
-    // ID
+    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

-    public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
-        EntityType entityType, String originalID) {
-        this.pid = pid;
-        this.date = date;
-        this.type = type;
-        this.collectedFrom = collectedFrom;
-        this.entityType = entityType;
-        this.originalID = originalID;
+    private T entity;
+
+    public static <T extends OafEntity> Identifier newInstance(T entity) {
+        return new Identifier(entity);
     }

-    public StructuredProperty getPid() {
-        return pid;
+    public Identifier(T entity) {
+        this.entity = entity;
     }

-    public void setPid(StructuredProperty pid) {
-        this.pid = pid;
+    public T getEntity() {
+        return entity;
+    }
+
+    public void setEntity(T entity) {
+        this.entity = entity;
     }

     public Date getDate() {
-        return date;
+        String date = BASE_DATE;
+        if (ModelSupport.isSubClass(getEntity(), Result.class)) {
+            Result result = (Result) getEntity();
+            if (isWellformed(result.getDateofacceptance())) {
+                date = result.getDateofacceptance().getValue();
+            }
+        }
+        try {
+            return sdf.parse(date);
+        } catch (ParseException e) {
+            return new Date();
+        }
     }

-    public void setDate(Date date) {
-        this.date = date;
-    }
-
-    public PidType getType() {
-        return type;
-    }
-
-    public void setType(PidType type) {
-        this.type = type;
+    private static boolean isWellformed(Field<String> date) {
+        return date != null && StringUtils.isNotBlank(date.getValue())
+            && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
     }

     public List<KeyValue> getCollectedFrom() {
-        return collectedFrom;
-    }
-
-    public void setCollectedFrom(List<KeyValue> collectedFrom) {
-        this.collectedFrom = collectedFrom;
+        return entity.getCollectedfrom();
     }

     public EntityType getEntityType() {
-        return entityType;
-    }
-
-    public void setEntityType(EntityType entityType) {
-        this.entityType = entityType;
+        return EntityType.fromClass(entity.getClass());
     }

     public String getOriginalID() {
-        return originalID;
+        return entity.getId();
     }

-    public void setOriginalID(String originalID) {
-        this.originalID = originalID;
-    }
-
-    public boolean isUseOriginal() {
-        return useOriginal;
-    }
-
-    public void setUseOriginal(boolean useOriginal) {
-        this.useOriginal = useOriginal;
+    private PidType getPidType() {
+        return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
     }

     @Override
@@ -97,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
         // priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
         // alphabetical order of the originalID

-        Set<String> lKeys = Sets.newHashSet();
-        if (this.collectedFrom != null)
-            lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
+        Set<String> lKeys = Optional
+            .ofNullable(getCollectedFrom())
+            .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
+            .orElse(Sets.newHashSet());

-        Set<String> rKeys = Sets.newHashSet();
-        if (i.getCollectedFrom() != null)
-            rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
+        final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
+        Set<String> rKeys = cf
+            .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
+            .orElse(Sets.newHashSet());

-        if (this.getType().compareTo(i.getType()) == 0) { // same type
-            if (entityType == EntityType.publication) {
-                if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
-                    && !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
-                    return 1;
-                if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
-                    && !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
+        if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
+            if (getEntityType() == EntityType.publication) {
+                if (isFromDatasourceID(lKeys, CROSSREF_ID)
+                    && !isFromDatasourceID(rKeys, CROSSREF_ID))
+                    return -1;
+                if (isFromDatasourceID(rKeys, CROSSREF_ID)
+                    && !isFromDatasourceID(lKeys, CROSSREF_ID))
                     return 1;
             }
-            if (entityType == EntityType.dataset) {
-                if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
-                    && !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
-                    return 1;
-                if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
-                    && !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
+            if (getEntityType() == EntityType.dataset) {
+                if (isFromDatasourceID(lKeys, DATACITE_ID)
+                    && !isFromDatasourceID(rKeys, DATACITE_ID))
+                    return -1;
+                if (isFromDatasourceID(rKeys, DATACITE_ID)
+                    && !isFromDatasourceID(lKeys, DATACITE_ID))
                     return 1;
             }

             if (this.getDate().compareTo(i.getDate()) == 0) {// same date
-
-                if (this.originalID.compareTo(i.originalID) < 0)
-                    this.useOriginal = true;
-                else
-                    i.setUseOriginal(true);
-
-                // the minus because we need to take the alphabetically lower id
-                return -this.originalID.compareTo(i.originalID);
+                return this.getOriginalID().compareTo(i.getOriginalID());
             } else
-                // the minus is because we need to take the elder date
-                return -this.getDate().compareTo(i.getDate());
+                return this.getDate().compareTo(i.getDate());
         } else {
-            return this.getType().compareTo(i.getType());
+            return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
         }

     }

+    private StructuredProperty toSP(PidType pidType) {
+        return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
+    }
+
     public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
         return collectedFrom.contains(dsId);
     }
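Identifier now wraps the whole OafEntity and derives pid type, date, collectedfrom and original id from it, so the best record of a group is simply the minimum element under compareTo (pid type first, then Crossref/DataCite provenance, then the older date, then the lexicographically lower original id). An illustrative sketch of how a caller such as IdGenerator picks it; `duplicates` here is a hypothetical list of merged publications:

    import java.util.List;

    import eu.dnetlib.dhp.oa.dedup.model.Identifier;
    import eu.dnetlib.dhp.schema.oaf.Publication;

    public class BestRecordSketch {
        public static String bestOriginalId(List<Publication> duplicates) {
            return duplicates
                .stream()
                .map(Identifier::newInstance)
                .min(Identifier::compareTo) // lowest = best: pid type, provenance, date, then id
                .map(Identifier::getOriginalID)
                .orElseThrow(IllegalStateException::new);
        }
    }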
@@ -1,17 +0,0 @@
-
-package eu.dnetlib.dhp.oa.dedup.model;
-
-public enum PidType {
-
-    // from the less to the more important
-    undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
-
-    public static PidType classidValueOf(String s) {
-        try {
-            return PidType.valueOf(s);
-        } catch (Exception e) {
-            return PidType.undefined;
-        }
-    }
-
-}
@@ -9,6 +9,7 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.nio.file.Paths;
 import java.util.*;
+import java.util.stream.Collectors;

 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
@@ -21,16 +22,16 @@ import scala.Tuple2;

 public class EntityMergerTest implements Serializable {

-    List<Tuple2<String, Publication>> publications;
-    List<Tuple2<String, Publication>> publications2;
-    List<Tuple2<String, Publication>> publications3;
-    List<Tuple2<String, Publication>> publications4;
-    List<Tuple2<String, Publication>> publications5;
+    private List<Tuple2<String, Publication>> publications;
+    private List<Tuple2<String, Publication>> publications2;
+    private List<Tuple2<String, Publication>> publications3;
+    private List<Tuple2<String, Publication>> publications4;
+    private List<Tuple2<String, Publication>> publications5;

-    String testEntityBasePath;
-    DataInfo dataInfo;
-    String dedupId = "00|dedup_id::1";
-    Publication pub_top;
+    private String testEntityBasePath;
+    private DataInfo dataInfo;
+    private String dedupId = "00|dedup_id::1";
+    private Publication pub_top;

     @BeforeEach
     public void setUp() throws Exception {
@@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
         Software merged = DedupRecordFactory
             .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);

-        assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
+        assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());

-        assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
+        assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());

     }

@@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
             .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);

         // verify id
-        assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
+        assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());

-        assertEquals(pub_merged.getJournal(), pub_top.getJournal());
-        assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN");
-        assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
-        assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
-        assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
-        assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
-        assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
-        assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
-        assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
-        assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
-        assertEquals(pub_merged.getInstance().size(), 3);
-        assertEquals(pub_merged.getCountry().size(), 2);
-        assertEquals(pub_merged.getSubject().size(), 0);
-        assertEquals(pub_merged.getTitle().size(), 2);
-        assertEquals(pub_merged.getRelevantdate().size(), 0);
-        assertEquals(pub_merged.getDescription().size(), 0);
-        assertEquals(pub_merged.getSource().size(), 0);
-        assertEquals(pub_merged.getFulltext().size(), 0);
-        assertEquals(pub_merged.getFormat().size(), 0);
-        assertEquals(pub_merged.getContributor().size(), 0);
-        assertEquals(pub_merged.getCoverage().size(), 0);
-        assertEquals(pub_merged.getContext().size(), 0);
-        assertEquals(pub_merged.getExternalReference().size(), 0);
-        assertEquals(pub_merged.getOriginalId().size(), 3);
-        assertEquals(pub_merged.getCollectedfrom().size(), 3);
-        assertEquals(pub_merged.getPid().size(), 1);
-        assertEquals(pub_merged.getExtraInfo().size(), 0);
+        assertEquals(pub_top.getJournal(), pub_merged.getJournal());
+        assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
+        assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
+        assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
+        assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
+        assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+        assertEquals(pub_top.getResourcetype().getClassid(), "");
+        assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
+        assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
+        assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+        assertEquals(3, pub_merged.getInstance().size());
+        assertEquals(2, pub_merged.getCountry().size());
+        assertEquals(0, pub_merged.getSubject().size());
+        assertEquals(2, pub_merged.getTitle().size());
+        assertEquals(0, pub_merged.getRelevantdate().size());
+        assertEquals(0, pub_merged.getDescription().size());
+        assertEquals(0, pub_merged.getSource().size());
+        assertEquals(0, pub_merged.getFulltext().size());
+        assertEquals(0, pub_merged.getFormat().size());
+        assertEquals(0, pub_merged.getContributor().size());
+        assertEquals(0, pub_merged.getCoverage().size());
+        assertEquals(0, pub_merged.getContext().size());
+        assertEquals(0, pub_merged.getExternalReference().size());
+        assertEquals(3, pub_merged.getOriginalId().size());
+        assertEquals(3, pub_merged.getCollectedfrom().size());
+        assertEquals(1, pub_merged.getPid().size());
+        assertEquals(0, pub_merged.getExtraInfo().size());

         // verify datainfo
-        assertEquals(pub_merged.getDataInfo(), dataInfo);
+        assertEquals(dataInfo, pub_merged.getDataInfo());

         // verify datepicker
-        assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
+        assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());

         // verify authors
-        assertEquals(pub_merged.getAuthor().size(), 9);
-        assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
+        assertEquals(9, pub_merged.getAuthor().size());
+        assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));

         // verify title
         int count = 0;
@@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
             if (title.getQualifier().getClassid().equals("main title"))
                 count++;
         }
-        assertEquals(count, 1);
+        assertEquals(1, count);
     }

     @Test
@@ -130,9 +131,9 @@
             .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);

         // verify id
-        assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
+        assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());

-        assertEquals(pub_merged.getAuthor().size(), 27);
+        assertEquals(27, pub_merged.getAuthor().size());
     }

     @Test
@@ -142,7 +143,7 @@
             .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);

         // verify id
-        assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
+        assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
     }

     @Test
@@ -152,17 +153,24 @@
             .entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);

         // verify id
-        assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
+        assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
     }

     @Test
     public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {

+        System.out
+            .println(
+                publications5
+                    .stream()
+                    .map(p -> p._2().getId())
+                    .collect(Collectors.toList()));
+
         Publication pub_merged = DedupRecordFactory
             .entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);

         // verify id
-        assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
+        assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
     }

     public DataInfo setDI() {
@@ -7,97 +7,57 @@ import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.nio.file.Paths;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Date;
 import java.util.List;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;

-import org.codehaus.jackson.map.ObjectMapper;
-import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.MethodOrderer;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestMethodOrder;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Lists;

 import eu.dnetlib.dhp.oa.dedup.model.Identifier;
-import eu.dnetlib.dhp.oa.dedup.model.PidType;
-import eu.dnetlib.dhp.schema.common.EntityType;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Publication;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;

 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class IdGeneratorTest {

-    private static List<Identifier> bestIds;
-    private static List<Tuple2<String, Publication>> pubs;
+    private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
+        .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

-    private static List<Identifier> bestIds2;
-    private static List<Identifier> bestIds3;
+    private static List<Identifier<Publication>> bestIds;
+    private static List<Identifier<Publication>> bestIds2;
+    private static List<Identifier<Publication>> bestIds3;

     private static String testEntityBasePath;

-    private static SimpleDateFormat sdf;
-    private static Date baseDate;
-
     @BeforeAll
     public static void setUp() throws Exception {

-        sdf = new SimpleDateFormat("yyyy-MM-dd");
-        baseDate = sdf.parse("2000-01-01");
-
-        bestIds = new ArrayList<>();
-        bestIds2 = Lists
-            .newArrayList(
-                new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
-                    keyValue("key", "value"), EntityType.publication, "50|originalID1"),
-                new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
-                    keyValue("key", "value"), EntityType.publication, "50|originalID2"),
-                new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
-                    keyValue("key", "value"), EntityType.publication, "50|originalID3"));
-        bestIds3 = Lists
-            .newArrayList(
-                new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
-                    keyValue("key", "value"), EntityType.publication, "50|originalID1"),
-                new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
-                    EntityType.publication, "50|originalID2"),
-                new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
-                    keyValue("key", "value"), EntityType.publication, "50|originalID3"));
-
         testEntityBasePath = Paths
             .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
             .toFile()
             .getAbsolutePath();

-        pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
-
+        bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
+        bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
+        bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
     }

     @Test
     @Order(1)
-    public void bestPidToIdentifierTest() {
-
-        List<String> typesForAssertions = Lists
-            .newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
-
-        for (Tuple2<String, Publication> pub : pubs) {
-            List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
-            assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
-            bestIds.addAll(ids);
-        }
-    }
-
-    @Test
-    @Order(2)
     public void generateIdTest1() {
         String id1 = IdGenerator.generate(bestIds, "50|defaultID");

         System.out
-            .println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
+            .println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));

-        assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1);
+        assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
     }

     @Test
@@ -106,14 +66,22 @@ public class IdGeneratorTest {
         String id2 = IdGenerator.generate(bestIds3, "50|defaultID");

         System.out
-            .println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
+            .println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
         System.out.println("winner 2 = " + id1);
         System.out
-            .println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
+            .println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
         System.out.println("winner 3 = " + id2);

-        assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
-        assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
+        assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
+        assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
     }

+    protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
+        final Stream<Identifier<T>> ids = readSample(path, clazz)
+            .stream()
+            .map(Tuple2::_2)
+            .map(Identifier::newInstance);
+        return ids.collect(Collectors.toList());
+    }
+
     public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
@@ -140,23 +108,10 @@ public class IdGeneratorTest {
     }

     public static StructuredProperty pid(String pid, String classid, String classname) {
-
-        StructuredProperty sp = new StructuredProperty();
-        sp.setValue(pid);
-        Qualifier q = new Qualifier();
-        q.setSchemeid(classid);
-        q.setSchemename(classname);
-        q.setClassname(classname);
-        q.setClassid(classid);
-        sp.setQualifier(q);
-        return sp;
+        return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
     }

     public static List<KeyValue> keyValue(String key, String value) {
-
-        KeyValue kv = new KeyValue();
-        kv.setKey(key);
-        kv.setValue(value);
-        return Lists.newArrayList(kv);
+        return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
     }
 }

File diff suppressed because one or more lines are too long
@@ -0,0 +1,3 @@
+{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
+{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
+{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
@@ -0,0 +1,3 @@
+{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
+{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
+{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long