forked from antonis.lempesis/dnet-hadoop
dedup workflow using the common PidComparator
This commit is contained in:
parent
ea2a0ea949
commit
e5da4ee9b1
|
@ -0,0 +1,27 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class OrganizationPidComparator implements Comparator<PidType> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(PidType pLeft, PidType pRight) {
|
||||||
|
if (pLeft.equals(PidType.GRID))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.GRID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.mag_id))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.mag_id))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.urn))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.urn))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -45,70 +45,10 @@ public class PidComparator<T extends OafEntity> implements Comparator<Structured
|
||||||
}
|
}
|
||||||
|
|
||||||
private int compareResultPids(PidType lClass, PidType rClass) {
|
private int compareResultPids(PidType lClass, PidType rClass) {
|
||||||
if (lClass.equals(PidType.doi))
|
return new ResultPidComparator().compare(lClass, rClass);
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.doi))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.pmid))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.pmid))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.pmc))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.pmc))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.handle))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.handle))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.arXiv))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.arXiv))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.NCID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.NCID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.GBIF))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.GBIF))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.nct))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.nct))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.urn))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.urn))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
||||||
if (lClass.equals(PidType.GRID))
|
return new OrganizationPidComparator().compare(lClass, rClass);
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.GRID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.mag_id))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.mag_id))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.urn))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.urn))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,10 +9,21 @@ public enum PidType {
|
||||||
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
||||||
|
|
||||||
// Organization
|
// Organization
|
||||||
GRID, mag_id, urn;
|
GRID, mag_id, urn,
|
||||||
|
|
||||||
|
// Used by dedup
|
||||||
|
undefined, original;
|
||||||
|
|
||||||
public static boolean isValid(String type) {
|
public static boolean isValid(String type) {
|
||||||
return EnumUtils.isValidEnum(PidType.class, type);
|
return EnumUtils.isValidEnum(PidType.class, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static PidType tryValueOf(String s) {
|
||||||
|
try {
|
||||||
|
return PidType.valueOf(s);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return PidType.original;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class ResultPidComparator implements Comparator<PidType> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(PidType pLeft, PidType pRight) {
|
||||||
|
if (pLeft.equals(PidType.doi))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.doi))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.pmid))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.pmid))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.pmc))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.pmc))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.handle))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.handle))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.arXiv))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.arXiv))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.NCID))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.NCID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.GBIF))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.GBIF))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.nct))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.nct))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.urn))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.urn))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -82,7 +82,7 @@ public class DedupRecordFactory {
|
||||||
|
|
||||||
final Collection<String> dates = Lists.newArrayList();
|
final Collection<String> dates = Lists.newArrayList();
|
||||||
final List<List<Author>> authors = Lists.newArrayList();
|
final List<List<Author>> authors = Lists.newArrayList();
|
||||||
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
|
final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list
|
||||||
|
|
||||||
entities
|
entities
|
||||||
.forEachRemaining(
|
.forEachRemaining(
|
||||||
|
@ -90,7 +90,7 @@ public class DedupRecordFactory {
|
||||||
T duplicate = t._2();
|
T duplicate = t._2();
|
||||||
|
|
||||||
// prepare the list of pids to use for the id generation
|
// prepare the list of pids to use for the id generation
|
||||||
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
|
bestPids.add(Identifier.newInstance(duplicate));
|
||||||
|
|
||||||
entity.mergeFrom(duplicate);
|
entity.mergeFrom(duplicate);
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
|
|
|
@ -1,124 +1,46 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import static org.apache.commons.lang3.StringUtils.substringAfter;
|
||||||
|
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.text.ParseException;
|
import java.util.List;
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
public class IdGenerator implements Serializable {
|
public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
|
||||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
|
||||||
public static String BASE_DATE = "2000-01-01";
|
|
||||||
|
|
||||||
// pick the best pid from the list (consider date and pidtype)
|
// pick the best pid from the list (consider date and pidtype)
|
||||||
public static String generate(List<Identifier> pids, String defaultID) {
|
public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
|
||||||
if (pids == null || pids.size() == 0)
|
if (pids == null || pids.size() == 0)
|
||||||
return defaultID;
|
return defaultID;
|
||||||
|
|
||||||
Optional<Identifier> bp = pids
|
Identifier<T> bp = pids
|
||||||
.stream()
|
.stream()
|
||||||
.max(Identifier::compareTo);
|
.min(Identifier::compareTo)
|
||||||
|
.get();
|
||||||
|
|
||||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
String prefix = substringBefore(bp.getOriginalID(), "|");
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
|
||||||
+ DHPUtils.md5(bp.get().getOriginalID());
|
String suffix = substringAfter(bp.getOriginalID(), "::");
|
||||||
|
|
||||||
|
final String pidType = substringBefore(ns, "_");
|
||||||
|
if (PidType.isValid(pidType)) {
|
||||||
|
return prefix + "|" + dedupify(ns) + "::" + suffix;
|
||||||
} else {
|
} else {
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
return prefix + "|dedup_wf_001::" + suffix;
|
||||||
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
|
||||||
+ DHPUtils.md5(bp.get().getPid().getValue());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
|
private static String dedupify(String ns) {
|
||||||
|
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
|
||||||
Date date;
|
|
||||||
try {
|
|
||||||
date = sdf.parse(BASE_DATE);
|
|
||||||
} catch (ParseException e) {
|
|
||||||
date = new Date();
|
|
||||||
}
|
|
||||||
return Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
|
|
||||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
|
||||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
|
||||||
|
|
||||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
|
||||||
|
|
||||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
|
||||||
return createBasePid(entity, sdf);
|
|
||||||
|
|
||||||
Optional<StructuredProperty> bp = entity
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
|
|
||||||
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
|
|
||||||
|
|
||||||
return bp
|
|
||||||
.map(
|
|
||||||
structuredProperty -> Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(structuredProperty, extractDate(entity, sdf),
|
|
||||||
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
|
||||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
|
||||||
.orElseGet(() -> createBasePid(entity, sdf));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// create the prefix (length = 12): dedup_+ pidType
|
|
||||||
public static String createPrefix(String pidType) {
|
|
||||||
|
|
||||||
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
|
|
||||||
|
|
||||||
while (prefix.length() < 12) {
|
while (prefix.length() < 12) {
|
||||||
prefix.append("_");
|
prefix.append("_");
|
||||||
}
|
}
|
||||||
return prefix.toString().substring(0, 12);
|
return prefix.substring(0, 12);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
|
|
||||||
// 00-01-01
|
|
||||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
|
||||||
|
|
||||||
String date = BASE_DATE;
|
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
|
||||||
Result result = (Result) duplicate;
|
|
||||||
if (isWellformed(result.getDateofacceptance())) {
|
|
||||||
date = result.getDateofacceptance().getValue();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
return sdf.parse(date);
|
|
||||||
} catch (ParseException e) {
|
|
||||||
return new Date();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean isWellformed(Field<String> date) {
|
|
||||||
return date != null && StringUtils.isNotBlank(date.getValue())
|
|
||||||
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,94 +2,85 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup.model;
|
package eu.dnetlib.dhp.oa.dedup.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
|
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
public class Identifier implements Serializable, Comparable<Identifier> {
|
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
|
||||||
|
|
||||||
StructuredProperty pid;
|
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||||
Date date;
|
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||||
PidType type;
|
public static String BASE_DATE = "2000-01-01";
|
||||||
List<KeyValue> collectedFrom;
|
|
||||||
EntityType entityType;
|
|
||||||
String originalID;
|
|
||||||
|
|
||||||
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
|
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||||
// ID
|
|
||||||
|
|
||||||
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
|
private T entity;
|
||||||
EntityType entityType, String originalID) {
|
|
||||||
this.pid = pid;
|
public static <T extends OafEntity> Identifier newInstance(T entity) {
|
||||||
this.date = date;
|
return new Identifier(entity);
|
||||||
this.type = type;
|
|
||||||
this.collectedFrom = collectedFrom;
|
|
||||||
this.entityType = entityType;
|
|
||||||
this.originalID = originalID;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public StructuredProperty getPid() {
|
public Identifier(T entity) {
|
||||||
return pid;
|
this.entity = entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setPid(StructuredProperty pid) {
|
public T getEntity() {
|
||||||
this.pid = pid;
|
return entity;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEntity(T entity) {
|
||||||
|
this.entity = entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Date getDate() {
|
public Date getDate() {
|
||||||
return date;
|
String date = BASE_DATE;
|
||||||
|
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
|
||||||
|
Result result = (Result) getEntity();
|
||||||
|
if (isWellformed(result.getDateofacceptance())) {
|
||||||
|
date = result.getDateofacceptance().getValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
return sdf.parse(date);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
return new Date();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDate(Date date) {
|
private static boolean isWellformed(Field<String> date) {
|
||||||
this.date = date;
|
return date != null && StringUtils.isNotBlank(date.getValue())
|
||||||
}
|
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
||||||
|
|
||||||
public PidType getType() {
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setType(PidType type) {
|
|
||||||
this.type = type;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<KeyValue> getCollectedFrom() {
|
public List<KeyValue> getCollectedFrom() {
|
||||||
return collectedFrom;
|
return entity.getCollectedfrom();
|
||||||
}
|
|
||||||
|
|
||||||
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
|
||||||
this.collectedFrom = collectedFrom;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public EntityType getEntityType() {
|
public EntityType getEntityType() {
|
||||||
return entityType;
|
return EntityType.fromClass(entity.getClass());
|
||||||
}
|
|
||||||
|
|
||||||
public void setEntityType(EntityType entityType) {
|
|
||||||
this.entityType = entityType;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOriginalID() {
|
public String getOriginalID() {
|
||||||
return originalID;
|
return entity.getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOriginalID(String originalID) {
|
private PidType getPidType() {
|
||||||
this.originalID = originalID;
|
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isUseOriginal() {
|
|
||||||
return useOriginal;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setUseOriginal(boolean useOriginal) {
|
|
||||||
this.useOriginal = useOriginal;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -97,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||||
// alphabetical order of the originalID
|
// alphabetical order of the originalID
|
||||||
|
|
||||||
Set<String> lKeys = Sets.newHashSet();
|
Set<String> lKeys = Optional
|
||||||
if (this.collectedFrom != null)
|
.ofNullable(getCollectedFrom())
|
||||||
lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||||
|
.orElse(Sets.newHashSet());
|
||||||
|
|
||||||
Set<String> rKeys = Sets.newHashSet();
|
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
|
||||||
if (i.getCollectedFrom() != null)
|
Set<String> rKeys = cf
|
||||||
rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||||
|
.orElse(Sets.newHashSet());
|
||||||
|
|
||||||
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
|
||||||
if (entityType == EntityType.publication) {
|
if (getEntityType() == EntityType.publication) {
|
||||||
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
|
if (isFromDatasourceID(lKeys, CROSSREF_ID)
|
||||||
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
|
&& !isFromDatasourceID(rKeys, CROSSREF_ID))
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
|
|
||||||
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
|
|
||||||
return -1;
|
return -1;
|
||||||
|
if (isFromDatasourceID(rKeys, CROSSREF_ID)
|
||||||
|
&& !isFromDatasourceID(lKeys, CROSSREF_ID))
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
if (entityType == EntityType.dataset) {
|
if (getEntityType() == EntityType.dataset) {
|
||||||
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
|
if (isFromDatasourceID(lKeys, DATACITE_ID)
|
||||||
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
|
&& !isFromDatasourceID(rKeys, DATACITE_ID))
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
|
|
||||||
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
|
|
||||||
return -1;
|
return -1;
|
||||||
|
if (isFromDatasourceID(rKeys, DATACITE_ID)
|
||||||
|
&& !isFromDatasourceID(lKeys, DATACITE_ID))
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||||
|
|
||||||
if (this.originalID.compareTo(i.originalID) < 0)
|
|
||||||
this.useOriginal = true;
|
|
||||||
else
|
|
||||||
i.setUseOriginal(true);
|
|
||||||
|
|
||||||
// the minus because we need to take the alphabetically lower id
|
// the minus because we need to take the alphabetically lower id
|
||||||
return -this.originalID.compareTo(i.originalID);
|
return this.getOriginalID().compareTo(i.getOriginalID());
|
||||||
} else
|
} else
|
||||||
// the minus is because we need to take the elder date
|
// the minus is because we need to take the elder date
|
||||||
return -this.getDate().compareTo(i.getDate());
|
return this.getDate().compareTo(i.getDate());
|
||||||
} else {
|
} else {
|
||||||
return this.getType().compareTo(i.getType());
|
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private StructuredProperty toSP(PidType pidType) {
|
||||||
|
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||||
return collectedFrom.contains(dsId);
|
return collectedFrom.contains(dsId);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup.model;
|
|
||||||
|
|
||||||
public enum PidType {
|
|
||||||
|
|
||||||
// from the less to the more important
|
|
||||||
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
|
|
||||||
|
|
||||||
public static PidType classidValueOf(String s) {
|
|
||||||
try {
|
|
||||||
return PidType.valueOf(s);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return PidType.undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -9,6 +9,7 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
@ -21,16 +22,16 @@ import scala.Tuple2;
|
||||||
|
|
||||||
public class EntityMergerTest implements Serializable {
|
public class EntityMergerTest implements Serializable {
|
||||||
|
|
||||||
List<Tuple2<String, Publication>> publications;
|
private List<Tuple2<String, Publication>> publications;
|
||||||
List<Tuple2<String, Publication>> publications2;
|
private List<Tuple2<String, Publication>> publications2;
|
||||||
List<Tuple2<String, Publication>> publications3;
|
private List<Tuple2<String, Publication>> publications3;
|
||||||
List<Tuple2<String, Publication>> publications4;
|
private List<Tuple2<String, Publication>> publications4;
|
||||||
List<Tuple2<String, Publication>> publications5;
|
private List<Tuple2<String, Publication>> publications5;
|
||||||
|
|
||||||
String testEntityBasePath;
|
private String testEntityBasePath;
|
||||||
DataInfo dataInfo;
|
private DataInfo dataInfo;
|
||||||
String dedupId = "00|dedup_id::1";
|
private String dedupId = "00|dedup_id::1";
|
||||||
Publication pub_top;
|
private Publication pub_top;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
|
@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
|
||||||
Software merged = DedupRecordFactory
|
Software merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
||||||
|
|
||||||
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
|
assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
|
||||||
|
|
||||||
assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());
|
||||||
|
|
||||||
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
|
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
|
||||||
assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN");
|
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
|
||||||
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
|
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
|
||||||
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
|
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
|
||||||
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
|
assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
|
||||||
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
|
assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
|
||||||
assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
|
assertEquals(pub_top.getResourcetype().getClassid(), "");
|
||||||
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
|
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
|
||||||
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
|
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
|
||||||
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
|
assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
|
||||||
assertEquals(pub_merged.getInstance().size(), 3);
|
assertEquals(3, pub_merged.getInstance().size());
|
||||||
assertEquals(pub_merged.getCountry().size(), 2);
|
assertEquals(2, pub_merged.getCountry().size());
|
||||||
assertEquals(pub_merged.getSubject().size(), 0);
|
assertEquals(0, pub_merged.getSubject().size());
|
||||||
assertEquals(pub_merged.getTitle().size(), 2);
|
assertEquals(2, pub_merged.getTitle().size());
|
||||||
assertEquals(pub_merged.getRelevantdate().size(), 0);
|
assertEquals(0, pub_merged.getRelevantdate().size());
|
||||||
assertEquals(pub_merged.getDescription().size(), 0);
|
assertEquals(0, pub_merged.getDescription().size());
|
||||||
assertEquals(pub_merged.getSource().size(), 0);
|
assertEquals(0, pub_merged.getSource().size());
|
||||||
assertEquals(pub_merged.getFulltext().size(), 0);
|
assertEquals(0, pub_merged.getFulltext().size());
|
||||||
assertEquals(pub_merged.getFormat().size(), 0);
|
assertEquals(0, pub_merged.getFormat().size());
|
||||||
assertEquals(pub_merged.getContributor().size(), 0);
|
assertEquals(0, pub_merged.getContributor().size());
|
||||||
assertEquals(pub_merged.getCoverage().size(), 0);
|
assertEquals(0, pub_merged.getCoverage().size());
|
||||||
assertEquals(pub_merged.getContext().size(), 0);
|
assertEquals(0, pub_merged.getContext().size());
|
||||||
assertEquals(pub_merged.getExternalReference().size(), 0);
|
assertEquals(0, pub_merged.getExternalReference().size());
|
||||||
assertEquals(pub_merged.getOriginalId().size(), 3);
|
assertEquals(3, pub_merged.getOriginalId().size());
|
||||||
assertEquals(pub_merged.getCollectedfrom().size(), 3);
|
assertEquals(3, pub_merged.getCollectedfrom().size());
|
||||||
assertEquals(pub_merged.getPid().size(), 1);
|
assertEquals(1, pub_merged.getPid().size());
|
||||||
assertEquals(pub_merged.getExtraInfo().size(), 0);
|
assertEquals(0, pub_merged.getExtraInfo().size());
|
||||||
|
|
||||||
// verify datainfo
|
// verify datainfo
|
||||||
assertEquals(pub_merged.getDataInfo(), dataInfo);
|
assertEquals(dataInfo, pub_merged.getDataInfo());
|
||||||
|
|
||||||
// verify datepicker
|
// verify datepicker
|
||||||
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
|
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
||||||
|
|
||||||
// verify authors
|
// verify authors
|
||||||
assertEquals(pub_merged.getAuthor().size(), 9);
|
assertEquals(9, pub_merged.getAuthor().size());
|
||||||
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
|
||||||
|
|
||||||
// verify title
|
// verify title
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
if (title.getQualifier().getClassid().equals("main title"))
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
assertEquals(count, 1);
|
assertEquals(1, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -130,9 +131,9 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
|
|
||||||
assertEquals(pub_merged.getAuthor().size(), 27);
|
assertEquals(27, pub_merged.getAuthor().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -142,7 +143,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -152,17 +153,24 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||||
|
|
||||||
|
System.out
|
||||||
|
.println(
|
||||||
|
publications5
|
||||||
|
.stream()
|
||||||
|
.map(p -> p._2().getId())
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory
|
Publication pub_merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
public DataInfo setDI() {
|
public DataInfo setDI() {
|
||||||
|
|
|
@ -7,97 +7,57 @@ import java.io.BufferedReader;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.MethodOrderer;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.TestMethodOrder;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class IdGeneratorTest {
|
public class IdGeneratorTest {
|
||||||
|
|
||||||
private static List<Identifier> bestIds;
|
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
private static List<Tuple2<String, Publication>> pubs;
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
private static List<Identifier> bestIds2;
|
private static List<Identifier<Publication>> bestIds;
|
||||||
private static List<Identifier> bestIds3;
|
private static List<Identifier<Publication>> bestIds2;
|
||||||
|
private static List<Identifier<Publication>> bestIds3;
|
||||||
|
|
||||||
private static String testEntityBasePath;
|
private static String testEntityBasePath;
|
||||||
|
|
||||||
private static SimpleDateFormat sdf;
|
|
||||||
private static Date baseDate;
|
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setUp() throws Exception {
|
public static void setUp() throws Exception {
|
||||||
|
|
||||||
sdf = new SimpleDateFormat("yyyy-MM-dd");
|
|
||||||
baseDate = sdf.parse("2000-01-01");
|
|
||||||
|
|
||||||
bestIds = new ArrayList<>();
|
|
||||||
bestIds2 = Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
|
||||||
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
|
||||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
|
|
||||||
bestIds3 = Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
|
||||||
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
|
|
||||||
EntityType.publication, "50|originalID2"),
|
|
||||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
|
|
||||||
|
|
||||||
testEntityBasePath = Paths
|
testEntityBasePath = Paths
|
||||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||||
.toFile()
|
.toFile()
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
||||||
|
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
|
||||||
|
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(1)
|
|
||||||
public void bestPidToIdentifierTest() {
|
|
||||||
|
|
||||||
List<String> typesForAssertions = Lists
|
|
||||||
.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
|
|
||||||
|
|
||||||
for (Tuple2<String, Publication> pub : pubs) {
|
|
||||||
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
|
|
||||||
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
|
|
||||||
bestIds.addAll(ids);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Order(2)
|
|
||||||
public void generateIdTest1() {
|
public void generateIdTest1() {
|
||||||
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
||||||
|
|
||||||
System.out
|
System.out
|
||||||
.println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
.println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
|
|
||||||
assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1);
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -106,14 +66,22 @@ public class IdGeneratorTest {
|
||||||
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
||||||
|
|
||||||
System.out
|
System.out
|
||||||
.println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
.println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
System.out.println("winner 2 = " + id1);
|
System.out.println("winner 2 = " + id1);
|
||||||
System.out
|
System.out
|
||||||
.println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
.println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
System.out.println("winner 3 = " + id2);
|
System.out.println("winner 3 = " + id2);
|
||||||
|
|
||||||
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
|
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
||||||
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
|
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
|
||||||
|
final Stream<Identifier<T>> ids = readSample(path, clazz)
|
||||||
|
.stream()
|
||||||
|
.map(Tuple2::_2)
|
||||||
|
.map(Identifier::newInstance);
|
||||||
|
return ids.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||||
|
@ -127,7 +95,7 @@ public class IdGeneratorTest {
|
||||||
.add(
|
.add(
|
||||||
new Tuple2<>(
|
new Tuple2<>(
|
||||||
MapDocumentUtil.getJPathString("$.id", line),
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
new ObjectMapper().readValue(line, clazz)));
|
OBJECT_MAPPER.readValue(line, clazz)));
|
||||||
// read next line
|
// read next line
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
}
|
}
|
||||||
|
@ -140,23 +108,10 @@ public class IdGeneratorTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty pid(String pid, String classid, String classname) {
|
public static StructuredProperty pid(String pid, String classid, String classname) {
|
||||||
|
return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
|
||||||
StructuredProperty sp = new StructuredProperty();
|
|
||||||
sp.setValue(pid);
|
|
||||||
Qualifier q = new Qualifier();
|
|
||||||
q.setSchemeid(classid);
|
|
||||||
q.setSchemename(classname);
|
|
||||||
q.setClassname(classname);
|
|
||||||
q.setClassid(classid);
|
|
||||||
sp.setQualifier(q);
|
|
||||||
return sp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<KeyValue> keyValue(String key, String value) {
|
public static List<KeyValue> keyValue(String key, String value) {
|
||||||
|
return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
|
||||||
KeyValue kv = new KeyValue();
|
|
||||||
kv.setKey(key);
|
|
||||||
kv.setValue(value);
|
|
||||||
return Lists.newArrayList(kv);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,3 @@
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
|
@ -0,0 +1,3 @@
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue