Dedup ID creation policy #48
|
@ -1,11 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -18,7 +15,6 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
@ -90,7 +86,7 @@ public class DedupRecordFactory {
|
|||
T duplicate = t._2();
|
||||
|
||||
// prepare the list of pids to use for the id generation
|
||||
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
|
||||
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
|
||||
|
||||
entity.mergeFrom(duplicate);
|
||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||
|
|
|
@ -70,17 +70,6 @@ public class DedupUtility {
|
|||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||
}
|
||||
|
||||
public static String md5(final String s) {
|
||||
try {
|
||||
final MessageDigest md = MessageDigest.getInstance("MD5");
|
||||
md.update(s.getBytes(StandardCharsets.UTF_8));
|
||||
return new String(Hex.encodeHex(md.digest()));
|
||||
} catch (final Exception e) {
|
||||
System.err.println("Error creating id");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static String createDedupRecordPath(
|
||||
final String basePath, final String actionSetId, final String entityType) {
|
||||
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.text.ParseException;
|
|||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang.NullArgumentException;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -34,17 +35,17 @@ public class IdGenerator implements Serializable {
|
|||
|
||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
||||
+ DedupUtility.md5(bp.get().getOriginalID());
|
||||
+ DHPUtils.md5(bp.get().getOriginalID());
|
||||
} else {
|
||||
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
||||
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
||||
+ DedupUtility.md5(bp.get().getPid().getValue());
|
||||
+ DHPUtils.md5(bp.get().getPid().getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
|
||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
||||
|
||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||
return Lists
|
||||
|
|
|
@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
|
@ -35,7 +37,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
|||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(StructuredProperty pidValue) {
|
||||
public void setPid(StructuredProperty pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
|
@ -91,25 +93,29 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
|||
public int compareTo(Identifier i) {
|
||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||
// alphabetical order of the originalID
|
||||
|
||||
Set<String> lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
||||
Set<String> rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
||||
|
||||
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
||||
if (entityType == EntityType.publication) {
|
||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
|
||||
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
|
||||
return 1;
|
||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
|
||||
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
|
||||
return -1;
|
||||
}
|
||||
if (entityType == EntityType.dataset) {
|
||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)
|
||||
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
|
||||
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
|
||||
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
|
||||
return 1;
|
||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)
|
||||
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
|
||||
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
|
||||
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (this.getDate().compareTo(date) == 0) {// same date
|
||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||
|
||||
if (this.originalID.compareTo(i.originalID) > 0)
|
||||
this.useOriginal = true;
|
||||
|
@ -120,19 +126,14 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
|||
return -this.originalID.compareTo(i.originalID);
|
||||
} else
|
||||
// the minus is because we need to take the older date
|
||||
return -this.getDate().compareTo(date);
|
||||
return -this.getDate().compareTo(i.getDate());
|
||||
} else {
|
||||
return this.getType().compareTo(i.getType());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {
|
||||
|
||||
for (KeyValue cf : collectedFrom) {
|
||||
if (cf.getKey().equals(dsId))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||
return collectedFrom.contains(dsId);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
public enum PidType {
|
||||
|
||||
// from the least to the most important
|
||||
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi;
|
||||
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
|
||||
|
||||
public static PidType classidValueOf(String s) {
|
||||
try {
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.io.Serializable;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
|
@ -36,7 +37,7 @@ public class ConnectedComponent implements Serializable {
|
|||
if (docIds.size() > 1) {
|
||||
final String s = getMin();
|
||||
String prefix = s.split("\\|")[0];
|
||||
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
|
||||
ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s);
|
||||
return ccId;
|
||||
} else {
|
||||
return docIds.iterator().next();
|
||||
|
|
Loading…
Reference in New Issue