Dedup ID creation policy #48

Manually merged
claudio.atzori merged 13 commits from deduptesting into stable_ids 2020-10-30 15:15:32 +01:00
6 changed files with 27 additions and 39 deletions
Showing only changes of commit 7093355487 - Show all commits

View File

@ -1,11 +1,8 @@
package eu.dnetlib.dhp.oa.dedup;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
@ -18,7 +15,6 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
@ -90,7 +86,7 @@ public class DedupRecordFactory {
T duplicate = t._2();
// prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) {

View File

@ -70,17 +70,6 @@ public class DedupUtility {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
}
/**
 * Computes the MD5 digest of the given string and renders it as a hexadecimal string.
 *
 * The input is encoded as UTF-8 before hashing. This is a best-effort helper: any
 * exception raised while hashing (including one caused by a null input) is caught,
 * a message is written to standard error, and {@code null} is returned.
 *
 * @param s the text to hash
 * @return the hex-encoded MD5 digest of {@code s}, or {@code null} if hashing failed
 */
public static String md5(final String s) {
    String hexDigest;
    try {
        final MessageDigest digester = MessageDigest.getInstance("MD5");
        final byte[] utf8Bytes = s.getBytes(StandardCharsets.UTF_8);
        // digest(byte[]) is update(byte[]) followed by digest()
        final char[] hexChars = Hex.encodeHex(digester.digest(utf8Bytes));
        hexDigest = String.valueOf(hexChars);
    } catch (final Exception e) {
        // callers receive null rather than an exception when the id cannot be built
        System.err.println("Error creating id");
        hexDigest = null;
    }
    return hexDigest;
}
public static String createDedupRecordPath(
final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);

View File

@ -6,6 +6,7 @@ import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang.NullArgumentException;
import org.apache.commons.lang.StringUtils;
@ -34,17 +35,17 @@ public class IdGenerator implements Serializable {
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
+ DedupUtility.md5(bp.get().getOriginalID());
+ DHPUtils.md5(bp.get().getOriginalID());
} else {
return bp.get().getOriginalID().split("\\|")[0] + "|"
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
+ DedupUtility.md5(bp.get().getPid().getValue());
+ DHPUtils.md5(bp.get().getPid().getValue());
}
}
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
if (entity.getPid() == null || entity.getPid().size() == 0)
return Lists

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.Serializable;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
@ -35,7 +37,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
return pid;
}
public void setPid(StructuredProperty pidValue) {
public void setPid(StructuredProperty pid) {
this.pid = pid;
}
@ -91,25 +93,29 @@ public class Identifier implements Serializable, Comparable<Identifier> {
public int compareTo(Identifier i) {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type), 3) date,
// 4) alphabetical order of the originalID
Set<String> lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
Set<String> rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
if (this.getType().compareTo(i.getType()) == 0) { // same type
if (entityType == EntityType.publication) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
return -1;
}
if (entityType == EntityType.dataset) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
return -1;
}
if (this.getDate().compareTo(date) == 0) {// same date
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) > 0)
this.useOriginal = true;
@ -120,19 +126,14 @@ public class Identifier implements Serializable, Comparable<Identifier> {
return -this.originalID.compareTo(i.originalID);
} else
// the minus sign is needed because we must prefer the older date
return -this.getDate().compareTo(date);
return -this.getDate().compareTo(i.getDate());
} else {
return this.getType().compareTo(i.getType());
}
}
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {
for (KeyValue cf : collectedFrom) {
if (cf.getKey().equals(dsId))
return true;
}
return false;
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId);
}
}

View File

@ -4,7 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
public enum PidType {
// from the least to the most important
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi;
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
public static PidType classidValueOf(String s) {
try {

View File

@ -6,6 +6,7 @@ import java.io.Serializable;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
@ -36,7 +37,7 @@ public class ConnectedComponent implements Serializable {
if (docIds.size() > 1) {
final String s = getMin();
String prefix = s.split("\\|")[0];
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s);
return ccId;
} else {
return docIds.iterator().next();