Dedup ID creation policy #48

Manually merged
claudio.atzori merged 13 commits from deduptesting into stable_ids 2020-10-30 15:15:32 +01:00
6 changed files with 27 additions and 39 deletions
Showing only changes of commit 7093355487 - Show all commits

View File

@ -1,11 +1,8 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -18,7 +15,6 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2; import scala.Tuple2;
@ -90,7 +86,7 @@ public class DedupRecordFactory {
T duplicate = t._2(); T duplicate = t._2();
// prepare the list of pids to use for the id generation // prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate)); bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
entity.mergeFrom(duplicate); entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {

View File

@ -70,17 +70,6 @@ public class DedupUtility {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
} }
/**
 * Computes the MD5 digest of a string and returns it as 32 lowercase hexadecimal characters.
 *
 * <p>The input is encoded as UTF-8 before hashing.
 *
 * @param s the text to hash; may be {@code null}
 * @return the hex-encoded digest, or {@code null} when {@code s} is {@code null}
 * @throws IllegalStateException if the runtime provides no MD5 implementation
 */
public static String md5(final String s) {
    if (s == null) {
        // Existing contract kept for callers: a null input is reported on stderr and
        // answered with null instead of an exception.
        System.err.println("Error creating id");
        return null;
    }
    try {
        final MessageDigest md = MessageDigest.getInstance("MD5");
        final byte[] digest = md.digest(s.getBytes(StandardCharsets.UTF_8));
        // Signum 1 reads the digest as an unsigned number; %032x restores the leading
        // zeros that the numeric conversion would otherwise drop.
        return String.format("%032x", new java.math.BigInteger(1, digest));
    } catch (final java.security.NoSuchAlgorithmException e) {
        // Fail loudly and keep the cause: returning null here would let callers build
        // identifiers from the literal text "null".
        throw new IllegalStateException("MD5 message digest is not available", e);
    }
}
public static String createDedupRecordPath( public static String createDedupRecordPath(
final String basePath, final String actionSetId, final String entityType) { final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);

View File

@ -6,6 +6,7 @@ import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang.NullArgumentException; import org.apache.commons.lang.NullArgumentException;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -34,17 +35,17 @@ public class IdGenerator implements Serializable {
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) { if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
+ DedupUtility.md5(bp.get().getOriginalID()); + DHPUtils.md5(bp.get().getOriginalID());
} else { } else {
return bp.get().getOriginalID().split("\\|")[0] + "|" return bp.get().getOriginalID().split("\\|")[0] + "|"
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
+ DedupUtility.md5(bp.get().getPid().getValue()); + DHPUtils.md5(bp.get().getPid().getValue());
} }
} }
// pick the best pid from the entity. Returns a list (length 1) to save time in the call // pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) { public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
if (entity.getPid() == null || entity.getPid().size() == 0) if (entity.getPid() == null || entity.getPid().size() == 0)
return Lists return Lists

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.dedup;
import java.io.Serializable; import java.io.Serializable;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
@ -35,7 +37,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
return pid; return pid;
} }
public void setPid(StructuredProperty pidValue) { public void setPid(StructuredProperty pid) {
this.pid = pid; this.pid = pid;
} }
@ -91,25 +93,29 @@ public class Identifier implements Serializable, Comparable<Identifier> {
public int compareTo(Identifier i) { public int compareTo(Identifier i) {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) // priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID // alphabetical order of the originalID
Set<String> lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
Set<String> rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
if (this.getType().compareTo(i.getType()) == 0) { // same type if (this.getType().compareTo(i.getType()) == 0) { // same type
if (entityType == EntityType.publication) { if (entityType == EntityType.publication) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)) && !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
return 1; return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)) && !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
return -1; return -1;
} }
if (entityType == EntityType.dataset) { if (entityType == EntityType.dataset) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)) && !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
return 1; return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)) && !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
return -1; return -1;
} }
if (this.getDate().compareTo(date) == 0) {// same date if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) > 0) if (this.originalID.compareTo(i.originalID) > 0)
this.useOriginal = true; this.useOriginal = true;
@ -120,19 +126,14 @@ public class Identifier implements Serializable, Comparable<Identifier> {
return -this.originalID.compareTo(i.originalID); return -this.originalID.compareTo(i.originalID);
} else } else
// the minus is because we need to take the elder date // the minus is because we need to take the elder date
return -this.getDate().compareTo(date); return -this.getDate().compareTo(i.getDate());
} else { } else {
return this.getType().compareTo(i.getType()); return this.getType().compareTo(i.getType());
} }
} }
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) { public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId);
for (KeyValue cf : collectedFrom) {
if (cf.getKey().equals(dsId))
return true;
}
return false;
} }
} }

View File

@ -4,7 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
public enum PidType { public enum PidType {
// from the less to the more important // from the less to the more important
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi; undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
public static PidType classidValueOf(String s) { public static PidType classidValueOf(String s) {
try { try {

View File

@ -6,6 +6,7 @@ import java.io.Serializable;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.annotate.JsonIgnore;
@ -36,7 +37,7 @@ public class ConnectedComponent implements Serializable {
if (docIds.size() > 1) { if (docIds.size() > 1) {
final String s = getMin(); final String s = getMin();
String prefix = s.split("\\|")[0]; String prefix = s.split("\\|")[0];
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s);
return ccId; return ccId;
} else { } else {
return docIds.iterator().next(); return docIds.iterator().next();