forked from D-Net/dnet-hadoop
bug fix and minor changes
This commit is contained in:
parent
a2ac7e52fb
commit
7093355487
|
@ -1,11 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.text.ParseException;
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -18,7 +15,6 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -90,7 +86,7 @@ public class DedupRecordFactory {
|
||||||
T duplicate = t._2();
|
T duplicate = t._2();
|
||||||
|
|
||||||
// prepare the list of pids to use for the id generation
|
// prepare the list of pids to use for the id generation
|
||||||
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
|
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
|
||||||
|
|
||||||
entity.mergeFrom(duplicate);
|
entity.mergeFrom(duplicate);
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
|
|
|
@ -70,17 +70,6 @@ public class DedupUtility {
|
||||||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String md5(final String s) {
|
|
||||||
try {
|
|
||||||
final MessageDigest md = MessageDigest.getInstance("MD5");
|
|
||||||
md.update(s.getBytes(StandardCharsets.UTF_8));
|
|
||||||
return new String(Hex.encodeHex(md.digest()));
|
|
||||||
} catch (final Exception e) {
|
|
||||||
System.err.println("Error creating id");
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String createDedupRecordPath(
|
public static String createDedupRecordPath(
|
||||||
final String basePath, final String actionSetId, final String entityType) {
|
final String basePath, final String actionSetId, final String entityType) {
|
||||||
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import org.apache.commons.lang.NullArgumentException;
|
import org.apache.commons.lang.NullArgumentException;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -34,17 +35,17 @@ public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
||||||
+ DedupUtility.md5(bp.get().getOriginalID());
|
+ DHPUtils.md5(bp.get().getOriginalID());
|
||||||
} else {
|
} else {
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
||||||
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
||||||
+ DedupUtility.md5(bp.get().getPid().getValue());
|
+ DHPUtils.md5(bp.get().getPid().getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||||
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
|
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
||||||
|
|
||||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||||
return Lists
|
return Lists
|
||||||
|
|
|
@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
@ -35,7 +37,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
return pid;
|
return pid;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setPid(StructuredProperty pidValue) {
|
public void setPid(StructuredProperty pid) {
|
||||||
this.pid = pid;
|
this.pid = pid;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,25 +93,29 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
public int compareTo(Identifier i) {
|
public int compareTo(Identifier i) {
|
||||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||||
// alphabetical order of the originalID
|
// alphabetical order of the originalID
|
||||||
|
|
||||||
|
Set<String> lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
||||||
|
Set<String> rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
||||||
|
|
||||||
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
||||||
if (entityType == EntityType.publication) {
|
if (entityType == EntityType.publication) {
|
||||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)
|
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
|
||||||
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
|
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
|
||||||
return 1;
|
return 1;
|
||||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)
|
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
|
||||||
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
|
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (entityType == EntityType.dataset) {
|
if (entityType == EntityType.dataset) {
|
||||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)
|
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
|
||||||
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
|
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
|
||||||
return 1;
|
return 1;
|
||||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)
|
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
|
||||||
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
|
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.getDate().compareTo(date) == 0) {// same date
|
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||||
|
|
||||||
if (this.originalID.compareTo(i.originalID) > 0)
|
if (this.originalID.compareTo(i.originalID) > 0)
|
||||||
this.useOriginal = true;
|
this.useOriginal = true;
|
||||||
|
@ -120,19 +126,14 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
return -this.originalID.compareTo(i.originalID);
|
return -this.originalID.compareTo(i.originalID);
|
||||||
} else
|
} else
|
||||||
// the minus is because we need to take the elder date
|
// the minus is because we need to take the elder date
|
||||||
return -this.getDate().compareTo(date);
|
return -this.getDate().compareTo(i.getDate());
|
||||||
} else {
|
} else {
|
||||||
return this.getType().compareTo(i.getType());
|
return this.getType().compareTo(i.getType());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {
|
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||||
|
return collectedFrom.contains(dsId);
|
||||||
for (KeyValue cf : collectedFrom) {
|
|
||||||
if (cf.getKey().equals(dsId))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
public enum PidType {
|
public enum PidType {
|
||||||
|
|
||||||
// from the less to the more important
|
// from the less to the more important
|
||||||
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi;
|
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
|
||||||
|
|
||||||
public static PidType classidValueOf(String s) {
|
public static PidType classidValueOf(String s) {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.io.Serializable;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||||
|
|
||||||
|
@ -36,7 +37,7 @@ public class ConnectedComponent implements Serializable {
|
||||||
if (docIds.size() > 1) {
|
if (docIds.size() > 1) {
|
||||||
final String s = getMin();
|
final String s = getMin();
|
||||||
String prefix = s.split("\\|")[0];
|
String prefix = s.split("\\|")[0];
|
||||||
ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s);
|
ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s);
|
||||||
return ccId;
|
return ccId;
|
||||||
} else {
|
} else {
|
||||||
return docIds.iterator().next();
|
return docIds.iterator().next();
|
||||||
|
|
Loading…
Reference in New Issue