From 70933554874d5a69f4652ef621453e5930259028 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 6 Oct 2020 16:21:34 +0200 Subject: [PATCH] bug fix and minor changes --- .../dhp/oa/dedup/DedupRecordFactory.java | 6 +-- .../eu/dnetlib/dhp/oa/dedup/DedupUtility.java | 11 ------ .../eu/dnetlib/dhp/oa/dedup/IdGenerator.java | 7 ++-- .../eu/dnetlib/dhp/oa/dedup/Identifier.java | 37 ++++++++++--------- .../java/eu/dnetlib/dhp/oa/dedup/PidType.java | 2 +- .../oa/dedup/graph/ConnectedComponent.java | 3 +- 6 files changed, 27 insertions(+), 39 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 50dda887b6..fd37b75ee1 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,11 +1,8 @@ package eu.dnetlib.dhp.oa.dedup; -import java.text.ParseException; -import java.text.SimpleDateFormat; import java.util.*; -import org.apache.commons.lang.StringUtils; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; @@ -18,7 +15,6 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; -import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -90,7 +86,7 @@ public class DedupRecordFactory { T duplicate = t._2(); // prepare the list of pids to use for the id generation - bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate)); + bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate)); entity.mergeFrom(duplicate); if (ModelSupport.isSubClass(duplicate, Result.class)) { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index 01065510ae..a44d51af38 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -70,17 +70,6 @@ public class DedupUtility { return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); } - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes(StandardCharsets.UTF_8)); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } - public static String createDedupRecordPath( final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java index 2916e063d2..b2b81f4cb4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java @@ -6,6 +6,7 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; +import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang.NullArgumentException; import org.apache.commons.lang.StringUtils; @@ -34,17 +35,17 @@ public class IdGenerator implements Serializable { if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) { return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" - + DedupUtility.md5(bp.get().getOriginalID()); + + DHPUtils.md5(bp.get().getOriginalID()); } else { return bp.get().getOriginalID().split("\\|")[0] + "|" + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" - + DedupUtility.md5(bp.get().getPid().getValue()); + + DHPUtils.md5(bp.get().getPid().getValue()); } } // pick the best pid from the entity. Returns a list (length 1) to save time in the call - public static List bestPidtoIdentifier(T entity) { + public static List bestPidToIdentifier(T entity) { if (entity.getPid() == null || entity.getPid().size() == 0) return Lists diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java index 480b523412..65441c5859 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Identifier.java @@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.Serializable; import java.util.Date; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.oaf.KeyValue; @@ -35,7 +37,7 @@ public class Identifier implements Serializable, Comparable { return pid; } - public void setPid(StructuredProperty pidValue) { + public void setPid(StructuredProperty pid) { this.pid = pid; } @@ -91,25 +93,29 @@ public class Identifier implements Serializable, Comparable { public int compareTo(Identifier i) { // priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) // alphabetical order of the originalID + + Set lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet()); + Set rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet()); + if (this.getType().compareTo(i.getType()) == 0) { // same type if (entityType == EntityType.publication) { - if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) - && !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)) + if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID) + && !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)) return 1; - if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) - && !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)) + if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID) + && !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)) return -1; } if (entityType == EntityType.dataset) { - if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) - && !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)) + if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID) + && !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)) return 1; - if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) - && !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)) + if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID) + && !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)) return -1; } - if (this.getDate().compareTo(date) == 0) {// same date + if (this.getDate().compareTo(i.getDate()) == 0) {// same date if (this.originalID.compareTo(i.originalID) > 0) this.useOriginal = true; @@ -120,19 +126,14 @@ public class Identifier implements Serializable, Comparable { return -this.originalID.compareTo(i.originalID); } else // the minus is because we need to take the elder date - return -this.getDate().compareTo(date); + return -this.getDate().compareTo(i.getDate()); } else { return this.getType().compareTo(i.getType()); } } - public boolean isFromDatasourceID(List collectedFrom, String dsId) { - - for (KeyValue cf : collectedFrom) { - if (cf.getKey().equals(dsId)) - return true; - } - return false; + public boolean isFromDatasourceID(Set collectedFrom, String dsId) { + return collectedFrom.contains(dsId); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java index c3241bac64..d644e689b3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/PidType.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.oa.dedup; public enum PidType { // from the less to the more important - undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi; + undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi; public static PidType classidValueOf(String s) { try { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index cd4f99f634..3d0d24d238 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -6,6 +6,7 @@ import java.io.Serializable; import java.util.Set; import java.util.stream.Collectors; +import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; @@ -36,7 +37,7 @@ public class ConnectedComponent implements Serializable { if (docIds.size() > 1) { final String s = getMin(); String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); + ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s); return ccId; } else { return docIds.iterator().next();