From 098c5e2f64524bd5efa469b4621575dc6fc5c756 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:34:12 +0200 Subject: [PATCH 1/6] [maven-release-plugin] prepare release dnet-dedup-3.0.15 --- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index b2b0437..9fb70b0 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 34138cc..4f8aa1d 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 ../pom.xml diff --git a/pom.xml b/pom.xml index 4e9d3fe..9442f1f 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.15-SNAPSHOT + 3.0.15 pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - HEAD + dnet-dedup-3.0.15 From bc7dd4bfa2f313c9635438c7c9ac2f1768422918 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 24 Oct 2019 11:34:19 +0200 Subject: [PATCH 2/6] [maven-release-plugin] prepare for next development iteration --- dnet-dedup-test/pom.xml | 2 +- dnet-pace-core/pom.xml | 2 +- pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 9fb70b0..9ca0728 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.16-SNAPSHOT ../pom.xml diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 4f8aa1d..1235dce 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.16-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 9442f1f..5721165 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.15 + 3.0.16-SNAPSHOT pom @@ -22,7 +22,7 @@ scm:git:https://github.com/dnet-team/dnet-dedup.git - dnet-dedup-3.0.15 + HEAD From dd9c62917036489f8d8726f5d92ee6c785ea2c87 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 2 Dec 2019 11:26:04 +0100 Subject: [PATCH 3/6] added gitignore props --- .gitignore | 58 +++++++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 8ccf55e..486eace 100644 --- a/.gitignore +++ b/.gitignore @@ -1,43 +1,21 @@ -*~ - -# Compiled class file -*.class - -# Log file -*.log - -# BlueJ files -*.ctxt - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - - - -*target - -# Package Files # -*.jar -*.war -*.nar -*.ear -*.zip -*.tar.gz -*.rar - - -*.idea -*.iml - .DS_Store -**/.DS_Store - -.project +.idea +*.iml +*~ .classpath +/*/.classpath +/*/*/.classpath +.metadata +/*/.metadata +/*/*/.metadata +.project +.log .settings -**/.project -**/.classpath -**/.settings - -# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* +/*/*/target +/*/target +/target +/*/*/build +/*/build +/build +spark-warehouse +/dhp-workflows/dhp-graph-mapper/job-override.properties From f4fa838792df00b166d067ab645bc730414f3a6b Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 2 Dec 2019 11:26:48 +0100 Subject: [PATCH 4/6] deleted useless file --- dnet-dedup.ipr | 109 ------------- dnet-dedup.iws | 418 ------------------------------------------------- 2 files changed, 527 deletions(-) delete mode 100644 dnet-dedup.ipr delete mode 100644 dnet-dedup.iws diff --git a/dnet-dedup.ipr b/dnet-dedup.ipr deleted file mode 100644 index dc3974c..0000000 --- a/dnet-dedup.ipr +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dnet-dedup.iws b/dnet-dedup.iws deleted file mode 100644 index 57de9a0..0000000 --- a/dnet-dedup.iws +++ /dev/null @@ -1,418 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From bd79999fb8c892c5880a75da323ac4cdd78375ee Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Thu, 5 Dec 2019 14:14:25 +0100 Subject: [PATCH 5/6] Improved deduplication --- dnet-dedup-test/dependency-reduced-pom.xml | 119 +++++++++++ dnet-dedup-test/pom.xml | 2 +- .../main/java/eu/dnetlib/SparkLocalTest.java | 8 +- .../resources/eu/dnetlib/pace/org.curr.conf | 11 +- dnet-pace-core/pom.xml | 21 +- .../eu/dnetlib/pace/config/DedupConfig.java | 24 +-- .../eu/dnetlib/pace/config/PaceConfig.java | 3 +- .../java/eu/dnetlib/pace/config/WfConfig.java | 23 ++- .../pace/distance/eval/ScoreResult.java | 3 +- .../eu/dnetlib/pace/model/ClusteringDef.java | 14 +- .../java/eu/dnetlib/pace/model/CondDef.java | 3 +- .../java/eu/dnetlib/pace/model/FieldConf.java | 3 +- .../eu/dnetlib/pace/util/BlockProcessor.java | 12 ++ .../eu/dnetlib/pace/util/MapDocumentUtil.java | 188 ++++++++++++++++++ .../eu/dnetlib/pace/config/ConfigTest.java | 78 +++++++- .../eu/dnetlib/pace/config/result.json | 1 + .../dnetlib/pace/config/result.pace.conf.json | 12 +- .../pace/config/result.pace.conf_jpath.json | 55 +++++ pom.xml | 37 +++- 19 files changed, 557 insertions(+), 60 deletions(-) create mode 100644 dnet-dedup-test/dependency-reduced-pom.xml create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml new file mode 100644 index 0000000..39941f7 --- /dev/null +++ b/dnet-dedup-test/dependency-reduced-pom.xml @@ -0,0 +1,119 @@ + + + + dnet-dedup + eu.dnetlib + 4.0.0-SNAPSHOT + + 4.0.0 + dnet-dedup-test + + + + maven-shade-plugin + 2.4.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + maven-deploy-plugin + 2.7 + + true + + + + maven-compiler-plugin + + 1.8 + 1.8 + + **/*.java + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + junit + junit + 4.9 + test + + + hamcrest-core + org.hamcrest + + + + + org.apache.oozie + oozie-client + 5.1.0 + test + + + json-simple + com.googlecode.json-simple + + + jms + javax.jms + + + slf4j-simple + org.slf4j + + + oozie-fluent-job-api + org.apache.oozie + + + + + + diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 9ca0728..608b536 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT ../pom.xml diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java index 2a38c6b..002186c 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java @@ -3,6 +3,7 @@ package eu.dnetlib; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.PaceUtils; import eu.dnetlib.reporter.SparkBlockProcessor; import eu.dnetlib.reporter.SparkReporter; @@ -16,14 +17,17 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import scala.Tuple2; +import java.net.MalformedURLException; import java.net.URL; import java.util.*; import java.util.stream.Collectors; public class SparkLocalTest { - public static void main(String[] args) { + public static void main(String[] args) throws MalformedURLException { + URL r = new URL("http://www.nlr.nl"); + System.out.println(r.getPath()); double startTime = System.currentTimeMillis(); final SparkSession spark = SparkSession @@ -44,7 +48,7 @@ public class SparkLocalTest { //create vertexes of the graph: JavaPairRDD mapDocs = dataRDD.mapToPair(it -> { - MapDocument mapDocument = PaceUtils.asMapDocument(config, it); + MapDocument mapDocument = MapDocumentUtil.asMapDocument(config, it); return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); }); diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index dfa22de..b75f38f 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":".id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" }, @@ -25,11 +26,11 @@ { "name" : "exactMatch", "fields" : [ "country" ] } ], "model" : [ - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, - { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, - { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, - { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : ".organization.metadata.country.classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : ".organization.metadata.legalshortname.value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : ".organization.metadata.legalname.value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : ".organization.metadata.websiteurl.value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".pid[] | select(.qualifier.classid == \"grid\") | .value" } ], "blacklists" : { "legalname" : [] diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 1235dce..7c9c707 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT ../pom.xml @@ -38,10 +38,6 @@ commons-collections commons-collections - - com.googlecode.protobuf-java-format - protobuf-java-format - org.antlr stringtemplate @@ -65,16 +61,23 @@ jackson-databind - - org.codehaus.jackson - jackson-mapper-asl - + org.apache.commons commons-math3 + + com.jayway.jsonpath + json-path + + + + com.arakelian + java-jq + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index f252414..1d4172e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,25 +1,23 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.util.PaceException; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.function.BiFunction; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; - -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class DedupConfig implements Config, Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 56995bb..993bfc2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,15 +1,14 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; -import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.text.Normalizer; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index ddcfaae..d74255f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,17 +1,17 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; + import java.io.IOException; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.GsonBuilder; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.map.ObjectMapper; public class WfConfig implements Serializable { @@ -76,6 +76,9 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; + /** The Jquery path to retrieve the identifier */ + private String idPath = ".id"; + public WfConfig() {} /** @@ -245,6 +248,14 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public String getIdPath() { + return idPath; + } + + public void setIdPath(String idPath) { + this.idPath = idPath; + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java index 62b7d85..d1cf7ea 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.distance.eval; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d2dab04..c15885e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,19 +1,15 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.clustering.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index 620984f..2c047f3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -4,11 +4,12 @@ import java.io.IOException; import java.io.Serializable; import java.util.List; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.map.ObjectMapper; + public class CondDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java index 710bf10..5da1d0e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 01da9c2..3170fa6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -36,6 +36,18 @@ public class BlockProcessor { this.dedupConf = dedupConf; } + + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { +// log.info("reducing key: '" + key + "' records: " + q.size()); + //process(q, context); + process(prepare(documents), context); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + public void process(final String key, final Iterable documents, final Reporter context) { final Queue q = prepare(documents); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java new file mode 100644 index 0000000..4fbb87e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -0,0 +1,188 @@ +package eu.dnetlib.pace.util; + +import com.arakelian.jq.ImmutableJqLibrary; +import com.arakelian.jq.ImmutableJqRequest; +import com.arakelian.jq.JqLibrary; +import com.arakelian.jq.JqResponse; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.MapDocument; +import net.minidev.json.JSONArray; +import org.apache.commons.lang.StringUtils; + +import java.io.IOException; +import java.util.*; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +public class MapDocumentUtil { + + private static final JqLibrary library = ImmutableJqLibrary.of(); + private static final ObjectMapper mapper = new ObjectMapper(); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + + public static MapDocument asMapDocument(DedupConfig conf, final String json) { + + MapDocument m = new MapDocument(); + + final ImmutableJqRequest.Builder requestBuilder = ImmutableJqRequest.builder() // + .lib(library) // + .input(json); + m.setIdentifier(getStringValue(conf.getWf().getIdPath(), requestBuilder)); + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getStringValue(fdef.getPath(), requestBuilder))); + break; + case URL: + String uv = getStringValue(fdef.getPath(), requestBuilder); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getListValue(fdef.getPath(), requestBuilder) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { + MapDocument m = new MapDocument(); + + m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); + + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getJPathString(fdef.getPath(), json))); + break; + case URL: + String uv = getJPathString(fdef.getPath(), json); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getJPathList(fdef.getPath(), json, fdef.getType()) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + private static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath.read(json, path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { + + ((JSONArray) jresult).forEach(it -> { + + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + } + ); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + + + private static String getJPathString(final String jsonPath, final String json) { + Object o = JsonPath.read(json, jsonPath); + + if (o instanceof String) + return (String)o; + if (o instanceof JSONArray && ((JSONArray)o).size()>0) + return (String)((JSONArray)o).get(0); + return ""; + } + + private static String getStringValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) { + final JqResponse response = requestBuilder + .filter(jqPath) + .build() + .execute(); + String output = response.getOutput(); + if (StringUtils.isNotBlank(output)) { + output = output.replaceAll("\"", ""); + } + return output; + + } + + private static List getListValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) { + + + final JqResponse response = requestBuilder + .filter(jqPath) + .build() + .execute(); +// if (response.hasErrors()) +// throw new PaceException(String.format("Error on getting jqPath, xpath:%s, error : %s", jqPath, response.getErrors().toString())); + + List result = new ArrayList<>(); + + final JsonNode root; + try { + root = mapper.readTree(response.getOutput()); + } catch (IOException e) { + throw new PaceException("Error on parsing json", e); + } + final Iterator elements = root.elements(); + while (elements.hasNext()) { + result.add(elements.next().toString()); + } + return result; + } + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 9051049..44f9a7e 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,11 +1,21 @@ package eu.dnetlib.pace.config; +import com.arakelian.jq.*; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; + import org.junit.Test; +import java.util.Iterator; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { @@ -54,4 +64,70 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } + + @Test + public void testAsMapDocument() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocument(load, result); + + System.out.println(mapDocument.getFieldMap().get("dateofacceptance").stringValue()); + + } + + + @Test + public void testAsMapDocumentJPath() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); + + System.out.println(mapDocument.getFieldMap()); + + } + + + @Test + public void testJQ() throws Exception { + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + System.out.println(result); + final JqLibrary library = ImmutableJqLibrary.of(); + final JqRequest request = ImmutableJqRequest.builder() // + .lib(library) // + .input(result) // + .filter("[.entity.result.metadata.author[]]") // + .build(); + final JqResponse response = request.execute(); + ObjectMapper mapper = new ObjectMapper(); + final String output = response.getOutput(); + System.out.println(output); + final JsonNode root = mapper.readTree(output); + + System.out.println("root"+root); + + final Iterator elements = root.elements(); + while (elements.hasNext()){ + System.out.println(elements.next().toString()); + } + + + + } + + + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json new file mode 100644 index 0000000..a24be24 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json @@ -0,0 +1 @@ +{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": "doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json index 786424a..acb5b3c 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "10", "slidingWindowSize" : "200", + "idPath": ".entity.id", "rootBuilder" : [ "result" ], "includeChildren" : "true" }, @@ -25,10 +26,10 @@ { "name" : "sizeMatch", "fields" : [ "authors" ] } ], "model" : [ - { "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, - { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : ".entity.result.metadata.title[] | select(.qualifier.classid==\"main title\") | .value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.result.metadata.dateofacceptance.value" } , + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "[.entity.result.metadata.author[].fullname]" } ], "blacklists" : { "title" : [ @@ -47,7 +48,8 @@ "^(WHP Cruise Summary Information of section).*$", "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", "^(Measurement of the spin\\-dependent structure function).*" - ] } + ] } , + "synonyms": {} } } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json new file mode 100644 index 0000000..740af7f --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json @@ -0,0 +1,55 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "idPath": "$.entity.id", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.pid", "overrideMatch" : "true" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.dateofacceptance.value", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.author[*].fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } , + "synonyms": {} + } + +} diff --git a/pom.xml b/pom.xml index 5721165..d3e8121 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT pom @@ -84,6 +84,16 @@ + + + central + Central Repository + http://repo.maven.apache.org/maven2 + + true + + + @@ -269,10 +279,17 @@ - org.codehaus.jackson - jackson-mapper-asl - 1.9.13 + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + + + org.apache.commons @@ -352,6 +369,18 @@ 5.1.0 + + com.arakelian + java-jq + 0.10.1 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + From 42ffbec0612ecaddc3d0441bdd81bbc5af6f6f9f Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Fri, 6 Dec 2019 15:28:30 +0100 Subject: [PATCH 6/6] fix stuff --- .../src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 4fbb87e..27c75dd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -97,7 +97,7 @@ public class MapDocumentUtil { return m; } - private static List getJPathList(String path, String json, Type type) { + public static List getJPathList(String path, String json, Type type) { if (type == Type.List) return JsonPath.read(json, path); Object jresult; @@ -136,7 +136,7 @@ public class MapDocumentUtil { } - private static String getJPathString(final String jsonPath, final String json) { + public static String getJPathString(final String jsonPath, final String json) { Object o = JsonPath.read(json, jsonPath); if (o instanceof String)