diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml new file mode 100644 index 0000000..39941f7 --- /dev/null +++ b/dnet-dedup-test/dependency-reduced-pom.xml @@ -0,0 +1,119 @@ + + + + dnet-dedup + eu.dnetlib + 4.0.0-SNAPSHOT + + 4.0.0 + dnet-dedup-test + + + + maven-shade-plugin + 2.4.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + maven-deploy-plugin + 2.7 + + true + + + + maven-compiler-plugin + + 1.8 + 1.8 + + **/*.java + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + junit + junit + 4.9 + test + + + hamcrest-core + org.hamcrest + + + + + org.apache.oozie + oozie-client + 5.1.0 + test + + + json-simple + com.googlecode.json-simple + + + jms + javax.jms + + + slf4j-simple + org.slf4j + + + oozie-fluent-job-api + org.apache.oozie + + + + + + diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 9ca0728..608b536 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT ../pom.xml diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java index 2a38c6b..002186c 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java @@ -3,6 +3,7 @@ package eu.dnetlib; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.utils.PaceUtils; import eu.dnetlib.reporter.SparkBlockProcessor; import eu.dnetlib.reporter.SparkReporter; @@ -16,14 +17,17 @@ import org.apache.spark.sql.SparkSession; import 
org.apache.spark.util.LongAccumulator; import scala.Tuple2; +import java.net.MalformedURLException; import java.net.URL; import java.util.*; import java.util.stream.Collectors; public class SparkLocalTest { - public static void main(String[] args) { + public static void main(String[] args) throws MalformedURLException { + URL r = new URL("http://www.nlr.nl"); + System.out.println(r.getPath()); double startTime = System.currentTimeMillis(); final SparkSession spark = SparkSession @@ -44,7 +48,7 @@ public class SparkLocalTest { //create vertexes of the graph: JavaPairRDD mapDocs = dataRDD.mapToPair(it -> { - MapDocument mapDocument = PaceUtils.asMapDocument(config, it); + MapDocument mapDocument = MapDocumentUtil.asMapDocument(config, it); return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); }); diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index dfa22de..b75f38f 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":".id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" }, @@ -25,11 +26,11 @@ { "name" : "exactMatch", "fields" : [ "country" ] } ], "model" : [ - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, - { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : 
"organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, - { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, - { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : ".organization.metadata.country.classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : ".organization.metadata.legalshortname.value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : ".organization.metadata.legalname.value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : ".organization.metadata.websiteurl.value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".pid[] | select(.qualifier.classid == \"grid\") | .value" } ], "blacklists" : { "legalname" : [] diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 1235dce..7c9c707 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT ../pom.xml @@ -38,10 +38,6 @@ commons-collections commons-collections - - com.googlecode.protobuf-java-format - protobuf-java-format - org.antlr stringtemplate @@ -65,16 +61,23 @@ jackson-databind - - org.codehaus.jackson - jackson-mapper-asl - + org.apache.commons commons-math3 + + com.jayway.jsonpath + json-path + + + + com.arakelian + 
java-jq + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index f252414..1d4172e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,25 +1,23 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.util.PaceException; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.function.BiFunction; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; - -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class DedupConfig implements Config, Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 56995bb..993bfc2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,15 +1,14 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.annotation.JsonIgnore; import 
com.google.common.collect.Lists; import com.google.common.collect.Maps; -import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; -import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.text.Normalizer; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index ddcfaae..d74255f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,17 +1,17 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; + import java.io.IOException; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.GsonBuilder; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.map.ObjectMapper; public class WfConfig implements Serializable { @@ -76,6 +76,9 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. 
*/ private int maxChildren = MAX_CHILDREN; + /** The jq filter (or JsonPath expression, when the JsonPath-based mapper is used) that retrieves the identifier */ + private String idPath = ".id"; + public WfConfig() {} /** @@ -245,6 +248,14 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public String getIdPath() { + return idPath; + } + + public void setIdPath(String idPath) { + this.idPath = idPath; + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java index 62b7d85..d1cf7ea 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.distance.eval; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d2dab04..c15885e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,19 +1,15 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.clustering.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.logging.Log; -import
org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index 620984f..2c047f3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -4,11 +4,12 @@ import java.io.IOException; import java.io.Serializable; import java.util.List; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.map.ObjectMapper; + public class CondDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java index 710bf10..5da1d0e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 01da9c2..3170fa6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -36,6 +36,18 @@ public class BlockProcessor { this.dedupConf = dedupConf; } + + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { +// 
log.info("reducing key: '" + key + "' records: " + q.size()); + //process(q, context); + process(prepare(documents), context); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + public void process(final String key, final Iterable documents, final Reporter context) { final Queue q = prepare(documents); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java new file mode 100644 index 0000000..4fbb87e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -0,0 +1,188 @@ +package eu.dnetlib.pace.util; + +import com.arakelian.jq.ImmutableJqLibrary; +import com.arakelian.jq.ImmutableJqRequest; +import com.arakelian.jq.JqLibrary; +import com.arakelian.jq.JqResponse; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.MapDocument; +import net.minidev.json.JSONArray; +import org.apache.commons.lang.StringUtils; + +import java.io.IOException; +import java.util.*; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +public class MapDocumentUtil { + + private static final JqLibrary library = ImmutableJqLibrary.of(); + private static final ObjectMapper mapper = new ObjectMapper(); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + + public static MapDocument asMapDocument(DedupConfig conf, final String json) { + + MapDocument m = new MapDocument(); + + final 
ImmutableJqRequest.Builder requestBuilder = ImmutableJqRequest.builder() // + .lib(library) // + .input(json); + m.setIdentifier(getStringValue(conf.getWf().getIdPath(), requestBuilder)); + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getStringValue(fdef.getPath(), requestBuilder))); + break; + case URL: + String uv = getStringValue(fdef.getPath(), requestBuilder); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getListValue(fdef.getPath(), requestBuilder) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { + MapDocument m = new MapDocument(); + + m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); + + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getJPathString(fdef.getPath(), json))); + break; + case URL: + String uv = getJPathString(fdef.getPath(), json); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getJPathList(fdef.getPath(), json, fdef.getType()) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), 
fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + private static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath.read(json, path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { + + ((JSONArray) jresult).forEach(it -> { + + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + } + ); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + + + private static String getJPathString(final String jsonPath, final String json) { + Object o = JsonPath.read(json, jsonPath); + + if (o instanceof String) + return (String)o; + if (o instanceof JSONArray && ((JSONArray)o).size()>0) + return (String)((JSONArray)o).get(0); + return ""; + } + + private static String getStringValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) { + final JqResponse response = requestBuilder + .filter(jqPath) + .build() + .execute(); + String output = response.getOutput(); + if (StringUtils.isNotBlank(output)) { + output = output.replaceAll("\"", ""); + } + return output; + + } + + private static List getListValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) { + + + final JqResponse response = requestBuilder + .filter(jqPath) + .build() + .execute(); +// if (response.hasErrors()) +// throw new PaceException(String.format("Error on getting jqPath, xpath:%s, error : %s", jqPath, response.getErrors().toString())); + + List result = new ArrayList<>(); + + final JsonNode root; + try { + root = mapper.readTree(response.getOutput()); + } catch (IOException 
e) { + throw new PaceException("Error on parsing json", e); + } + final Iterator elements = root.elements(); + while (elements.hasNext()) { + result.add(elements.next().toString()); + } + return result; + } + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 9051049..44f9a7e 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,11 +1,21 @@ package eu.dnetlib.pace.config; +import com.arakelian.jq.*; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; + import org.junit.Test; +import java.util.Iterator; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { @@ -54,4 +64,70 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } + + @Test + public void testAsMapDocument() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocument(load, result); + + System.out.println(mapDocument.getFieldMap().get("dateofacceptance").stringValue()); + + } + + + @Test + public void testAsMapDocumentJPath() throws Exception { + + DedupConfig load = 
DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); + + System.out.println(mapDocument.getFieldMap()); + + } + + + @Test + public void testJQ() throws Exception { + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + System.out.println(result); + final JqLibrary library = ImmutableJqLibrary.of(); + final JqRequest request = ImmutableJqRequest.builder() // + .lib(library) // + .input(result) // + .filter("[.entity.result.metadata.author[]]") // + .build(); + final JqResponse response = request.execute(); + ObjectMapper mapper = new ObjectMapper(); + final String output = response.getOutput(); + System.out.println(output); + final JsonNode root = mapper.readTree(output); + + System.out.println("root"+root); + + final Iterator elements = root.elements(); + while (elements.hasNext()){ + System.out.println(elements.next().toString()); + } + + + + } + + + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json new file mode 100644 index 0000000..a24be24 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json @@ -0,0 +1 @@ +{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": 
{"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. 
Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": 
"doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json index 786424a..acb5b3c 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "10", "slidingWindowSize" : "200", + "idPath": ".entity.id", "rootBuilder" : [ "result" ], "includeChildren" : "true" }, @@ -25,10 +26,10 @@ { "name" : "sizeMatch", "fields" : [ "authors" ] } ], "model" : [ - { "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, - { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", 
"ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : ".entity.result.metadata.title[] | select(.qualifier.classid==\"main title\") | .value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.result.metadata.dateofacceptance.value" } , + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "[.entity.result.metadata.author[].fullname]" } ], "blacklists" : { "title" : [ @@ -47,7 +48,8 @@ "^(WHP Cruise Summary Information of section).*$", "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", "^(Measurement of the spin\\-dependent structure function).*" - ] } + ] } , + "synonyms": {} } } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json new file mode 100644 index 0000000..740af7f --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json @@ -0,0 +1,55 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "idPath": "$.entity.id", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : 
"4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.pid", "overrideMatch" : "true" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.dateofacceptance.value", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.author[*].fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. 
de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } , + "synonyms": {} + } + +} diff --git a/pom.xml b/pom.xml index 5721165..d3e8121 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT pom @@ -84,6 +84,16 @@ + + + central + Central Repository + http://repo.maven.apache.org/maven2 + + true + + + @@ -269,10 +279,17 @@ - org.codehaus.jackson - jackson-mapper-asl - 1.9.13 + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + + + org.apache.commons @@ -352,6 +369,18 @@ 5.1.0 + + com.arakelian + java-jq + 0.10.1 + + + com.jayway.jsonpath + json-path + 2.4.0 + + +