From 16c670a5d5dd2bff134b7ef051472e7c270db497 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Thu, 5 Dec 2019 14:14:25 +0100 Subject: [PATCH] Improved deduplication --- dnet-pace-core/pom.xml | 21 +- .../eu/dnetlib/pace/config/DedupConfig.java | 24 +-- .../eu/dnetlib/pace/config/PaceConfig.java | 3 +- .../java/eu/dnetlib/pace/config/WfConfig.java | 23 ++- .../pace/distance/eval/ScoreResult.java | 3 +- .../eu/dnetlib/pace/model/ClusteringDef.java | 14 +- .../java/eu/dnetlib/pace/model/CondDef.java | 3 +- .../java/eu/dnetlib/pace/model/FieldConf.java | 3 +- .../eu/dnetlib/pace/util/BlockProcessor.java | 12 ++ .../eu/dnetlib/pace/util/MapDocumentUtil.java | 188 ++++++++++++++++++ .../eu/dnetlib/pace/config/ConfigTest.java | 78 +++++++- .../eu/dnetlib/pace/config/result.json | 1 + .../dnetlib/pace/config/result.pace.conf.json | 12 +- .../pace/config/result.pace.conf_jpath.json | 55 +++++ 14 files changed, 392 insertions(+), 48 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index 1235dcef7..7c9c70797 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib dnet-dedup - 3.0.16-SNAPSHOT + 4.0.0-SNAPSHOT ../pom.xml @@ -38,10 +38,6 @@ commons-collections commons-collections - - com.googlecode.protobuf-java-format - protobuf-java-format - org.antlr stringtemplate @@ -65,16 +61,23 @@ jackson-databind - - org.codehaus.jackson - jackson-mapper-asl - + org.apache.commons commons-math3 + + com.jayway.jsonpath + json-path + + + + com.arakelian + java-jq + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index f252414dd..1d4172efe 
100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,25 +1,23 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.util.PaceException; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.function.BiFunction; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; - -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class DedupConfig implements Config, Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 56995bb6a..993bfc23c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,15 +1,14 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.condition.ConditionAlgo; import 
eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.util.PaceResolver; import org.apache.commons.collections.CollectionUtils; -import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.text.Normalizer; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index ddcfaaece..d74255f51 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,17 +1,17 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; + import java.io.IOException; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.GsonBuilder; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.map.ObjectMapper; public class WfConfig implements Serializable { @@ -76,6 +76,9 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. 
*/ private int maxChildren = MAX_CHILDREN; + /** The jq or JSONPath expression used to retrieve the identifier */ + private String idPath = ".id"; + public WfConfig() {} /** @@ -245,6 +248,14 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public String getIdPath() { + return idPath; + } + + public void setIdPath(String idPath) { + this.idPath = idPath; + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java index 62b7d85b4..d1cf7ea42 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.distance.eval; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d2dab04cd..c15885ecf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,19 +1,15 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.clustering.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.logging.Log; -import
org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index 620984f08..2c047f3ac 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -4,11 +4,12 @@ import java.io.IOException; import java.io.Serializable; import java.util.List; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.map.ObjectMapper; + public class CondDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java index 710bf10f8..5da1d0e6b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldConf.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 01da9c227..3170fa63f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -36,6 +36,18 @@ public class BlockProcessor { this.dedupConf = dedupConf; } + + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { +// 
log.info("reducing key: '" + key + "' records: " + q.size()); + //process(q, context); + process(prepare(documents), context); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + public void process(final String key, final Iterable documents, final Reporter context) { final Queue q = prepare(documents); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java new file mode 100644 index 000000000..4fbb87ee0 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -0,0 +1,232 @@
+package eu.dnetlib.pace.util;
+
+import com.arakelian.jq.ImmutableJqLibrary;
+import com.arakelian.jq.ImmutableJqRequest;
+import com.arakelian.jq.JqLibrary;
+import com.arakelian.jq.JqResponse;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.PathNotFoundException;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import net.minidev.json.JSONArray;
+import org.apache.commons.lang.StringUtils;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Builds {@link MapDocument}s from JSON records using the paths declared in a {@link DedupConfig}.
+ * Paths are evaluated either as jq filters ({@link #asMapDocument}) or as JSONPath
+ * expressions ({@link #asMapDocumentWithJPath}).
+ */
+public class MapDocumentUtil {
+
+    private static final JqLibrary library = ImmutableJqLibrary.of();
+    private static final ObjectMapper mapper = new ObjectMapper();
+    public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
+    // Compiled once; String.matches would recompile the expression for every value.
+    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);
+    /** Accepts http, https and ftp URLs; rejects null so that a missing value counts as "not a URL". */
+    public static Predicate<String> urlFilter = s -> s != null && URL_PATTERN.matcher(s.trim()).matches();
+
+    /**
+     * Creates a document by evaluating the configured paths as jq filters.
+     *
+     * @param conf dedup configuration providing the identifier path and the field model
+     * @param json the record, as a JSON string
+     * @return the document holding the identifier and one field per mapped model entry
+     */
+    public static MapDocument asMapDocument(DedupConfig conf, final String json) {
+        final ImmutableJqRequest.Builder requestBuilder = ImmutableJqRequest.builder()
+                .lib(library)
+                .input(json);
+        return buildDocument(conf,
+                path -> getStringValue(path, requestBuilder),
+                (path, type) -> getListValue(path, requestBuilder));
+    }
+
+    /**
+     * Creates a document by evaluating the configured paths as JSONPath expressions.
+     *
+     * @param conf dedup configuration providing the identifier path and the field model
+     * @param json the record, as a JSON string
+     * @return the document holding the identifier and one field per mapped model entry
+     */
+    public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
+        return buildDocument(conf,
+                path -> getJPathString(path, json),
+                (path, type) -> getJPathList(path, json, type));
+    }
+
+    /** Shared document assembly; the two readers hide which path language is in use. */
+    private static MapDocument buildDocument(final DedupConfig conf,
+            final Function<String, String> readString,
+            final BiFunction<String, Type, List<String>> readList) {
+        final MapDocument doc = new MapDocument();
+        doc.setIdentifier(readString.apply(conf.getWf().getIdPath()));
+
+        final Map<String, Field> fields = new HashMap<>();
+        conf.getPace().getModel().forEach(fdef -> {
+            switch (fdef.getType()) {
+            case String:
+            case Int:
+                fields.put(fdef.getName(),
+                        new FieldValueImpl(fdef.getType(), fdef.getName(), readString.apply(fdef.getPath())));
+                break;
+            case URL:
+                final String url = readString.apply(fdef.getPath());
+                // values that do not look like a URL are stored as the empty string
+                fields.put(fdef.getName(),
+                        new FieldValueImpl(fdef.getType(), fdef.getName(), urlFilter.test(url) ? url : ""));
+                break;
+            case List:
+            case JSON:
+                final FieldListImpl values = new FieldListImpl(fdef.getName(), fdef.getType());
+                readList.apply(fdef.getPath(), fdef.getType())
+                        .stream()
+                        .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item))
+                        .forEach(values::add);
+                fields.put(fdef.getName(), values);
+                break;
+            default:
+                // any other field type is not mapped
+                break;
+            }
+        });
+        doc.setFieldMap(fields);
+        return doc;
+    }
+
+    /**
+     * Reads a multi-valued field through JSONPath.
+     * <p>
+     * For {@code Type.List} string elements are returned as they are; every other element
+     * (and every element of a {@code Type.JSON} field) is returned as its JSON serialization.
+     * A path that cannot be evaluated yields an empty list.
+     */
+    private static List<String> getJPathList(String path, String json, Type type) {
+        final List<String> result = new ArrayList<>();
+        final Object jresult;
+        try {
+            jresult = JsonPath.read(json, path);
+        } catch (RuntimeException e) {
+            // best effort for both field types: a path that cannot be read contributes no values
+            return result;
+        }
+        if (jresult instanceof List) {
+            for (final Object item : (List<?>) jresult) {
+                if (type == Type.List && (item == null || item instanceof String)) {
+                    result.add((String) item);
+                } else {
+                    result.add(toJson(item));
+                }
+            }
+        } else if (jresult instanceof Map) {
+            result.add(toJson(jresult));
+        } else if (jresult instanceof String) {
+            result.add((String) jresult);
+        }
+        return result;
+    }
+
+    /** Serializes a JSONPath result with the shared mapper; a failure is reported, not swallowed. */
+    private static String toJson(final Object value) {
+        try {
+            return mapper.writeValueAsString(value);
+        } catch (JsonProcessingException e) {
+            throw new PaceException("Error on serializing json", e);
+        }
+    }
+
+    /**
+     * Reads a single-valued field through JSONPath.
+     *
+     * @return the string, number or boolean found at the path, or the first element when the
+     *         path selects an array; the empty string when nothing usable is found
+     */
+    private static String getJPathString(final String jsonPath, final String json) {
+        final Object o;
+        try {
+            o = JsonPath.read(json, jsonPath);
+        } catch (PathNotFoundException e) {
+            // the record does not contain this path
+            return "";
+        }
+        if (o instanceof List) {
+            final List<?> items = (List<?>) o;
+            return items.isEmpty() ? "" : scalarToString(items.get(0));
+        }
+        return scalarToString(o);
+    }
+
+    /** Strings are returned as they are, numbers and booleans in textual form, anything else as "". */
+    private static String scalarToString(final Object value) {
+        if (value instanceof String) {
+            return (String) value;
+        }
+        if (value instanceof Number || value instanceof Boolean) {
+            return value.toString();
+        }
+        return "";
+    }
+
+    /**
+     * Reads a single-valued field through jq.
+     * <p>
+     * NOTE(review): every double quote is removed from the raw jq output, which also strips
+     * quotes that belong to the value itself; decoding the output as JSON may be preferable.
+     */
+    private static String getStringValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) {
+        final String output = requestBuilder
+                .filter(jqPath)
+                .build()
+                .execute()
+                .getOutput();
+        // literal replacement: no regular expression is needed to drop a fixed character
+        return StringUtils.isNotBlank(output) ? output.replace("\"", "") : output;
+    }
+
+    /**
+     * Reads a multi-valued field through jq.
+     * <p>
+     * NOTE(review): errors reported by jq are not inspected here; only its output is used.
+     *
+     * @return the JSON text of each element of the value produced by the filter; empty when jq produced no output
+     * @throws PaceException when the jq output is not valid JSON
+     */
+    private static List<String> getListValue(final String jqPath, final ImmutableJqRequest.Builder requestBuilder) {
+        final String output = requestBuilder
+                .filter(jqPath)
+                .build()
+                .execute()
+                .getOutput();
+
+        final List<String> result = new ArrayList<>();
+        if (StringUtils.isBlank(output)) {
+            // checked up front so the parser is never handed null or empty content
+            return result;
+        }
+        final JsonNode root;
+        try {
+            root = mapper.readTree(output);
+        } catch (IOException e) {
+            throw new PaceException("Error on parsing json", e);
+        }
+        if (root != null) {
+            root.elements().forEachRemaining(node -> result.add(node.toString()));
+        }
+        return result;
+    }
+}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 9051049fb..44f9a7e02 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,11 +1,21 @@ package eu.dnetlib.pace.config; +import com.arakelian.jq.*; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; + import org.junit.Test; +import java.util.Iterator; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; public class ConfigTest extends AbstractPaceTest { @@ -54,4 +64,70 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } + + @Test + public void testAsMapDocument() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocument(load, result); + + System.out.println(mapDocument.getFieldMap().get("dateofacceptance").stringValue()); + + } + + + @Test + public void testAsMapDocumentJPath() throws Exception { + + DedupConfig load =
DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); + + System.out.println(mapDocument.getFieldMap()); + + } + + + @Test + public void testJQ() throws Exception { + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + System.out.println(result); + final JqLibrary library = ImmutableJqLibrary.of(); + final JqRequest request = ImmutableJqRequest.builder() // + .lib(library) // + .input(result) // + .filter("[.entity.result.metadata.author[]]") // + .build(); + final JqResponse response = request.execute(); + ObjectMapper mapper = new ObjectMapper(); + final String output = response.getOutput(); + System.out.println(output); + final JsonNode root = mapper.readTree(output); + + System.out.println("root"+root); + + final Iterator elements = root.elements(); + while (elements.hasNext()){ + System.out.println(elements.next().toString()); + } + + + + } + + + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json new file mode 100644 index 000000000..a24be241f --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json @@ -0,0 +1 @@ +{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": 
{"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. 
Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": 
"doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json index 786424a34..acb5b3c7c 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json @@ -7,6 +7,7 @@ "queueMaxSize" : "2000", "groupMaxSize" : "10", "slidingWindowSize" : "200", + "idPath": ".entity.id", "rootBuilder" : [ "result" ], "includeChildren" : "true" }, @@ -25,10 +26,10 @@ { "name" : "sizeMatch", "fields" : [ "authors" ] } ], "model" : [ - { "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, - { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", 
"ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : ".entity.result.metadata.title[] | select(.qualifier.classid==\"main title\") | .value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : ".entity.result.metadata.dateofacceptance.value" } , + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "[.entity.result.metadata.author[].fullname]" } ], "blacklists" : { "title" : [ @@ -47,7 +48,8 @@ "^(WHP Cruise Summary Information of section).*$", "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", "^(Measurement of the spin\\-dependent structure function).*" - ] } + ] } , + "synonyms": {} } } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json new file mode 100644 index 000000000..740af7f0f --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json @@ -0,0 +1,55 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "idPath": "$.entity.id", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", 
"maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.pid", "overrideMatch" : "true" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.dateofacceptance.value", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.entity.result.metadata.author[*].fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } , + "synonyms": {} + } + +}