diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index a34ed9f63..5c6466dd8 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -38,10 +38,6 @@ commons-collections commons-collections - - com.googlecode.protobuf-java-format - protobuf-java-format - org.antlr stringtemplate @@ -59,22 +55,22 @@ org.reflections reflections - com.fasterxml.jackson.core jackson-databind - - - org.codehaus.jackson - jackson-mapper-asl - - org.apache.commons commons-math3 + + com.jayway.jsonpath + json-path + + + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index bcfa5a108..261e13bf5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,25 +1,25 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.util.PaceException; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.function.BiFunction; + import eu.dnetlib.pace.tree.support.TreeNodeDef; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; -import com.google.common.collect.Maps; - -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class DedupConfig implements Config, Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 7a87a82a5..33971107e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,12 +1,13 @@ package eu.dnetlib.pace.config; + +import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Maps; import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.util.List; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index d2722ac53..a79d234d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,17 +1,17 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.util.PaceException; +import org.apache.commons.lang.StringUtils; + import java.io.IOException; import java.io.Serializable; import java.util.HashSet; import java.util.List; import java.util.Set; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.GsonBuilder; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.map.ObjectMapper; public class WfConfig implements Serializable { @@ -76,12 +76,17 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; + /** Default maximum number of iterations. */ private final static int MAX_ITERATIONS = 20; /** Maximum number of iterations */ private int maxIterations = MAX_ITERATIONS; + /** The Jquery path to retrieve the identifier */ + private String idPath = "$.id"; + + public WfConfig() {} /** @@ -252,6 +257,7 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public int getMaxIterations() { return maxIterations; } @@ -260,6 +266,15 @@ public class WfConfig implements Serializable { this.maxIterations = maxIterations; } + public String getIdPath() { + return idPath; + } + + public void setIdPath(String idPath) { + this.idPath = idPath; + + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d2dab04cd..c15885ecf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,19 +1,15 @@ package eu.dnetlib.pace.model; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + import java.io.IOException; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.clustering.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.codehaus.jackson.map.ObjectMapper; public class ClusteringDef implements Serializable { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 0d08fdd03..055eaaf18 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,7 +1,8 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index 9accded3e..b1341fcac 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -1,8 +1,9 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index cb3b7b4b0..57552e606 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,10 +1,11 @@ package eu.dnetlib.pace.tree.support; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; + import java.io.IOException; import java.io.Serializable; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java index 36188e3c3..186e8d11e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java @@ -1,7 +1,7 @@ package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.map.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.HashMap; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 2dfa9ae89..bc846e71d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -36,6 +36,18 @@ public class BlockProcessor { this.dedupConf = dedupConf; } + + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { +// log.info("reducing key: '" + key + "' records: " + q.size()); + //process(q, context); + process(prepare(documents), context); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + public void process(final String key, final Iterable documents, final Reporter context) { final Queue q = prepare(documents); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java new file mode 100644 index 000000000..20141738c --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -0,0 +1,109 @@ +package eu.dnetlib.pace.util; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.model.MapDocument; +import net.minidev.json.JSONArray; + +import java.util.*; +import java.util.function.Predicate; + +public class MapDocumentUtil { + + + private static final ObjectMapper mapper = new ObjectMapper(); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + + + + public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { + MapDocument m = new MapDocument(); + m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), getJPathString(fdef.getPath(), json))); + break; + case URL: + String uv = getJPathString(fdef.getPath(), json); + if (!urlFilter.test(uv)) uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); + getJPathList(fdef.getPath(), json, fdef.getType()) + .stream() + .map(item -> new FieldValueImpl(fdef.getType(), fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + } + }); + m.setFieldMap(stringField); + return m; + } + + public static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath.read(json, path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { + + ((JSONArray) jresult).forEach(it -> { + + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + } + ); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + + + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String)o; + if (o instanceof JSONArray && ((JSONArray)o).size()>0) + return (String)((JSONArray)o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 3b87cedad..cd553baad 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,6 +1,10 @@ package eu.dnetlib.pace.config; + import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; import org.junit.Test; import java.util.Map; @@ -57,4 +61,27 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } + + + @Test + public void testAsMapDocumentJPath() throws Exception { + + DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); + + + System.out.println(load.getWf().getIdPath()); + + final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); + + System.out.println(result); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); + + System.out.println(mapDocument.getFieldMap()); + + } + + + + + } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json new file mode 100644 index 000000000..a24be241f --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json @@ -0,0 +1 @@ +{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": "doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json new file mode 100644 index 000000000..96094b814 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json @@ -0,0 +1,48 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "idPath": "$.entity.id", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "decisionTree": {}, + "model" : [ + { "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"}, + { "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"}, + { "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, + { "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } , + "synonyms": {} + } + +}