From 5c8f6febeeb75f6e9dd057d860a9136013e1f6d1 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 24 Jan 2020 10:01:11 +0100 Subject: [PATCH] minor changes in comparators --- .../pace/common/AbstractPaceFunctions.java | 19 +- .../eu/dnetlib/pace/tree/KeywordMatch.java | 2 +- .../pace/tree/support/TreeNodeDef.java | 2 +- .../pace/comparators/ComparatorTest.java | 15 +- .../eu/dnetlib/pace/config/ConfigTest.java | 19 +- .../pace/config/publication.current.conf.json | 47 +--- .../eu/dnetlib/pace/config/publication.json | 231 +++++++++++++++++- 7 files changed, 288 insertions(+), 47 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 705781e83..a4901fd53 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.io.StringWriter; import java.text.Normalizer; import java.util.*; +import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions { private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); + private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); + protected final static FieldList EMPTY_FIELD = new FieldListImpl(); protected String concat(final List l) { @@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions { } protected String cleanup(final String s) { - final String s0 = s.toLowerCase(); + final String s0 = unicodeNormalization(s.toLowerCase()); final String s1 = fixAliases(s0); final String s2 = nfd(s1); final String s3 = s2.replaceAll("–", " "); @@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions { } protected String normalize(final String s) { - return nfd(s) + return nfd(unicodeNormalization(s)) .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings .replaceAll("[^ \\w]+", "") @@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions { return Normalizer.normalize(s, Normalizer.Form.NFD); } + public String unicodeNormalization(final String s) { + + Matcher m = hexUnicodePattern.matcher(s); + StringBuffer buf = new StringBuffer(s.length()); + while (m.find()) { + String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); + m.appendReplacement(buf, Matcher.quoteReplacement(ch)); + } + m.appendTail(buf); + return buf.toString(); + } + protected String filterStopWords(final String s, final Set stopwords) { final StringTokenizer st = new StringTokenizer(s); final StringBuilder sb = new StringBuilder(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index 40a90a0e1..7d275425d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator { return 1.0; else { if (codes1.isEmpty() ^ codes2.isEmpty()) - return -1; //undefined if one of the two has no keywords + return -1.0; //undefined if one of the two has no keywords return commonElementsPercentage(codes1, codes2); } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 7b13118a6..530839ccd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable { fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats( weight, - Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), + Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 7365dba75..c1ce3d883 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import org.junit.Before; import org.junit.Test; @@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions { @Test public void testCleanForSorting() { NGramUtils utils = new NGramUtils(); - System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa")); + System.out.println(utils.cleanupForOrdering("University of Pisa")); } @Test @@ -111,10 +113,15 @@ public class ComparatorTest extends AbstractPaceFunctions { double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); System.out.println("result = " + result); - } - @Test - public void jsonListMatchTest() { + result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); + System.out.println("result = " + result); + + + } + + @Test + public void jsonListMatchTest(){ } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 3b2917c43..0a8964081 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.JsonListMatch; import eu.dnetlib.pace.util.MapDocumentUtil; +import org.junit.Before; import org.junit.Test; import java.util.HashMap; @@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull; public class ConfigTest extends AbstractPaceTest { + private Map params; + + @Before + public void setup() { + params = new HashMap<>(); + params.put("jpath_value", "$.value"); + params.put("jpath_classid", "$.qualifier.classid"); + } + @Test public void dedupConfigSerializationTest() { final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json")); @@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest { @Test public void asMapDocumentTest() { - DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json")); + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); - final String json = readFromClasspath("organization.json"); + final String json = readFromClasspath("publication.json"); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); System.out.println("mapDocument = " + mapDocument.getFieldMap()); + + JsonListMatch jsonListMatch = new JsonListMatch(params); + + jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null); + } @Test diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json index 3dd1830af..78a3b4e44 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json @@ -25,37 +25,13 @@ ], "includeChildren": "true", "maxIterations": 20, - "idPath": "$.entity.id" + "idPath": "$.id" }, "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - }, - { - "name": "lowercase", - "fields": [ - "doi" - ], - "params": {} - } + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { "start": { @@ -66,14 +42,13 @@ "weight": 1.0, "countIfUndefined": "false", "params": { - "threshold": "0.5", "jpath_value": "$.value", "jpath_classid": "$.qualifier.classid" } } ], - "threshold": 1.0, - "aggregation": "MAX", + "threshold": 0.5, + "aggregation": "AVG", "positive": "MATCH", "negative": "layer2", "undefined": "layer2", @@ -97,7 +72,7 @@ } ], "threshold": 1.0, - "aggregation": "NC", + "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", @@ -107,14 +82,14 @@ "fields": [ { "field": "title", - "comparator": "LevensteinTitle", + "comparator": "levensteinTitle", "weight": 1.0, "countIfUndefined": "true", "params": {} } ], "threshold": 0.99, - "aggregation": "SUM", + "aggregation": "AVG", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -130,7 +105,7 @@ { "name": "pid", "type": "JSON", - "path": "$.pid[*]", + "path": "$.pid", "overrideMatch": "true" }, { diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json index a24be241f..9486d6ba3 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json @@ -1 +1,230 @@ -{"kind": "entity","entity": {"type": "result","result": {"metadata": {"subject": [{"value": "open access","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "infrastructure","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "data model","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "CERIF","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}},{"value": "DataCite","qualifier": {"classid": "keyword","classname": "keyword","schemeid": "dnet:subject_classification_typologies","schemename": "dnet:subject_classification_typologies"}}],"title": [{"value": "The Data Model of the OpenAIRE Scientific Communication e-Infrastructure","qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"}}],"dateofacceptance": {"value": "2012-11-30"},"publisher": {"value": ""},"resulttype": {"classid": "publication","classname": "publication","schemeid": "dnet:result_typologies","schemename": "dnet:result_typologies"},"storagedate": {"value": "2012-11-30"},"resourcetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"size": {"value": ""},"version": {"value": ""},"description": [{"value": "The OpenAIREplus project aims to further develop and operate the OpenAIRE e-infrastructure, in order to provide a central entry point to Open Access and \\tnon-Open Access publications and datasets funded by the European Commission and National agencies. The infrastructure provides the services to populate, curate, and enrich an Information Space by collecting metadata descriptions relative to organizations, data sources, projects, funding programmes, persons, publications, and datasets. Stakeholders in the research process and\\t\\t\\t\\tscientific communication, such as researchers, funding agencies, organizations nvolved in projects, project coordinators, can here find the information to improve their research and statistics to measure the impact of Open Access and funding schemes over research. In this paper, we introduce the functional requirements to be satisfied and describe the OpenAIREplus data model entities and relationships required to represent information capable of meeting them."}],"license": [{"value": ""}],"author": [{"fullname": "Manghi, Paolo","name": "Paolo","surname": "Manghi","rank": 1},{"fullname": "Houssos, Nikos","name": "Nikos","surname": "Houssos","rank": 2,"pid": [{"key": "ORCID","value": "0000-0002-3748-8359"}]},{"fullname": "Mikulicic, Marko","name": "Marko","surname": "Mikulicic","rank": 3},{"fullname": "Jf6rg, Brigitte","name": "Brigitte","surname": "Jo\u0308rg","rank": 4}]},"instance": [{"accessright": {"classid": "OPEN","classname": "Open Access","schemeid": "dnet:access_modes","schemename": "dnet:access_modes"},"instancetype": {"classid": "0017","classname": "Report","schemeid": "dnet:dataCite_resource","schemename": "dnet:dataCite_resource"},"hostedby": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"license": {"value": ""},"url": ["http://dx.doi.org/10.1007/978-3-642-35233-1_18"],"collectedfrom": {"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"},"dateofacceptance": {"value": "2012-11-30"},"distributionlocation": ""}]},"originalId": ["123456789/7","10.1007/978-3-642-35233-1_18"],"collectedfrom": [{"key": "10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","value": "4Science-DSpace-CRIS-Test"}],"pid": [{"value": "123456789/7","qualifier": {"classid": "handle","classname": "handle","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}},{"value": "10.1007/978-3-642-35233-1_18","qualifier": {"classid": "doi","classname": "doi","schemeid": "dnet:pid_types","schemename": "dnet:pid_types"}}],"dateofcollection": "2019-11-05T10:07:42.263Z","id": "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","dateoftransformation": "2019-11-06T17:11:47.505Z","oaiprovenance": {"originDescription": {"harvestDate": "2019-11-05T10:07:42.263Z","altered": true,"baseURL": "https%3A%2F%2Fdspace-cris.4science.cloud%2Foai%2Fopenairecris","identifier": "oai:dspace-cris.4science.cloud:Publications/123456789/7","datestamp": "2019-09-05T21:52:21Z","metadataNamespace": ""}}},"dataInfo": {"inferred": false,"deletedbyinference": false,"trust": "0.9","inferenceprovenance": "","provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "sysimport:crosswalk:datasetarchive","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"invisible": false}} \ No newline at end of file +{ + "journal": { + "name": "", + "issnPrinted": "", + "issnOnline": "", + "issnLinking": "", + "ep": "", + "iss": "", + "sp": "", + "vol": "", + "edition": "", + "conferenceplace": "", + "conferencedate": "", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + "author": [ + { + "fullname": "Osti Guerrazzi, Amedeo", + "name": "Amedeo", + "surname": "Osti Guerrazzi", + "rank": 1, + "pid": [], + "affiliation": [] + } + ], + "resulttype": { + "classid": "publication", + "classname": "publication", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "language": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "country": [], + "subject": [], + "title": [ + { + "value": "Rezension: Gabriele Rigano: L\u2019interprete di Auschwitz. Arminio Wachsberger un testimone d\u2019eccezione della deportazione degli ebrei di Roma (rezensiert von Amedeo Osti Guerrazzi)", + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + } + ], + "relevantdate": [], + "description": [], + "dateofacceptance": { + "value": "2018-01-01", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + "publisher": { + "value": "BSB - Bavarian State Library", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + "embargoenddate": { + "value": "", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + "source": [], + "fulltext": [], + "format": [], + "contributor": [], + "resourcetype": { + "classid": "RezensionReview", + "classname": "RezensionReview", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "coverage": [], + "refereed": { + "value": "", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + "context": [], + "id": "50|datacite____::e23cad995f89bf6cfe63dcc83d41cce9", + "originalId": [ + "http://dx.doi.org/10.15463/rec.2071701652", + "10.15463/rec.2071701652", + "https://doi.org/10.15463/rec.2071701652" + ], + "collectedfrom": [ + { + "key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", + "value": "Datacite", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + } + ], + "pid": [ + { + "value": "https://doi.org/10.15463/rec.2071701652", + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + }, + { + "value": "10.15463/rec.2071701652", + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + } + } + } + ], + "extraInfo": [], + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "inferenceprovenance": "", + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + }, + "lastupdatetimestamp": 0 +} \ No newline at end of file