From 4bcf353a72893283e66776b2be9e4cd6eb6a3997 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 9 Aug 2019 15:41:49 +0200 Subject: [PATCH] implementation of the conditions in tree nodes. get rid of the conditions part of the configuration --- README.md | 4 +- .../eu/dnetlib/pace/authors.test.pace.conf | 14 +- .../eu/dnetlib/pace/org.curr.beta.conf | 12 +- .../resources/eu/dnetlib/pace/org.curr.conf | 21 ++- .../dnetlib/pace/organization.beta.pace.conf | 8 +- .../eu/dnetlib/pace/organization.pace.conf | 14 +- .../dnetlib/pace/organization.test.pace.conf | 6 +- .../dnetlib/pace/organization.test2.pace.conf | 10 +- .../eu/dnetlib/pace/result.full.pace.conf | 14 +- .../eu/dnetlib/pace/software.pace.conf | 10 +- .../eu/dnetlib/pace/software.test.pace.conf | 14 +- .../DetectorTest.java | 0 .../DiffPatchMatch.java | 2 +- .../eu/dnetlib/pace/authors.test.pace.conf | 14 +- .../eu/dnetlib/pace/organization.pace.conf | 14 +- .../eu/dnetlib/pace/organization.test.conf | 6 +- .../eu/dnetlib/pace/result.authors.pace.conf | 4 +- .../eu/dnetlib/pace/result.full.pace.conf | 14 +- .../eu/dnetlib/pace/result.pace.conf | 6 +- .../eu/dnetlib/pace/result.prod.pace.conf | 12 +- .../pace/common/AbstractPaceFunctions.java | 3 - .../pace/condition/AbstractCondition.java | 55 -------- .../pace/condition/AlwaysTrueCondition.java | 25 ---- .../dnetlib/pace/condition/ConditionAlgo.java | 27 ---- .../pace/condition/ConditionClass.java | 13 -- .../dnetlib/pace/condition/DoiExactMatch.java | 27 ---- .../pace/condition/DomainExactMatch.java | 32 ----- .../eu/dnetlib/pace/condition/ExactMatch.java | 50 ------- .../pace/condition/ExactMatchIgnoreCase.java | 43 ------ .../pace/condition/MustBeDifferent.java | 56 -------- .../eu/dnetlib/pace/condition/PidMatch.java | 63 --------- .../eu/dnetlib/pace/condition/SizeMatch.java | 56 -------- .../pace/condition/TitleVersionMatch.java | 35 ----- .../eu/dnetlib/pace/condition/YearMatch.java | 60 --------- .../java/eu/dnetlib/pace/config/Config.java | 20 +-- 
.../eu/dnetlib/pace/config/DedupConfig.java | 10 -- .../eu/dnetlib/pace/config/PaceConfig.java | 47 ------- .../pace/distance/AbstractDistance.java | 15 --- .../distance/ConfigurableDistanceAlgo.java | 26 ---- .../eu/dnetlib/pace/distance/Distance.java | 8 -- .../dnetlib/pace/distance/DistanceAlgo.java | 17 --- .../dnetlib/pace/distance/DistanceClass.java | 13 -- .../pace/distance/PaceDocumentDistance.java | 12 -- .../pace/distance/PairwiseComparison.java | 125 ------------------ .../distance/SecondStringDistanceAlgo.java | 114 ---------------- .../pace/distance/algo/AlwaysMatch.java | 39 ------ .../pace/distance/algo/ExactMatch.java | 39 ------ .../pace/distance/algo/JaroWinkler.java | 44 ------ .../algo/JaroWinklerNormalizedName.java | 76 ----------- .../pace/distance/algo/JaroWinklerTitle.java | 44 ------ .../pace/distance/algo/Level2JaroWinkler.java | 34 ----- .../distance/algo/Level2JaroWinklerTitle.java | 49 ------- .../pace/distance/algo/Level2Levenstein.java | 34 ----- .../pace/distance/algo/Levenstein.java | 34 ----- .../pace/distance/algo/LevensteinTitle.java | 57 -------- .../algo/LevensteinTitleIgnoreVersion.java | 58 -------- .../pace/distance/algo/MustBeDifferent.java | 39 ------ .../pace/distance/algo/NullDistanceAlgo.java | 29 ---- .../pace/distance/algo/SortedJaroWinkler.java | 60 --------- .../algo/SortedLevel2JaroWinkler.java | 60 --------- .../algo/SortedSecondStringDistanceAlgo.java | 48 ------- .../distance/algo/SubStringLevenstein.java | 99 -------------- .../pace/distance/algo/UrlMatcher.java | 59 --------- .../pace/distance/eval/ConditionEval.java | 56 -------- .../pace/distance/eval/ConditionEvalMap.java | 38 ------ .../pace/distance/eval/DistanceEval.java | 56 -------- .../java/eu/dnetlib/pace/model/CondDef.java | 50 ------- .../java/eu/dnetlib/pace/model/FieldDef.java | 25 ---- .../eu/dnetlib/pace/tree/AlwaysMatch.java | 5 +- .../eu/dnetlib/pace/tree/DoiExactMatch.java | 28 ++++ .../dnetlib/pace/tree/DomainExactMatch.java | 29 ++++ 
.../pace/tree/ExactMatchIgnoreCase.java | 31 +++++ .../eu/dnetlib/pace/tree/JaroWinkler.java | 3 - .../pace/tree/JaroWinklerNormalizedName.java | 3 - .../dnetlib/pace/tree/JaroWinklerTitle.java | 2 - .../dnetlib/pace/tree/Level2JaroWinkler.java | 2 - .../pace/tree/Level2JaroWinklerTitle.java | 2 - .../dnetlib/pace/tree/Level2Levenstein.java | 2 - .../java/eu/dnetlib/pace/tree/Levenstein.java | 2 - .../eu/dnetlib/pace/tree/LevensteinTitle.java | 2 - .../tree/LevensteinTitleIgnoreVersion.java | 2 - .../eu/dnetlib/pace/tree/MustBeDifferent.java | 2 - .../dnetlib/pace/tree/NullDistanceAlgo.java | 2 - .../java/eu/dnetlib/pace/tree/PidMatch.java | 63 +++++++++ .../java/eu/dnetlib/pace/tree/SizeMatch.java | 50 +++++++ .../dnetlib/pace/tree/SortedJaroWinkler.java | 2 - .../pace/tree/SortedLevel2JaroWinkler.java | 2 - .../pace/tree/SubStringLevenstein.java | 2 - .../dnetlib/pace/tree/TitleVersionMatch.java | 39 ++++++ .../java/eu/dnetlib/pace/tree/UrlMatcher.java | 1 - .../java/eu/dnetlib/pace/tree/YearMatch.java | 50 +++++++ .../pace/tree/support/AbstractComparator.java | 8 +- .../dnetlib/pace/tree/support/Comparator.java | 4 + .../pace/tree/support/TreeNodeDef.java | 49 ++----- .../pace/tree/support/TreeNodeStats.java | 86 ++++++++++++ .../pace/tree/support/TreeProcessor.java | 58 ++++++++ .../eu/dnetlib/pace/util/BlockProcessor.java | 10 +- .../eu/dnetlib/pace/util/PaceResolver.java | 34 ----- .../DistanceAlgoTest.java | 4 +- .../dnetlib/pace/condition/ConditionTest.java | 7 - .../eu/dnetlib/pace/config/org.curr.conf | 12 +- 101 files changed, 573 insertions(+), 2303 deletions(-) rename dnet-dedup-test/src/test/java/eu/dnetlib/pace/{distance => comparators}/DetectorTest.java (100%) rename dnet-dedup-test/src/test/java/eu/dnetlib/pace/{distance => comparators}/DiffPatchMatch.java (99%) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java delete mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DomainExactMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java delete mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java delete mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java rename dnet-pace-core/src/test/java/eu/dnetlib/pace/{distance => comparators}/DistanceAlgoTest.java (98%) delete mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java diff --git a/README.md b/README.md index 8fc52d7..5b74bd9 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ The decision tree has to be defined into the json configuration. The field decis : the nodeName is the key, the treeNodeDef contains the definition of the node. In particular the TreeNodeDef contains: - - List of FieldConf : list of fields processed by the node. Each field is associated to: + - List of FieldConf : list of fieldsCount processed by the node. Each field is associated to: - field: name of the field - comparator: name of the comparator to use for that particular field, it produces a similarity score, -1 if the comparison is not possible (missing field or few informations). > Each FieldConf contains a comparator name which has to be defined. 
It is sufficient to implement the Comparator interface that exposes a "compare" method returning the similarity score. The new comparator must be annotated with @ComparatorClass("name") specifying the name used by the FieldConf to access to the right comparator. @@ -19,7 +19,7 @@ if score>=th --- positive result if score==-1 --- undefined result if score<\th --- negative result ``` - - aggregation: defines the type of aggregation to apply to the similarity scores of the fields in the list of fields + - aggregation: defines the type of aggregation to apply to the similarity scores of the fieldsCount in the list of fieldsCount - possible values: AVG(average), MAX, MIN, SUM - e.g. the similarity scores are multiplied with the weight and then the defined aggregation is applied - arcs: define the next node of the tree depending on the result diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf index 412299c..c27e7f7 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/authors.test.pace.conf @@ -12,16 +12,16 @@ }, "pace": { "clustering": [ - {"name": "personClustering", "fields": ["fullname"], "params": {}} + {"name": "personClustering", "fieldsCount": ["fullname"], "params": {}} ], "necessaryConditions": [], "decisionTree": { - "start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"}, - "layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"}, - "layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, 
"params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"}, - "layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"}, - "layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}, - "layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"} + "start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"}, + "layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"}, + "layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"}, + "layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"}, + "layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, 
"params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}, + "layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"} }, "model": [ {"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"}, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf index 59313bf..e2f2fc4 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf @@ -12,16 +12,16 @@ }, "pace" : { "clustering" : [ - { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } ], "sufficientConditions" : [ - { "name" : "exactMatch", "fields" : [ "gridid" ] } + { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, + { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } ], "model" : [ { "name" : "country", "algo" : "Null", "type" : "String", 
"weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index 2fe6c73..dd4c4bf 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -17,22 +17,17 @@ { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], - "sufficientConditions" : [ - { "name" : "exactMatch", "fields" : [ "gridid" ] } - ], - "necessaryConditions" : [ - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, - { "name" : "exactMatch", "fields" : [ "country" ] } - ], "decisionTree" : { - "start": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "ignoreMissing":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "ignoreMissing":"true", "params":{}}], "threshold": 0.9, "aggregation": "WEIGHTED_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreMissing": "true"} + "start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreMissing": "true"}, + "layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "ignoreMissing":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreMissing": "false"}, + "layer3": {"fields": [{"field":"legalname", 
"comparator":"jaroWinklerNormalizedName", "weight":0.9, "ignoreMissing":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "ignoreMissing":"true", "params":{}}], "threshold": 0.9, "aggregation": "WEIGHTED_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreMissing": "true"} }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid", "ignoreMissing" : "true" }, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value", "ignoreMissing" : "true" }, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7}, "ignoreMissing" : "false" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 }, "ignoreMissing" : "true" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value", "ignoreMissing" : "true" } + { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, + { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, + { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} ], "blacklists" : { "legalname" : [] diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.beta.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.beta.pace.conf index 0e0c156..a2bc7ae 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.beta.pace.conf +++ 
b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.beta.pace.conf @@ -12,12 +12,12 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } } + { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] } ], "model" : [ { "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.pace.conf index f542abb..8a1a1a8 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.pace.conf @@ -12,17 +12,17 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } }, - { "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "ngrampairs", 
"fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } }, + { "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }, + { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } ], "sufficientConditions":[ - { "name" : "exactMatch", "fields" : [ "gridid" ] } + { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] } ], "model" : [ { "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test.pace.conf index 4c419b0..57db92c 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test.pace.conf @@ -12,11 +12,11 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } + { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] } ], "model" : [ { "name" : "legalname", "algo" : "Levenstein", "type" : "String", "weight" : 
"1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf index 82ec993..a091f61 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf @@ -12,13 +12,13 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, + { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } ], "decisionTree": {}, "model" : [ diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf index a1c8d4a..b6c1bd6 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf @@ -12,15 +12,15 @@ }, "pace" : { "clustering" : [ - { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { 
"name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + { "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } ], "necessaryConditions" : [ - { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } , - { "name" : "pidMatch", "fields" : [ "pid" ] } + { "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }, + { "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } , + { "name" : "pidMatch", "fieldsCount" : [ "pid" ] } ], "model" : [ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.pace.conf index 61e5421..1dcd006 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.pace.conf @@ -14,15 +14,15 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } } + { "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fieldsCount" : [ "doi", "url" ], "params" : { } } 
], "sufficientConditions" : [ - { "name" : "exactMatch", "fields" : [ "doi", "resulttype", "url" ] } + { "name" : "exactMatch", "fieldsCount" : [ "doi", "resulttype", "url" ] } ], "necessaryConditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] } + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] } ], "model" : [ { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" }, diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.test.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.test.pace.conf index d1f45a9..d22cd74 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.test.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/software.test.pace.conf @@ -14,17 +14,17 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }, - { "name" : "urlclustering", "fields": [ "url" ], "params" : { } } + { "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fieldsCount" : [ "doi" ], "params" : { } }, + { "name" : "urlclustering", "fieldsCount": [ "url" ], "params" : { } } ], "sufficientConditions" : [ - { "name" : "doiExactMatch", "fields": [ "doi" ] }, - { "name" : "exactMatch", "fields" : [ "url", "documentationUrl" ] } + { "name" : "doiExactMatch", "fieldsCount": [ "doi" ] }, + { "name" : "exactMatch", "fieldsCount" : [ "url", "documentationUrl" ] } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : ["resulttype"] } + { "name" : "exactMatch", "fieldsCount" : 
["resulttype"] } ], "model" : [ { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" }, diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/comparators/DetectorTest.java similarity index 100% rename from dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java rename to dnet-dedup-test/src/test/java/eu/dnetlib/pace/comparators/DetectorTest.java diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DiffPatchMatch.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/comparators/DiffPatchMatch.java similarity index 99% rename from dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DiffPatchMatch.java rename to dnet-dedup-test/src/test/java/eu/dnetlib/pace/comparators/DiffPatchMatch.java index b3a0af6..fea9fb3 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DiffPatchMatch.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/comparators/DiffPatchMatch.java @@ -1,4 +1,4 @@ -package eu.dnetlib.pace.distance; +package eu.dnetlib.pace.comparators; /* * Diff Match and Patch diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/authors.test.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/authors.test.pace.conf index 412299c..c27e7f7 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/authors.test.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/authors.test.pace.conf @@ -12,16 +12,16 @@ }, "pace": { "clustering": [ - {"name": "personClustering", "fields": ["fullname"], "params": {}} + {"name": "personClustering", "fieldsCount": ["fullname"], "params": {}} ], "necessaryConditions": [], "decisionTree": { - "start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": 
"layer2", "ignoreMissing": "false"}, - "layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"}, - "layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"}, - "layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"}, - "layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}, - "layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"} + "start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"}, + "layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"}, + "layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"}, + "layer4": 
{"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"}, + "layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}, + "layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"} }, "model": [ {"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"}, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf index 31ad191..d8b2f00 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf @@ -12,15 +12,15 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } }, - { "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : 
"immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } }, + { "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }, + { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "mustBeDifferent", "fields" : [ "gridid" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, + { "name" : "mustBeDifferent", "fieldsCount" : [ "gridid" ] } ], "model" : [ { "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf index 304d32d..be3a9bf 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.test.conf @@ -12,11 +12,11 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } + { "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] } ], "model" : [ { "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf 
b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf index 0f4d5ea..5b67978 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf @@ -12,8 +12,8 @@ }, "pace" : { "necessaryConditions" : [ - { "name" : "sizeMatch", "fields" : [ "authors" ] }, - { "name" : "titleVersionMatch", "fields" : [ "title" ] } + { "name" : "sizeMatch", "fieldsCount" : [ "authors" ] }, + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] } ], "model" : [ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf index 1850157..cb70c63 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf @@ -12,15 +12,15 @@ }, "pace" : { "clustering" : [ - { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + { "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } ], "necessaryConditions" : [ - { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } , - { 
"name" : "pidMatch", "fields" : [ "pid" ] } + { "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }, + { "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } , + { "name" : "pidMatch", "fieldsCount" : [ "pid" ] } ], "model" : [ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf index fc65b17..992d57e 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf @@ -12,11 +12,11 @@ }, "pace" : { "sufficientConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } + { "name" : "pidMatch", "fieldsCount" : [ "pid" ] } ], "necessaryConditions" : [ - { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, - { "name" : "titleVersionMatch", "fields" : [ "title" ] } + { "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] } ], "model" : [ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf index 6cc048a..7b8bb12 100644 --- a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf @@ -13,16 +13,16 @@ }, "pace" : { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", 
"fields" : [ "doi" ], "params" : { } } + { "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fieldsCount" : [ "doi" ], "params" : { } } ], "sufficientConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } + { "name" : "pidMatch", "fieldsCount" : [ "pid" ] } ], "necessaryConditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } + { "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }, + { "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } ], "model" : [ { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" }, diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 24379c6..e453604 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -15,10 +15,7 @@ import org.apache.commons.lang.StringUtils; import java.text.Normalizer; import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Set of common functions diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java deleted file mode 100644 index 5c7b4d1..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ /dev/null @@ -1,55 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import 
eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.distance.eval.ConditionEvalMap; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Abstract necessaryConditions needs a list of field names. - * - * @author claudio - * - */ -public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo { - - protected String cond; - - protected List fields; - - public AbstractCondition(final String cond, final List fields) { - this.cond = cond; - this.fields = fields; - } - - protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); - - @Override - public ConditionEvalMap verify(final Document a, final Document b) { - final ConditionEvalMap res = new ConditionEvalMap(); - for (final FieldDef fd : getFields()) { - - final Field va = a.values(fd.getName()); - final Field vb = b.values(fd.getName()); - - if (fd.isIgnoreMissing()) { - res.put(fd.getName(), verify(fd, va, vb)); - } else { - if (va.isEmpty() || vb.isEmpty()) { - res.put(fd.getName(), new ConditionEval(cond, va, vb, -1)); - } else { - res.put(fd.getName(), verify(fd, va, vb)); - } - } - } - return res; - } - - public List getFields() { - return fields; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java deleted file mode 100644 index 2274da5..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java +++ /dev/null @@ -1,25 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Default always true condition - * - * @author claudio - */ -@ConditionClass("alwaystruecondition") -public class AlwaysTrueCondition extends AbstractCondition { - - public 
AlwaysTrueCondition(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - return new ConditionEval(cond, a, b, 1); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java deleted file mode 100644 index 1ea9caa..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ /dev/null @@ -1,27 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; -import eu.dnetlib.pace.distance.eval.ConditionEvalMap; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Allows to express general necessaryConditions to be satisfied or not between two Documents. - * - * @author claudio - */ -public interface ConditionAlgo { - - /** - * Verify a condition. - * - * @param a - * the Document a - * @param b - * the Document b - * @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when - * the condition is not verified. 
- */ - public abstract ConditionEvalMap verify(Document a, Document b); - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java deleted file mode 100644 index 155360c..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java +++ /dev/null @@ -1,13 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface ConditionClass { - - public String value(); -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java deleted file mode 100644 index dfdc5cd..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java +++ /dev/null @@ -1,27 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * The Class ExactMatch. 
- * - * @author claudio - */ -@ConditionClass("doiExactMatch") -public class DoiExactMatch extends ExactMatchIgnoreCase { - - public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - - public DoiExactMatch(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected String getValue(final Field f) { - return super.getValue(f).replaceAll(PREFIX, ""); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DomainExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DomainExactMatch.java deleted file mode 100644 index dffe2ca..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DomainExactMatch.java +++ /dev/null @@ -1,32 +0,0 @@ -package eu.dnetlib.pace.condition; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.List; - -@ConditionClass("DomainExactMatch") -public class DomainExactMatch extends ExactMatchIgnoreCase { - - public DomainExactMatch(String cond, List fields) { - super(cond, fields); - } - - @Override - protected String getValue(final Field f) { - return asUrl(super.getValue(f)).getHost(); - } - - private URL asUrl(final String value) { - try { - if (value.isEmpty()) - return new URL("http://"); - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java deleted file mode 100644 index a4cd847..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java +++ /dev/null @@ -1,50 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import 
eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.lang.StringUtils; - -/** - * The Class ExactMatch. - * - * @author claudio - */ -@ConditionClass("exactMatch") -public class ExactMatch extends AbstractCondition { - - public ExactMatch(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - - final String fa = getValue(a); - final String fb = getValue(b); - - int res; - -// if (StringUtils.isBlank(fa) && StringUtils.isBlank(fb)) { -// res = 0; -// } else { -// res = fa.equals(fb) ? 1 : -1; -// } - - //if there is a blank, undefined result - if (StringUtils.isBlank(fa) || StringUtils.isBlank(fb)) { - res = 0; - } else { - res = fa.equals(fb) ? 1 : -1; - } - - return new ConditionEval(cond, a, b, res); - } - - protected String getValue(final Field f) { - return getFirstValue(f); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java deleted file mode 100644 index e9925ec..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java +++ /dev/null @@ -1,43 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; -import org.apache.commons.lang.StringUtils; - -/** - * The Class ExactMatch. 
- * - * @author claudio - */ -@ConditionClass("exactMatchIgnoreCase") -public class ExactMatchIgnoreCase extends AbstractCondition { - - public ExactMatchIgnoreCase(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - - final String fa = getValue(a); - final String fb = getValue(b); - - int res; - - if (StringUtils.isBlank(fa) || StringUtils.isBlank(fb)) { - res = 0; - } else { - res = fa.equalsIgnoreCase(fb) ? 1 : -1; - } - - return new ConditionEval(cond, a, b, res); - } - - protected String getValue(final Field f) { - return getFirstValue(f); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java deleted file mode 100644 index f2b3bdb..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java +++ /dev/null @@ -1,56 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import com.google.common.collect.Iterables; -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Returns true if the field values are different. - * - * @author claudio - */ -@ConditionClass("mustBeDifferent") -public class MustBeDifferent extends AbstractCondition { - - /** - * Instantiates a new size match. - * - * @param fields the fields - */ - public MustBeDifferent(final String cond, final List fields) { - super(cond, fields); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) - */ - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - - final String fa = getValue(a); - final String fb = getValue(b); - - return new ConditionEval(cond, a, b, fa.equals(fb) ? 
-1 : 1); - - } - - protected String getValue(final Field f) { - return getFirstValue(f); - } - - /** - * Checks if is empty. - * - * @param a the a - * @return true, if is empty - */ - protected boolean isEmpty(final Iterable a) { - return (a == null) || Iterables.isEmpty(a); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java deleted file mode 100644 index c15729e..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ /dev/null @@ -1,63 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.google.common.collect.Sets; -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.adaptor.Pid; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * The Class PidMatch. 
- * - * @author claudio - */ -@ConditionClass("pidMatch") -public class PidMatch extends AbstractCondition { - - private static final Log log = LogFactory.getLog(PidMatch.class); - - public PidMatch(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); - - final List pal = Pid.fromOafJson(sa); - final List pbl = Pid.fromOafJson(sb); - - final Set pidAset = toHashSet(pal); - final Set pidBset = toHashSet(pbl); - - int incommon = Sets.intersection(pidAset, pidBset).size(); - int simDiff = Sets.symmetricDifference(pidAset, pidBset).size(); - - if (incommon + simDiff == 0) { - return new ConditionEval(cond, a, b, 0); - } - - int result = incommon / (incommon + simDiff) > 0.5 ? 1 : -1; - - return new ConditionEval(cond, a, b, result); - } - - //lowercase + normalization of the pid before adding it to the set - private Set toHashSet(List pbl) { - - return pbl.stream() - .map(pid -> pid.getType() + normalizePid(pid.getValue())) - .collect(Collectors.toCollection(HashSet::new)); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java deleted file mode 100644 index afd0a8e..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java +++ /dev/null @@ -1,56 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import com.google.common.collect.Iterables; - -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Returns true if the number of values in the fields is the same. - * - * @author claudio - */ -@ConditionClass("sizeMatch") -public class SizeMatch extends AbstractCondition { - - /** - * Instantiates a new size match. 
- * - * @param fields - * the fields - */ - public SizeMatch(final String cond, final List fields) { - super(cond, fields); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) - */ - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - - // if (a.isEmpty() & b.isEmpty()) return 1; - // - // if (a.isEmpty()) return -1; - // if (b.isEmpty()) return -1; - - return new ConditionEval(cond, a, b, Iterables.size(a) == Iterables.size(b) ? 1 : -1); - } - - /** - * Checks if is empty. - * - * @param a - * the a - * @return true, if is empty - */ - protected boolean isEmpty(final Iterable a) { - return (a == null) || Iterables.isEmpty(a); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java deleted file mode 100644 index 4b94a04..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java +++ /dev/null @@ -1,35 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.util.List; - -import eu.dnetlib.pace.distance.eval.ConditionEval; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Returns true if the titles in the given documents contains the same numbers, false otherwise. - * - * @author claudio - * - */ -@ConditionClass("titleVersionMatch") -public class TitleVersionMatch extends AbstractCondition { - - public TitleVersionMatch(final String cond, final List fields) { - super(cond, fields); - } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - final String valueA = getFirstValue(a); - final String valueB = getFirstValue(b); - - return new ConditionEval(cond, a, b, notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 
1 : -1); - } - - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java deleted file mode 100644 index 71bb6cf..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java +++ /dev/null @@ -1,60 +0,0 @@ -package eu.dnetlib.pace.condition; - -import java.time.Year; -import java.util.List; - -import eu.dnetlib.pace.distance.eval.ConditionEval; -import org.apache.commons.lang.StringUtils; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing. - * - * @author claudio - */ -@ConditionClass("yearMatch") -public class YearMatch extends AbstractCondition { - - private int limit = 4; - - public YearMatch(final String cond, final List fields) { - super(cond, fields); - } - - // @Override - // public boolean verify(final Document a, final Document b) { - // boolean res = true; - // for (FieldDef fd : getFields()) { - // - // } - // - // return res; - // } - - @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { - final String valueA = getNumbers(getFirstValue(a)); - final String valueB = getNumbers(getFirstValue(b)); - - final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); - final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); - - return new ConditionEval(cond, a, b, lengthMatch && valueA.equals(valueB) || onemissing ? 1 : -1); - } - - protected boolean checkLength(final String s) { - return s.length() == limit; - } - - protected String getFirstValue(final Field value) { - return (value != null) && !value.isEmpty() ? 
StringUtils.left(value.stringValue(), limit) : ""; - } - - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 1a12e69..298ff94 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -22,7 +22,11 @@ public interface Config { */ public List model(); - + /** + * Decision Tree definition + * + * @return the map representing the decision tree + */ public Map decisionTree(); /** @@ -32,20 +36,6 @@ public interface Config { */ public Map modelMap(); - /** - * Strict Pre-Condition definitions. - * - * @return the list of necessaryConditions - */ - public List sufficientConditions(); - - /** - * Pre-Condition definitions. - * - * @return the list of necessaryConditions - */ - public List necessaryConditions(); - /** * Clusterings. 
* diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 805f5c3..f1cb9a3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -130,16 +130,6 @@ public class DedupConfig implements Config, Serializable { return getPace().getModelMap(); } - @Override - public List sufficientConditions() { - return getPace().getStrictConditionAlgos(); - } - - @Override - public List necessaryConditions() { - return getPace().getConditionAlgos(); - } - @Override public List clusterings() { return getPace().getClustering(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index d90cfe3..735af2c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,27 +1,20 @@ package eu.dnetlib.pace.config; -import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; -import org.apache.commons.collections.CollectionUtils; import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; public class PaceConfig implements Serializable { private List model; - private List sufficientConditions; - private List necessaryConditions; private List clustering; private Map decisionTree; @@ -50,32 +43,6 @@ public class PaceConfig implements Serializable { this.model = model; } - public List getSufficientConditions() { - return 
sufficientConditions; - } - - public void setSufficientConditions(final List sufficientConditions) { - this.sufficientConditions = sufficientConditions; - } - - public List getNecessaryConditions() { - return necessaryConditions; - } - - @JsonIgnore - public List getConditionAlgos() { - return asConditionAlgos(getNecessaryConditions()); - } - - @JsonIgnore - public List getStrictConditionAlgos() { - return asConditionAlgos(getSufficientConditions()); - } - - public void setNecessaryConditions(final List necessaryConditions) { - this.necessaryConditions = necessaryConditions; - } - public List getClustering() { return clustering; } @@ -108,18 +75,4 @@ public class PaceConfig implements Serializable { this.modelMap = modelMap; } - // helper - - private List asConditionAlgos(final List defs) { - final List algos = Lists.newArrayList(); - if (CollectionUtils.isEmpty(defs)) return algos; - for (final CondDef cd : defs) { - final List fields = getModel().stream() - .filter(fd -> cd.getFields().contains(fd.getName())) - .collect(Collectors.toList()); - algos.add(cd.conditionAlgo(fields)); - } - return algos; - } - } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java deleted file mode 100644 index 3304f36..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java +++ /dev/null @@ -1,15 +0,0 @@ -//package eu.dnetlib.pace.distance; -// -//import eu.dnetlib.pace.config.Config; -//import eu.dnetlib.pace.distance.eval.ScoreResult; -//import eu.dnetlib.pace.model.Document; -// -//public abstract class AbstractDistance implements Distance { -// -// protected abstract Document toDocument(A a); -// -// @Override -// public boolean between(final A a, final A b, final Config config) { -// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b)); -// } -//} diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java deleted file mode 100644 index b354f06..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java +++ /dev/null @@ -1,26 +0,0 @@ -package eu.dnetlib.pace.distance; - -import java.util.Map; - -import eu.dnetlib.pace.common.AbstractPaceFunctions; - -public abstract class ConfigurableDistanceAlgo extends AbstractPaceFunctions { - - private Map params; - - private double weigth; - - public ConfigurableDistanceAlgo(final Map params, final double weight) { - this.params = params; - this.weigth = weight; - } - - public Map getParams() { - return params; - } - - public double getWeigth() { - return weigth; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java deleted file mode 100644 index b812bd1..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java +++ /dev/null @@ -1,8 +0,0 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.config.Config; - -public interface Distance { - - public boolean between(A a, A b, Config config); -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java deleted file mode 100644 index 80b2191..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ /dev/null @@ -1,17 +0,0 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.model.Field; - -import java.util.Map; - -/** - * Each field is configured with a compare algo which knows how to compute the compare (0-1) between the fields of two - * objects. 
- */ -public interface DistanceAlgo { - - public abstract double distance(Field a, Field b); - - public double getWeight(); - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java deleted file mode 100644 index 9479fdb..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java +++ /dev/null @@ -1,13 +0,0 @@ -package eu.dnetlib.pace.distance; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface DistanceClass { - - public String value(); -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java deleted file mode 100644 index 9c75bfc..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java +++ /dev/null @@ -1,12 +0,0 @@ -//package eu.dnetlib.pace.distance; -// -//import eu.dnetlib.pace.model.Document; -// -//public class PaceDocumentDistance extends AbstractDistance { -// -// @Override -// protected Document toDocument(Document a) { -// return a; -// } -// -//} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java deleted file mode 100644 index 125919d..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PairwiseComparison.java +++ /dev/null @@ -1,125 +0,0 @@ -package eu.dnetlib.pace.distance; - -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.eval.ConditionEvalMap; -import eu.dnetlib.pace.model.*; -import eu.dnetlib.pace.tree.support.MatchType; -import 
eu.dnetlib.pace.tree.support.TreeNodeDef; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.Reporter; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.List; -import java.util.Map; - -/** - * The compare between two documents is given by the weighted mean of the field distances - */ -public class PairwiseComparison { - - private static final Log log = LogFactory.getLog(PairwiseComparison.class); - - private Config config; - - public PairwiseComparison(final Config config) { - this.config = config; - } - - public boolean compare(final MapDocument a, final MapDocument b) { - - //verify sufficientConditions - if (verify(a, b, config.sufficientConditions()).result() > 0) - return true; - - //verify necessaryConditions - if (verify(a, b, config.necessaryConditions()).result() < 0) - return false; - - //evaluate the decision tree - return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH; - } - - private ConditionEvalMap verify(final Document a, final Document b, final List conditions) { - final ConditionEvalMap res = new ConditionEvalMap(); - - for (final ConditionAlgo cd : conditions) { - final ConditionEvalMap map = cd.verify(a, b); - res.mergeFrom(map); - - // commented out shortcuts - /* - if (map.anyNegative()) { - return res; - } - */ - - //if (strict && (res < 0)) return -1; - //cond += verify; - } - return res; - } - - public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map decisionTree){ - - String current = "start"; - double similarity; - - while (MatchType.parse(current)==MatchType.UNDEFINED) { - - TreeNodeDef currentNode = decisionTree.get(current); - //throw an exception if the node doesn't exist - if (currentNode == null) - throw new PaceException("The Tree Node doesn't exist: " + current); - - similarity = currentNode.evaluate(doc1, doc2); - - if (similarity == -1) { - current = currentNode.getUndefined(); - } - else if 
(similarity>=currentNode.getThreshold()){ - current = currentNode.getPositive(); - } - else { - current = currentNode.getNegative(); - } - - } - - return MatchType.parse(current); - } - -// private Field getValue(final Document d, final FieldDef fd) { -// final Field v = d.values(fd.getName()); -// if (fd.getLength() > 0) { -// -// if (v instanceof FieldValueImpl) { -// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength())); -// } else if (v instanceof FieldListImpl) { -// List strings = ((FieldListImpl) v).stringList(); -// strings = strings.stream() -// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) -// .map(s -> StringUtils.substring(s, 0, fd.getLength())) -// .collect(Collectors.toList()); -// ((FieldListImpl) v).clear(); -// ((FieldListImpl) v).addAll(strings.stream() -// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size()) -// .map(s -> StringUtils.substring(s, 0, fd.getLength())) -// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s)) -// .collect(Collectors.toList())); -// } -// } -// -// return v; -// } -// -// private double sumWeights(final Collection fields) { -// double sum = 0.0; -// for (final FieldDef fd : fields) { -// sum += fd.getWeight(); -// } -// return sum; -// } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java deleted file mode 100644 index d3cccce..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ /dev/null @@ -1,114 +0,0 @@ -package eu.dnetlib.pace.distance; - -import java.io.Serializable; -import java.util.List; -import java.util.Map; - -import com.wcohen.ss.AbstractStringDistance; - -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; - -/** - * For the rest of the fields delegate 
the compare measure to the second string library. - */ -public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo { - - // val aliases = Map(('₁' to '₉') zip ('1' to '9'): _*) ++ Map(('⁴' to '⁹') zip ('4' to '9'): _*) ++ Map('¹' -> '1', '²' -> - // '2', * '³' - // -> '3') - - /** The ssalgo. */ - protected AbstractStringDistance ssalgo; - - /** The weight. */ - protected double weight = 0.0; - - private Map params; - - protected SecondStringDistanceAlgo(Map params, final AbstractStringDistance ssalgo){ - this.params = params; - this.weight = params.get("weight").doubleValue(); - this.ssalgo = ssalgo; - } - - /** - * Instantiates a new second string compare algo. - * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected SecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) { - this.ssalgo = ssalgo; - this.weight = weight; - } - - protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){ - this.ssalgo = ssalgo; - } - - /** - * Normalize. - * - * @param d - * the d - * @return the double - */ - protected abstract double normalize(double d); - - /** - * Distance. - * - * @param a - * the a - * @param b - * the b - * @return the double - */ - public double distance(final String a, final String b) { - double score = ssalgo.score(a, b); - return normalize(score); - } - - /** - * Distance. 
- * - * @param a - * the a - * @param b - * the b - * @return the double - */ - protected double distance(final List a, final List b) { - return distance(concat(a), concat(b)); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.DistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) - */ - @Override - public double distance(final Field a, final Field b) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); - - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); - } - - /** - * To list. - * - * @param list - * the list - * @return the list - */ - protected List toList(final Field list) { - return ((FieldList) list).stringList(); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java deleted file mode 100644 index 503235c..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java +++ /dev/null @@ -1,39 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("AlwaysMatch") -public class AlwaysMatch extends SecondStringDistanceAlgo { - - public AlwaysMatch(final Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } - - public AlwaysMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - return 1.0; - } - - @Override - public double 
getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java deleted file mode 100644 index 44d881e..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java +++ /dev/null @@ -1,39 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("ExactMatch") -public class ExactMatch extends SecondStringDistanceAlgo { - - public ExactMatch(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } - - public ExactMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - return a.equals(b) ? 
1.0 : 0; - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java deleted file mode 100644 index 20c0912..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java +++ /dev/null @@ -1,44 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.io.Serializable; -import java.util.Map; - -//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) -@DistanceClass("JaroWinkler") -public class JaroWinkler extends SecondStringDistanceAlgo { - - public JaroWinkler(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } - - public JaroWinkler(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(String a, String b) { - String ca = cleanup(a); - String cb = cleanup(b); - - return normalize(ssalgo.score(ca, cb)); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java deleted file mode 100644 index 546629b..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ /dev/null @@ -1,76 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; 
-import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; -import java.util.Set; - -@DistanceClass("JaroWinklerNormalizedName") -public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { - - private Map params; - - public JaroWinklerNormalizedName(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - this.params = params; - } - - public JaroWinklerNormalizedName(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(String a, String b) { - String ca = cleanup(a); - String cb = cleanup(b); - - ca = normalize(ca); - cb = normalize(cb); - - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); - - Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); - - Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); - Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); - - if (sameCity(cities1,cities2)) { - - if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { - - ca = removeKeywords(ca, keywords1); - ca = removeKeywords(ca, cities1); - cb = removeKeywords(cb, keywords2); - cb = removeKeywords(cb, cities2); - - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return normalize(ssalgo.score(ca,cb)); - - } - } - - return 0.0; - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java deleted file mode 100644 index ff4d6de..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java +++ /dev/null @@ -1,44 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) -@DistanceClass("JaroWinklerTitle") -public class JaroWinklerTitle extends SecondStringDistanceAlgo { - - public JaroWinklerTitle(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } - - public JaroWinklerTitle(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(String a, String b) { - String ca = cleanup(a); - String cb = cleanup(b); - - boolean check = checkNumbers(ca, cb); - return check ? 
0.5 : normalize(ssalgo.score(ca, cb)); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java deleted file mode 100644 index 135fc53..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java +++ /dev/null @@ -1,34 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("Level2JaroWinkler") -public class Level2JaroWinkler extends SecondStringDistanceAlgo { - - public Level2JaroWinkler(Map params){ - super(params, new com.wcohen.ss.Level2JaroWinkler()); - } - - public Level2JaroWinkler(double w) { - super(w, new com.wcohen.ss.Level2JaroWinkler()); - } - - protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java deleted file mode 100644 index 2d05a00..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java +++ /dev/null @@ -1,49 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("Level2JaroWinklerTitle") -public class Level2JaroWinklerTitle extends 
SecondStringDistanceAlgo { - - public Level2JaroWinklerTitle(Map params){ - super(params, new com.wcohen.ss.Level2JaroWinkler()); - } - - public Level2JaroWinklerTitle(final double w) { - super(w, new com.wcohen.ss.Level2JaroWinkler()); - } - - protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - final String ca = cleanup(a); - final String cb = cleanup(b); - - final boolean check = checkNumbers(ca, cb); - - if (check) return 0.5; - - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return ssalgo.score(cca, ccb); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java deleted file mode 100644 index 767c597..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java +++ /dev/null @@ -1,34 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("Level2Levenstein") -public class Level2Levenstein extends SecondStringDistanceAlgo { - - public Level2Levenstein(Map params){ - super(params, new com.wcohen.ss.Level2Levenstein()); - } - - public Level2Levenstein(double w) { - super(w, new com.wcohen.ss.Level2Levenstein()); - } - - protected Level2Levenstein(double w, AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } - -} diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java deleted file mode 100644 index d9ba5f7..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java +++ /dev/null @@ -1,34 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("Levenstein") -public class Levenstein extends SecondStringDistanceAlgo { - - public Levenstein(Map params){ - super(params, new com.wcohen.ss.Levenstein()); - } - - public Levenstein(double w) { - super(w, new com.wcohen.ss.Levenstein()); - } - - protected Levenstein(double w, AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java deleted file mode 100644 index ae0ef9d..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ /dev/null @@ -1,57 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.Map; - -@DistanceClass("LevensteinTitle") -public class LevensteinTitle extends SecondStringDistanceAlgo { - - private static final Log log = LogFactory.getLog(LevensteinTitle.class); - - public LevensteinTitle(Map params){ - super(params, new com.wcohen.ss.Levenstein()); - } - - public 
LevensteinTitle(final double w) { - super(w, new com.wcohen.ss.Levenstein()); - } - - protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - final String ca = cleanup(a); - final String cb = cleanup(b); - - final boolean check = checkNumbers(ca, cb); - - if (check) return 0.5; - - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); - } - - private double normalize(final double score, final int la, final int lb) { - return 1 - (Math.abs(score) / Math.max(la, lb)); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java deleted file mode 100644 index 6303f8e..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ /dev/null @@ -1,58 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -/** - * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. 
- */ -@DistanceClass("LevensteinTitleIgnoreVersion") -public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { - - public LevensteinTitleIgnoreVersion(Map params){ - super(params, new com.wcohen.ss.Levenstein()); - } - - public LevensteinTitleIgnoreVersion(final double w) { - super(w, new com.wcohen.ss.Levenstein()); - } - - protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) { - super(w, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - String ca = cleanup(a); - String cb = cleanup(b); - - ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim(); - cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim(); - - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); - - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); - } - - private double normalize(final double score, final int la, final int lb) { - return 1 - (Math.abs(score) / Math.max(la, lb)); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java deleted file mode 100644 index e794f02..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java +++ /dev/null @@ -1,39 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; - -import java.util.Map; - -@DistanceClass("MustBeDifferent") -public class MustBeDifferent extends SecondStringDistanceAlgo { - - public MustBeDifferent(Map params){ - super(params, new 
com.wcohen.ss.Levenstein()); - } - - public MustBeDifferent(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - @Override - public double distance(final String a, final String b) { - return !a.equals(b) ? 1.0 : 0; - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java deleted file mode 100644 index c1b1d72..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ /dev/null @@ -1,29 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import eu.dnetlib.pace.distance.DistanceAlgo; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.model.Field; - -import java.util.Map; - -/** - * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a - * NullDistanceAlgo. 
- */ -@DistanceClass("Null") -public class NullDistanceAlgo implements DistanceAlgo { - - public NullDistanceAlgo(Map params){ - } - - @Override - public double distance(Field a, Field b) { - return 0.0; - } - - @Override - public double getWeight() { - return 0.0; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java deleted file mode 100644 index d3aa58f..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java +++ /dev/null @@ -1,60 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; - -import java.util.Map; - -/** - * The Class SortedJaroWinkler. - */ -@DistanceClass("SortedJaroWinkler") -public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { - - public SortedJaroWinkler(Map params){ - super(params, new com.wcohen.ss.Levenstein()); - } - - /** - * Instantiates a new sorted jaro winkler. - * - * @param weight - * the weight - */ - public SortedJaroWinkler(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } - - /** - * Instantiates a new sorted jaro winkler. 
- * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() - */ - @Override - public double getWeight() { - return super.weight; - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) - */ - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java deleted file mode 100644 index 2523153..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java +++ /dev/null @@ -1,60 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; - -import java.util.Map; - -/** - * The Class SortedJaroWinkler. - */ -@DistanceClass("SortedLevel2JaroWinkler") -public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { - - /** - * Instantiates a new sorted jaro winkler. - * - * @param weight - * the weight - */ - public SortedLevel2JaroWinkler(final double weight) { - super(weight, new com.wcohen.ss.Level2JaroWinkler()); - } - - public SortedLevel2JaroWinkler(final Map params){ - super(params, new com.wcohen.ss.Level2JaroWinkler()); - } - - /** - * Instantiates a new sorted jaro winkler. 
- * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() - */ - @Override - public double getWeight() { - return super.weight; - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) - */ - @Override - protected double normalize(final double d) { - return d; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java deleted file mode 100644 index f727505..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java +++ /dev/null @@ -1,48 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Lists; -import com.wcohen.ss.AbstractStringDistance; - -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; - -/** - * For the rest of the fields delegate the compare measure to the second string library. - */ -public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { - - /** - * Instantiates a new sorted second string compare algo. 
- * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } - - protected SortedSecondStringDistanceAlgo(final Map params, final AbstractStringDistance ssalgo){ - super(params.get("weight").doubleValue(), ssalgo); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) - */ - @Override - protected List toList(final Field list) { - FieldList fl = (FieldList) list; - List values = Lists.newArrayList(fl.stringList()); - Collections.sort(values); - return values; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java deleted file mode 100644 index b788fad..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ /dev/null @@ -1,99 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; -import org.apache.commons.lang.StringUtils; - -import com.wcohen.ss.AbstractStringDistance; - -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; - -import java.util.Map; - -/** - * The Class SubStringLevenstein. - */ -@DistanceClass("SubStringLevenstein") -public class SubStringLevenstein extends SecondStringDistanceAlgo { - - /** The limit. */ - protected int limit; - - /** - * Instantiates a new sub string levenstein. - * - * @param w - * the w - */ - public SubStringLevenstein(final double w) { - super(w, new com.wcohen.ss.Levenstein()); - } - - public SubStringLevenstein(Map params){ - super(params, new com.wcohen.ss.Levenstein()); - this.limit = params.get("limit").intValue(); - } - - /** - * Instantiates a new sub string levenstein. 
- * - * @param w - * the w - * @param limit - * the limit - */ - public SubStringLevenstein(final double w, final int limit) { - super(w, new com.wcohen.ss.Levenstein()); - this.limit = limit; - } - - /** - * Instantiates a new sub string levenstein. - * - * @param w - * the w - * @param limit - * the limit - * @param ssalgo - * the ssalgo - */ - protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { - super(w, ssalgo); - this.limit = limit; - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) - */ - @Override - public double distance(final Field a, final Field b) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); - - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() - */ - @Override - public double getWeight() { - return super.weight; - } - - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) - */ - @Override - protected double normalize(final double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java deleted file mode 100644 index eacfdc0..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java +++ /dev/null @@ -1,59 +0,0 @@ -package eu.dnetlib.pace.distance.algo; - -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang.StringUtils; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Map; - 
-@DistanceClass("urlMatcher") -public class UrlMatcher extends Levenstein { - - private Map params; - - public UrlMatcher(Map params){ - super(params); - this.params = params; - } - - public UrlMatcher(double weight, Map params) { - super(weight); - this.params = params; - } - - public void setParams(Map params) { - this.params = params; - } - - @Override - public double distance(Field a, Field b) { - - final URL urlA = asUrl(getFirstValue(a)); - final URL urlB = asUrl(getFirstValue(b)); - - if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { - return 0.0; - } - - Double hostW = params.get("host").doubleValue(); - Double pathW = params.get("path").doubleValue(); - - if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { - return hostW * 0.5; - } - - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); - } - - private URL asUrl(final String value) { - try { - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java deleted file mode 100644 index d3fcee5..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java +++ /dev/null @@ -1,56 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import eu.dnetlib.pace.model.Field; - -/** - * Created by claudio on 09/03/16. 
- */ -public class ConditionEval { - - private String cond; - - private Field a; - - private Field b; - - private int result; - - public ConditionEval(final String cond, final Field a, final Field b, final int result) { - this.cond = cond; - this.a = a; - this.b = b; - this.result = result; - } - - public Field getA() { - return a; - } - - public void setA(final Field a) { - this.a = a; - } - - public Field getB() { - return b; - } - - public void setB(final Field b) { - this.b = b; - } - - public int getResult() { - return result; - } - - public void setResult(final int result) { - this.result = result; - } - - public String getCond() { - return cond; - } - - public void setCond(final String cond) { - this.cond = cond; - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java deleted file mode 100644 index a851596..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java +++ /dev/null @@ -1,38 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import java.util.HashMap; - -import com.google.common.base.Predicate; -import com.google.common.collect.Iterables; - -/** - * Created by claudio on 09/03/16. 
- */ -public class ConditionEvalMap extends HashMap { - - - public ConditionEvalMap mergeFrom(ConditionEvalMap map) { - putAll(map); - return this; - } - - public boolean anyNegative() { - return values().stream() - .allMatch(ec -> ec.getResult() < 0); - } - - public boolean isZero() { - return result() == 0; - } - - public int result() { - int res = 0; - for(ConditionEval ec : values()) { - final int verify = ec.getResult(); - if (verify < 0) return -1; - res += verify; - } - return res; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java deleted file mode 100644 index ef3c4da..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java +++ /dev/null @@ -1,56 +0,0 @@ -package eu.dnetlib.pace.distance.eval; - -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldDef; - -/** - * Created by claudio on 09/03/16. - */ -public class DistanceEval { - - private FieldDef fieldDef; - - private Field a; - - private Field b; - - private double distance = 0.0; - - public DistanceEval(final FieldDef fieldDef, final Field a, final Field b) { - this.fieldDef = fieldDef; - this.a = a; - this.b = b; - } - - public Field getA() { - return a; - } - - public void setA(final Field a) { - this.a = a; - } - - public Field getB() { - return b; - } - - public void setB(final Field b) { - this.b = b; - } - - public FieldDef getFieldDef() { - return fieldDef; - } - - public void setFieldDef(final FieldDef fieldDef) { - this.fieldDef = fieldDef; - } - - public double getDistance() { - return distance; - } - - public void setDistance(final double distance) { - this.distance = distance; - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java deleted file mode 100644 index 620984f..0000000 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ /dev/null @@ -1,50 +0,0 @@ -package eu.dnetlib.pace.model; - -import java.io.IOException; -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.pace.condition.*; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import eu.dnetlib.pace.util.PaceResolver; -import org.codehaus.jackson.map.ObjectMapper; - -public class CondDef implements Serializable { - - private String name; - - private List fields; - - public CondDef() {} - - public ConditionAlgo conditionAlgo(final List fields) { - return PaceConfig.resolver.getConditionAlgo(getName(), fields); - } - - public String getName() { - return name; - } - - public void setName(final String name) { - this.name = name; - } - - public List getFields() { - return fields; - } - - public void setFields(final List fields) { - this.fields = fields; - } - - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("unable to serialise " + this.getClass().getName(), e); - } - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index c76b637..12c578c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -3,15 +3,10 @@ package eu.dnetlib.pace.model; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.gson.Gson; -import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.distance.DistanceAlgo; -import eu.dnetlib.pace.util.PaceResolver; import java.io.Serializable; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** * The schema is composed by field definitions (FieldDef). 
Each field has a type, a name, and an associated compare algorithm. @@ -26,16 +21,6 @@ public class FieldDef implements Serializable { private Type type; - private boolean ignoreMissing; - - public boolean isIgnoreMissing() { - return ignoreMissing; - } - - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; - } - private boolean overrideMatch; /** @@ -48,8 +33,6 @@ public class FieldDef implements Serializable { */ private int length = -1; - private Map params; - public FieldDef() {} // def apply(s: String): Field[A] @@ -110,14 +93,6 @@ public class FieldDef implements Serializable { this.length = length; } - public Map getParams() { - return params; - } - - public void setParams(final Map params) { - this.params = params; - } - public void setName(String name) { this.name = name; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index 7e4b187..ff1cd97 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -1,8 +1,7 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; @@ -24,7 +23,7 @@ public class AlwaysMatch extends AbstractComparator { } @Override - public double distance(final String a, final String b) { + public double compare(final Field a, final Field b) { return 1.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java new file mode 100644 index 0000000..d3ada9a --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java @@ -0,0 +1,28 @@ +package eu.dnetlib.pace.tree; + +import java.util.List; +import java.util.Map; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +/** + * The Class ExactMatch. + * + * @author claudio + */ +@ComparatorClass("doiExactMatch") +public class DoiExactMatch extends ExactMatchIgnoreCase { + + public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + + public DoiExactMatch(final Map params) { + super(params); + } + + @Override + protected String getValue(final Field f) { + return super.getValue(f).replaceAll(PREFIX, ""); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java new file mode 100644 index 0000000..4132e1e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -0,0 +1,29 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + +@ComparatorClass("domainExactMatch") +public class DomainExactMatch extends ExactMatchIgnoreCase { + + public DomainExactMatch(final Map params) { + super(params); + } + + @Override + protected String getValue(final Field f) { + try { + return asUrl(super.getValue(f)).getHost(); + } catch (MalformedURLException e) { + return ""; + } + } + + private URL asUrl(final String value) throws MalformedURLException { + return new URL(value); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java new file mode 100644 index 0000000..d51a1bd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -0,0 +1,31 @@ +package 
eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +import java.util.Map; + +@ComparatorClass("exactMatchIgnoreCase") +public class ExactMatchIgnoreCase extends AbstractComparator { + + public ExactMatchIgnoreCase(Map params) { + super(params); + } + + @Override + public double compare(Field a, Field b) { + + final String fa = getValue(a); + final String fb = getValue(b); + + if (fa.isEmpty() || fb.isEmpty()) + return -1; + + return fa.equalsIgnoreCase(fb) ? 1 : 0; + } + + protected String getValue(final Field f) { + return getFirstValue(f); + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index f2a696b..76f1fd2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -1,12 +1,9 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.io.Serializable; import java.util.Map; //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index b6b593d..85f657f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -1,9 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import 
eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index 99d7a86..23c4cfa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java index d4d5b8f..7d6e33f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 41a38c1..6ddd2c9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import 
com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java index 1a598ec..9d80d70 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java index ada70fd..d8706c9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 45459f4..0d444e1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import 
eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; import org.apache.commons.logging.Log; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 342cee7..9413eb5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index d7251f1..4b2e707 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index 7c5c3f4..47d45a1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -1,7 +1,5 @@ package eu.dnetlib.pace.tree; -import 
eu.dnetlib.pace.distance.DistanceAlgo; -import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java new file mode 100644 index 0000000..519c28d --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java @@ -0,0 +1,63 @@ +package eu.dnetlib.pace.tree; + +import com.google.common.collect.Sets; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.adaptor.Pid; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +@ComparatorClass("pidMatch") +public class PidMatch extends AbstractComparator { + + private static final Log log = LogFactory.getLog(PidMatch.class); + private Map params; + + public PidMatch(final Map params) { + super(params); + this.params = params; + } + + @Override + public double compare(final Field a, final Field b) { + + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); + + final List pal = Pid.fromOafJson(sa); + final List pbl = Pid.fromOafJson(sb); + + if (pal.isEmpty() || pbl.isEmpty()) { + return -1; + } + + final Set pidAset = toHashSet(pal); + final Set pidBset = toHashSet(pbl); + + int incommon = Sets.intersection(pidAset, pidBset).size(); + int simDiff = Sets.symmetricDifference(pidAset, pidBset).size(); + + if (incommon + simDiff == 0) { + return 0.0; + } + + return (double)incommon / (incommon + simDiff) > params.getOrDefault("threshold", 0.5).doubleValue() ? 
1 : 0; + + } + + //lowercase + normalization of the pid before adding it to the set + private Set toHashSet(List pbl) { + + return pbl.stream() + .map(pid -> pid.getType() + normalizePid(pid.getValue())) + .collect(Collectors.toCollection(HashSet::new)); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java new file mode 100644 index 0000000..e79f918 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -0,0 +1,50 @@ +package eu.dnetlib.pace.tree; + +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Iterables; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +/** + * Returns true if the number of values in the fields is the same. + * + * @author claudio + */ +@ComparatorClass("sizeMatch") +public class SizeMatch extends AbstractComparator { + + /** + * Instantiates a new size match. + * + * @param params + * the parameters + */ + public SizeMatch(final Map params) { + super(params); + } + + @Override + public double compare(final Field a, final Field b) { + + if (a.isEmpty() || b.isEmpty()) + return -1; + + return Iterables.size(a) == Iterables.size(b) ? 1 : 0; + } + + /** + * Checks if is empty. 
+ * + * @param a + * the a + * @return true, if is empty + */ + protected boolean isEmpty(final Iterable a) { + return (a == null) || Iterables.isEmpty(a); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java index e66ad01..4b2eb83 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java index 9524140..f80a268 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -1,8 +1,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo; import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index d412f39..2ea7bd8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -2,8 +2,6 @@ package eu.dnetlib.pace.tree; import com.wcohen.ss.AbstractStringDistance; import 
eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java new file mode 100644 index 0000000..6643262 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -0,0 +1,39 @@ +package eu.dnetlib.pace.tree; + +import java.util.List; +import java.util.Map; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +/** + * Returns true if the titles in the given documents contains the same numbers, false otherwise. + * + * @author claudio + * + */ +@ComparatorClass("titleVersionMatch") +public class TitleVersionMatch extends AbstractComparator { + + public TitleVersionMatch(final Map params) { + super(params); + } + + @Override + public double compare(final Field a, final Field b) { + final String valueA = getFirstValue(a); + final String valueB = getFirstValue(b); + + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; + + return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 
1 : 0; + } + + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } + +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index d559e06..225f9ca 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.ComparatorClass; import org.apache.commons.lang.StringUtils; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java new file mode 100644 index 0000000..fbb0263 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -0,0 +1,50 @@ +package eu.dnetlib.pace.tree; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.lang.StringUtils; + +import java.util.Map; + +/** + * Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing. 
+ * + * @author claudio + */ +@ComparatorClass("yearMatch") +public class YearMatch extends AbstractComparator { + + private int limit = 4; + + public YearMatch(final Map params) { + super(params); + } + + @Override + public double compare(final Field a, final Field b) { + final String valueA = getNumbers(getFirstValue(a)); + final String valueB = getNumbers(getFirstValue(b)); + + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; + + final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); + final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); + + return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0; + } + + protected boolean checkLength(final String s) { + return s.length() == limit; + } + + protected String getFirstValue(final Field value) { + return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : ""; + } + + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index ef9abeb..f6ad137 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -19,6 +19,10 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement private Map params; + protected AbstractComparator(Map params) { + this.params = params; + } + protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ this.params = params; this.weight = 1.0; @@ -49,7 +53,9 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement * the d * @return the double */ - protected abstract double normalize(double d); + protected double normalize(double d) { + return d; + } /** * Distance. 
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index ea4ad7e..64ff4f3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -4,6 +4,10 @@ import eu.dnetlib.pace.model.Field; public interface Comparator { + /* + * return : -1 -> can't decide (missing field) + * >=0 -> similarity degree (depends on the algorithm) + * */ public double compare(Field a, Field b); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 4af9f81..7294536 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; @@ -36,12 +35,10 @@ public class TreeNodeDef implements Serializable { public TreeNodeDef() { } - public double evaluate(MapDocument doc1, MapDocument doc2) { + public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) { - DescriptiveStatistics stats = new DescriptiveStatistics(); - double sumWeights = 0.0; //for the weighted mean - - int missCount = 0; //counter for the number of misses + TreeNodeStats stats = new TreeNodeStats(); + stats.setFieldsCount(fields.size()); for (FieldConf fieldConf : fields) { @@ -49,40 +46,20 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), 
doc2.getFieldMap().get(fieldConf.getField())); - if (result >= 0.0) { //if the field is not missing - stats.addValue(weight * result); - sumWeights += weight; //sum weights, to be used in case of weighted mean - } - else { //if the field is missing - missCount += 1; - if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored - stats.addValue(weight * 0); - sumWeights += weight; + if (result == -1) { //if the field is missing + stats.incrementMissCount(); + if (!fieldConf.isIgnoreMissing()) { + stats.incrementWeightsSum(weight); } } + else { //if the field is not missing + stats.incrementScoresSum(weight * result); + stats.incrementWeightsSum(weight); + } + } - //global ignoremissing (if one of the field is missing, return undefined) - if (!ignoreMissing && missCount>0) { - return -1; - } - - switch (aggregation){ - - case AVG: - return stats.getMean(); - case SUM: - return stats.getSum(); - case MAX: - return stats.getMax(); - case MIN: - return stats.getMin(); - case WEIGHTED_MEAN: - return stats.getSum()/sumWeights; - default: - return 0.0; - } - + return stats; } private Comparator comparator(final FieldConf field){
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java
new file mode 100644
index 0000000..1ae8996
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java
@@ -0,0 +1,117 @@
+package eu.dnetlib.pace.tree.support;
+
+import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
+
+import java.io.Serializable;
+
+/**
+ * Accumulator for the field comparisons of a single tree node: the weighted scores, the sum of
+ * the weights taken into account and the number of comparisons that could not be decided.
+ */
+public class TreeNodeStats implements Serializable {
+
+    private DescriptiveStatistics stats;
+    private int missCount = 0;
+    private int fieldsCount = 0;
+    private double weightsSum = 0.0;
+
+    /** Creates an empty accumulator. */
+    public TreeNodeStats(){
+        this.stats = new DescriptiveStatistics();
+    }
+
+    /**
+     * Creates an accumulator with preset counters and no scores yet.
+     *
+     * @param missCount   number of undecided comparisons
+     * @param fieldsCount number of fields of the node
+     * @param weightsSum  sum of the weights accounted for so far
+     */
+    public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) {
+        // Delegate so that stats is never null; otherwise incrementScoresSum and getFinalScore
+        // would throw a NullPointerException on instances built through this constructor.
+        this();
+        this.missCount = missCount;
+        this.fieldsCount = fieldsCount;
+        this.weightsSum = weightsSum;
+    }
+
+    /** @return the weighted scores collected so far */
+    public DescriptiveStatistics getStats() {
+        return stats;
+    }
+
+    /** @param stats replacement score collection; must not be null for later scoring calls */
+    public void setStats(DescriptiveStatistics stats) {
+        this.stats = stats;
+    }
+
+    /** @return number of comparisons reported as missing */
+    public int getMissCount() {
+        return missCount;
+    }
+
+    public void setMissCount(int missCount) {
+        this.missCount = missCount;
+    }
+
+    /** @return number of fields of the node */
+    public int getFieldsCount() {
+        return fieldsCount;
+    }
+
+    public void setFieldsCount(int fields) {
+        this.fieldsCount = fields;
+    }
+
+    /** @return sum of the weights accounted for so far */
+    public double getWeightsSum() {
+        return weightsSum;
+    }
+
+    public void setWeightsSum(double weightsSum) {
+        this.weightsSum = weightsSum;
+    }
+
+    /** Adds {@code delta} to the sum of the weights. */
+    public void incrementWeightsSum(double delta){
+        this.weightsSum += delta;
+    }
+
+    /** Records one more undecided comparison. */
+    public void incrementMissCount(){
+        this.missCount += 1;
+    }
+
+    /** Records one (already weighted) score. */
+    public void incrementScoresSum(double delta){
+        this.stats.addValue(delta);
+    }
+
+    /**
+     * Aggregates the recorded scores.
+     *
+     * @param aggregation how to combine the scores
+     * @return mean, sum, max or min of the scores, or their sum divided by the weights sum for
+     *         WEIGHTED_MEAN; 0.0 for any other aggregation. WEIGHTED_MEAN is not a finite number
+     *         when the weights sum is 0.
+     */
+    public double getFinalScore(AggType aggregation){
+
+        switch (aggregation){
+            case AVG:
+                return stats.getMean();
+            case SUM:
+                return stats.getSum();
+            case MAX:
+                return stats.getMax();
+            case MIN:
+                return stats.getMin();
+            case WEIGHTED_MEAN:
+                return stats.getSum()/weightsSum;
+            default:
+                return 0.0;
+        }
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
new file mode 100644
index 0000000..794511a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
@@ -0,0 +1,76 @@
+package eu.dnetlib.pace.tree.support;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.*;
+import eu.dnetlib.pace.util.PaceException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Map;
+
+/**
+ * Decides whether two documents match by walking the decision tree of the configuration: each
+ * node scores the pair and the outcome selects the next node, until a final match type is reached.
+ */
+public class TreeProcessor {
+
+    private static final Log log = LogFactory.getLog(TreeProcessor.class);
+
+    private final Config config;
+
+    /** @param config configuration providing the decision tree */
+    public TreeProcessor(final Config config) {
+        this.config = config;
+    }
+
+    /**
+     * @param a first document
+     * @param b second document
+     * @return true only when the decision tree ends in {@code MatchType.MATCH}
+     */
+    public boolean compare(final MapDocument a, final MapDocument b) {
+
+        //evaluate the decision tree
+        return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
+    }
+
+    /**
+     * Walks the tree from the node named "start".
+     *
+     * @param doc1 first document
+     * @param doc2 second document
+     * @param decisionTree tree nodes by name
+     * @return the match type named by the last branch taken
+     * @throws PaceException if a branch points to a node that is not in the tree
+     */
+    public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
+
+        String current = "start";
+
+        // Loop while the name is not a final outcome; presumably MatchType.parse maps node names to UNDEFINED -- confirm.
+        while (MatchType.parse(current)==MatchType.UNDEFINED) {
+
+            TreeNodeDef currentNode = decisionTree.get(current);
+            //throw an exception if the node doesn't exist
+            if (currentNode == null)
+                throw new PaceException("The Tree Node doesn't exist: " + current);
+
+            TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
+
+            if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
+                // the node does not tolerate undecided comparisons
+                current = currentNode.getUndefined();
+            }
+            else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
+                current = currentNode.getPositive();
+            }
+            else {
+                current = currentNode.getNegative();
+            }
+
+        }
+
+        return MatchType.parse(current);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index b1348e1..2dfa9ae 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -5,7 +5,7 @@ import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; //import eu.dnetlib.pace.distance.PaceDocumentDistance; -import eu.dnetlib.pace.distance.PairwiseComparison; +import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; @@ -150,14 +150,10 @@ public class BlockProcessor { if (!idCurr.equals(idPivot) && 
(fieldCurr != null)) { - final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf); + final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); - emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context); + emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); -// final ScoreResult sr = similarity(algo, pivot, curr); -//// log.info(sr.toString()+"SCORE "+ sr.getScore()); -// emitOutput(sr, idPivot, idCurr, context); -// i++; } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index c3b16c8..8c670a8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -2,31 +2,21 @@ package eu.dnetlib.pace.util; import eu.dnetlib.pace.clustering.ClusteringClass; import eu.dnetlib.pace.clustering.ClusteringFunction; -import eu.dnetlib.pace.condition.ConditionAlgo; -import eu.dnetlib.pace.condition.ConditionClass; -import eu.dnetlib.pace.distance.DistanceAlgo; -import eu.dnetlib.pace.distance.DistanceClass; -import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; import org.reflections.Reflections; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; -import java.util.List; import java.util.Map; import java.util.stream.Collectors; public class PaceResolver implements Serializable { public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); - public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition"); - public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo"); public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); private final Map> 
clusteringFunctions; - private final Map> conditionAlgos; - private final Map> distanceAlgos; private final Map> comparators; public PaceResolver() { @@ -35,14 +25,6 @@ public class PaceResolver implements Serializable { .filter(ClusteringFunction.class::isAssignableFrom) .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class)cl)); - this.conditionAlgos = CONDITION_RESOLVER.getTypesAnnotatedWith(ConditionClass.class).stream() - .filter(ConditionAlgo.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class)cl)); - - this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream() - .filter(DistanceAlgo.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); - this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() .filter(Comparator.class::isAssignableFrom) .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class)cl)); @@ -56,22 +38,6 @@ public class PaceResolver implements Serializable { } } - public DistanceAlgo getDistanceAlgo(String name, Map params) throws PaceException { - try { - return distanceAlgos.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { - throw new PaceException(name + " not found ", e); - } - } - - public ConditionAlgo getConditionAlgo(String name, List fields) throws PaceException { - try { - return conditionAlgos.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { - throw new PaceException(name + " not found ", e); - } - } - public Comparator getComparator(String name, Map params) throws PaceException 
{ try { return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java similarity index 98% rename from dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java rename to dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java index ec55b87..1004203 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java @@ -1,7 +1,7 @@ -package eu.dnetlib.pace.distance; +package eu.dnetlib.pace.comparators; import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName; +import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; import org.junit.Before; import org.junit.Test; diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java deleted file mode 100644 index 57047d8..0000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/condition/ConditionTest.java +++ /dev/null @@ -1,7 +0,0 @@ -package eu.dnetlib.pace.condition; - -import eu.dnetlib.pace.AbstractPaceTest; - -public class ConditionTest extends AbstractPaceTest { - -} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index 53768c0..a5b28ce 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -12,16 +12,16 @@ }, "pace" : { "clustering" : [ - { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 
1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } ], "sufficientConditions" : [ - { "name" : "exactMatch", "fields" : [ "gridid" ] } + { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } ], "necessaryConditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, + { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } ], "model" : [ { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },