implementation of the conditions in tree nodes. get rid of the conditions part of the configuration

This commit is contained in:
miconis 2019-08-09 15:41:49 +02:00
parent 72b14ec36b
commit 4bcf353a72
101 changed files with 573 additions and 2303 deletions

View File

@ -7,7 +7,7 @@ The decision tree has to be defined into the json configuration. The field decis
<String nodeName, TreeNodeDef treeNodeDef>: the nodeName is the key, the treeNodeDef contains the definition of the node.
In particular the TreeNodeDef contains:
- List of FieldConf : list of fields processed by the node. Each field is associated to:
- List of FieldConf : list of fieldsCount processed by the node. Each field is associated to:
- field: name of the field
- comparator: name of the comparator to use for that particular field; it produces a similarity score, or -1 if the comparison is not possible (missing field or insufficient information).
> Each FieldConf contains a comparator name which has to be defined. It is sufficient to implement the Comparator interface, which exposes a "compare" method returning the similarity score. The new comparator must be annotated with @ComparatorClass("name"), specifying the name used by the FieldConf to look up the right comparator.
@ -19,7 +19,7 @@ if score>=th --- positive result
if score==-1 --- undefined result
if score<th --- negative result
```
- aggregation: defines the type of aggregation to apply to the similarity scores of the fields in the list of fields
- aggregation: defines the type of aggregation to apply to the similarity scores of the fields in the list of fields
- possible values: AVG(average), MAX, MIN, SUM
- e.g. the similarity scores are multiplied with the weight and then the defined aggregation is applied
- arcs: define the next node of the tree depending on the result

View File

@ -12,16 +12,16 @@
},
"pace": {
"clustering": [
{"name": "personClustering", "fields": ["fullname"], "params": {}}
{"name": "personClustering", "fieldsCount": ["fullname"], "params": {}}
],
"necessaryConditions": [],
"decisionTree": {
"start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
"start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
},
"model": [
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"},

View File

@ -12,16 +12,16 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },

View File

@ -17,22 +17,17 @@
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"necessaryConditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
],
"decisionTree" : {
"start": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "ignoreMissing":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "ignoreMissing":"true", "params":{}}], "threshold": 0.9, "aggregation": "WEIGHTED_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreMissing": "true"}
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreMissing": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "ignoreMissing":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreMissing": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "ignoreMissing":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "ignoreMissing":"true", "params":{}}], "threshold": 0.9, "aggregation": "WEIGHTED_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreMissing": "true"}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid", "ignoreMissing" : "true" },
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value", "ignoreMissing" : "true" },
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7}, "ignoreMissing" : "false" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 }, "ignoreMissing" : "true" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value", "ignoreMissing" : "true" }
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
],
"blacklists" : {
"legalname" : []

View File

@ -12,12 +12,12 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },

View File

@ -12,17 +12,17 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"sufficientConditions":[
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },

View File

@ -12,11 +12,11 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },

View File

@ -12,13 +12,13 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
],
"decisionTree": {},
"model" : [

View File

@ -12,15 +12,15 @@
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
{ "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] } ,
{ "name" : "pidMatch", "fields" : [ "pid" ] }
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } ,
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },

View File

@ -14,15 +14,15 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fieldsCount" : [ "doi", "url" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fields" : [ "doi", "resulttype", "url" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "doi", "resulttype", "url" ] }
],
"necessaryConditions" : [
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },

View File

@ -14,17 +14,17 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } },
{ "name" : "urlclustering", "fields": [ "url" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fieldsCount" : [ "doi" ], "params" : { } },
{ "name" : "urlclustering", "fieldsCount": [ "url" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "doiExactMatch", "fields": [ "doi" ] },
{ "name" : "exactMatch", "fields" : [ "url", "documentationUrl" ] }
{ "name" : "doiExactMatch", "fieldsCount": [ "doi" ] },
{ "name" : "exactMatch", "fieldsCount" : [ "url", "documentationUrl" ] }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : ["resulttype"] }
{ "name" : "exactMatch", "fieldsCount" : ["resulttype"] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },

View File

@ -1,4 +1,4 @@
package eu.dnetlib.pace.distance;
package eu.dnetlib.pace.comparators;
/*
* Diff Match and Patch

View File

@ -12,16 +12,16 @@
},
"pace": {
"clustering": [
{"name": "personClustering", "fields": ["fullname"], "params": {}}
{"name": "personClustering", "fieldsCount": ["fullname"], "params": {}}
],
"necessaryConditions": [],
"decisionTree": {
"start": {"fields": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fields": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fields": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fields": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fields": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fields": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
"start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
},
"model": [
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"},

View File

@ -12,15 +12,15 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "mustBeDifferent", "fields" : [ "gridid" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "mustBeDifferent", "fieldsCount" : [ "gridid" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },

View File

@ -12,11 +12,11 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] }
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },

View File

@ -12,8 +12,8 @@
},
"pace" : {
"necessaryConditions" : [
{ "name" : "sizeMatch", "fields" : [ "authors" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },

View File

@ -12,15 +12,15 @@
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
{ "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] } ,
{ "name" : "pidMatch", "fields" : [ "pid" ] }
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } ,
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },

View File

@ -12,11 +12,11 @@
},
"pace" : {
"sufficientConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },

View File

@ -13,16 +13,16 @@
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fieldsCount" : [ "doi" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"necessaryConditions" : [
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },

View File

@ -15,10 +15,7 @@ import org.apache.commons.lang.StringUtils;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Set of common functions

View File

@ -1,55 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Base class for conditions that are checked field by field on a pair of documents.
 *
 * <p>A condition is built from its name and the list of field definitions it applies to.
 * Subclasses supply the per-field check through
 * {@link #verify(FieldDef, Field, Field)}; this class drives the iteration over the
 * configured fields and collects one evaluation per field name.
 *
 * @author claudio
 */
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {

    /** Name of the condition, passed on to every evaluation produced here. */
    protected String cond;

    /** Definitions of the fields this condition is evaluated on. */
    protected List<FieldDef> fields;

    /**
     * @param cond   the condition name
     * @param fields the definitions of the fields to evaluate
     */
    public AbstractCondition(final String cond, final List<FieldDef> fields) {
        this.cond = cond;
        this.fields = fields;
    }

    /**
     * Evaluates the condition on one field of the two documents.
     *
     * @param fd the definition of the field under evaluation
     * @param a  the field value(s) taken from the first document
     * @param b  the field value(s) taken from the second document
     * @return the evaluation for this field
     */
    protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);

    /**
     * Evaluates the condition on every configured field of the two documents.
     *
     * <p>When a field is not flagged as "ignore missing" and either side is empty, the
     * evaluation is recorded with result {@code -1} without calling the subclass check;
     * in every other case the subclass check decides the result.
     *
     * @param a the first document
     * @param b the second document
     * @return the evaluations, keyed by field name
     */
    @Override
    public ConditionEvalMap verify(final Document a, final Document b) {
        final ConditionEvalMap outcome = new ConditionEvalMap();

        for (final FieldDef def : getFields()) {
            final String fieldName = def.getName();
            final Field left = a.values(fieldName);
            final Field right = b.values(fieldName);

            // Emptiness is only inspected when missing values must not be ignored.
            final boolean missing = !def.isIgnoreMissing() && (left.isEmpty() || right.isEmpty());

            final ConditionEval eval = missing
                    ? new ConditionEval(cond, left, right, -1)
                    : verify(def, left, right);
            outcome.put(fieldName, eval);
        }

        return outcome;
    }

    /**
     * @return the field definitions this condition is evaluated on
     */
    public List<FieldDef> getFields() {
        return fields;
    }
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Condition whose per-field check always yields the result {@code 1}, regardless of the
 * values being compared.
 *
 * @author claudio
 */
@ConditionClass("alwaystruecondition")
public class AlwaysTrueCondition extends AbstractCondition {

    /**
     * @param cond   the condition name
     * @param fields the definitions of the fields to evaluate
     */
    public AlwaysTrueCondition(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    /**
     * Produces an evaluation with result {@code 1} for the given pair of fields.
     *
     * @param fd the definition of the field under evaluation (not inspected)
     * @param a  the field from the first document
     * @param b  the field from the second document
     * @return an evaluation carrying the condition name, both fields and the result {@code 1}
     */
    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final int satisfied = 1;
        return new ConditionEval(cond, a, b, satisfied);
    }
}

View File

@ -1,27 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Contract for general conditions that can be evaluated on a pair of Documents.
 *
 * @author claudio
 */
public interface ConditionAlgo {

    /**
     * Evaluates the condition on two documents.
     *
     * <p>NOTE(review): the previous description of this method spoke of an int outcome
     * (0 when the condition cannot be verified, positive when verified, negative when not
     * verified), while the declared return type is a map of evaluations. Presumably that
     * convention applies to the result stored in each evaluation - confirm against the
     * implementations.
     *
     * @param a the Document a
     * @param b the Document b
     * @return the evaluations produced for the two documents
     */
    ConditionEvalMap verify(Document a, Document b);
}

View File

@ -1,13 +0,0 @@
package eu.dnetlib.pace.condition;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Type-level marker, retained at runtime, that attaches a name to a condition class.
 *
 * <p>Presumably the name is what the configuration uses to refer to the annotated
 * condition - verify against the code that resolves conditions.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface ConditionClass {

    /**
     * @return the name associated with the annotated class
     */
    String value();
}

View File

@ -1,27 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Case-insensitive exact match on DOI values, comparing them after the
 * {@code http://dx.doi.org/} and {@code doi:} prefixes have been removed.
 *
 * @author claudio
 */
@ConditionClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {

    /** Single source for the prefix expression, shared by the public field and the compiled pattern. */
    private static final String PREFIX_REGEX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";

    /**
     * Compiled once: {@code String.replaceAll} would recompile the expression on every call,
     * and this method runs for every compared value.
     */
    private static final java.util.regex.Pattern PREFIX_PATTERN = java.util.regex.Pattern.compile(PREFIX_REGEX);

    /** Regular expression matching the DOI prefixes that are removed before comparison. */
    public final String PREFIX = PREFIX_REGEX;

    /**
     * @param cond   the condition name
     * @param fields the definitions of the fields to evaluate
     */
    public DoiExactMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    /**
     * Returns the value supplied by the parent class with every occurrence of the DOI
     * prefixes removed.
     *
     * @param f the field to read
     * @return the parent's value with all matches of {@link #PREFIX} deleted
     */
    @Override
    protected String getValue(final Field f) {
        // Same result as value.replaceAll(PREFIX, ""), without recompiling the regex.
        return PREFIX_PATTERN.matcher(super.getValue(f)).replaceAll("");
    }
}

View File

@ -1,32 +0,0 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
/**
 * Case-insensitive exact match on the host part of URL-valued fields.
 */
@ConditionClass("DomainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase {

    /**
     * @param cond   the condition name
     * @param fields the definitions of the fields to evaluate
     */
    public DomainExactMatch(String cond, List<FieldDef> fields) {
        super(cond, fields);
    }

    /**
     * Returns the host of the URL held by the field.
     *
     * @param f the field to read
     * @return the host component of the value supplied by the parent class
     * @throws IllegalStateException if the value cannot be parsed as a URL
     */
    @Override
    protected String getValue(final Field f) {
        return asUrl(super.getValue(f)).getHost();
    }

    /**
     * Parses the given value as a URL.
     *
     * @param value the textual URL; an empty value is replaced by the placeholder {@code http://}
     * @return the parsed URL
     * @throws IllegalStateException if the value is not a well-formed URL; the original
     *         {@link MalformedURLException} is attached as the cause
     */
    private URL asUrl(final String value) {
        try {
            if (value.isEmpty()) {
                // Placeholder so that empty values still yield a URL object
                // (its host is expected to be empty - TODO confirm).
                return new URL("http://");
            }
            return new URL(value);
        } catch (MalformedURLException e) {
            // should not happen as checked by pace typing; keep the cause for diagnosis
            throw new IllegalStateException("invalid URL: " + value, e);
        }
    }
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
import org.apache.commons.lang.StringUtils;
/**
 * Condition checking that the compared values of a field are exactly equal
 * (case-sensitive).
 *
 * @author claudio
 */
@ConditionClass("exactMatch")
public class ExactMatch extends AbstractCondition {

    /**
     * @param cond   the condition name
     * @param fields the definitions of the fields to evaluate
     */
    public ExactMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    /**
     * Compares the values extracted from the two fields.
     *
     * @param fd the definition of the field under evaluation (not inspected)
     * @param a  the field from the first document
     * @param b  the field from the second document
     * @return an evaluation whose result is {@code 0} when either value is blank,
     *         {@code 1} when the values are equal and {@code -1} otherwise
     */
    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final String left = getValue(a);
        final String right = getValue(b);

        // A blank on either side makes the comparison undefined.
        if (StringUtils.isBlank(left) || StringUtils.isBlank(right)) {
            return new ConditionEval(cond, a, b, 0);
        }

        final int outcome = left.equals(right) ? 1 : -1;
        return new ConditionEval(cond, a, b, outcome);
    }

    /**
     * Extracts the string that takes part in the comparison; subclasses may override it
     * to normalise the value first.
     *
     * @param f the field to read
     * @return the result of {@code getFirstValue(f)}
     */
    protected String getValue(final Field f) {
        return getFirstValue(f);
    }
}

View File

@ -1,43 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
import org.apache.commons.lang.StringUtils;
/**
 * Condition comparing the values of two fields for equality, ignoring letter case.
 * The evaluation carries 1 when the values are equal, -1 when they differ and 0
 * (undefined) when at least one of them is blank.
 *
 * @author claudio
 */
@ConditionClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractCondition {

    /**
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the field definitions, forwarded to the parent class
     */
    public ExactMatchIgnoreCase(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final String left = getValue(a);
        final String right = getValue(b);

        // a blank value on either side makes the comparison undefined
        if (StringUtils.isBlank(left) || StringUtils.isBlank(right)) {
            return new ConditionEval(cond, a, b, 0);
        }

        final int outcome = left.equalsIgnoreCase(right) ? 1 : -1;
        return new ConditionEval(cond, a, b, outcome);
    }

    /**
     * Extracts the value to compare from a field; subclasses override it to
     * normalise the value first.
     */
    protected String getValue(final Field f) {
        return getFirstValue(f);
    }
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import com.google.common.collect.Iterables;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Condition satisfied when the first values of the two fields are different: the
 * evaluation carries 1 when they differ and -1 when they are equal.
 *
 * @author claudio
 */
@ConditionClass("mustBeDifferent")
public class MustBeDifferent extends AbstractCondition {

    /**
     * Instantiates a new must-be-different condition.
     *
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the fields
     */
    public MustBeDifferent(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    /*
     * (non-Javadoc)
     *
     * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List)
     */
    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final String fa = getValue(a);
        final String fb = getValue(b);

        // Null-safe equality: the previous fa.equals(fb) threw a NullPointerException when
        // the first value was missing. Two missing values are equal, one missing is different.
        final boolean same = (fa == null) ? (fb == null) : fa.equals(fb);

        return new ConditionEval(cond, a, b, same ? -1 : 1);
    }

    /**
     * Extracts the value to compare from a field.
     */
    protected String getValue(final Field f) {
        return getFirstValue(f);
    }

    /**
     * Checks if is empty.
     *
     * @param a the a
     * @return true, if the iterable is null or has no elements
     */
    protected boolean isEmpty(final Iterable<?> a) {
        return (a == null) || Iterables.isEmpty(a);
    }
}

View File

@ -1,63 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.adaptor.Pid;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Condition comparing two lists of persistent identifiers (pids).
 * Both lists are parsed with {@code Pid.fromOafJson} and turned into sets of
 * "type + normalised value" strings. The evaluation carries 1 when the shared pids
 * are more than half of all the distinct pids, -1 otherwise, and 0 (undefined)
 * when neither side has any pid.
 *
 * @author claudio
 */
@ConditionClass("pidMatch")
public class PidMatch extends AbstractCondition {

    private static final Log log = LogFactory.getLog(PidMatch.class);

    /**
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the field definitions, forwarded to the parent class
     */
    public PidMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {

        final List<String> sa = ((FieldList) a).stringList();
        final List<String> sb = ((FieldList) b).stringList();

        final List<Pid> pal = Pid.fromOafJson(sa);
        final List<Pid> pbl = Pid.fromOafJson(sb);

        final Set<String> pidAset = toHashSet(pal);
        final Set<String> pidBset = toHashSet(pbl);

        final int incommon = Sets.intersection(pidAset, pidBset).size();
        final int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
        final int total = incommon + simDiff;

        // no pid on either side: nothing to compare, undefined result
        if (total == 0) {
            return new ConditionEval(cond, a, b, 0);
        }

        // The ratio must be computed in floating point: with int operands the division
        // truncated to 0 unless every pid was shared, so "> 0.5" never saw a real fraction.
        final double ratio = (double) incommon / total;

        return new ConditionEval(cond, a, b, ratio > 0.5 ? 1 : -1);
    }

    // each pid becomes "type + normalizePid(value)" before being added to the set
    private Set<String> toHashSet(List<Pid> pbl) {
        return pbl.stream()
                .map(pid -> pid.getType() + normalizePid(pid.getValue()))
                .collect(Collectors.toCollection(HashSet::new));
    }
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import com.google.common.collect.Iterables;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Condition satisfied when the two fields hold the same number of values: the
 * evaluation carries 1 when the sizes are equal and -1 otherwise.
 *
 * @author claudio
 */
@ConditionClass("sizeMatch")
public class SizeMatch extends AbstractCondition {

    /**
     * Instantiates a new size match.
     *
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the fields
     */
    public SizeMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    /*
     * (non-Javadoc)
     *
     * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List)
     */
    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        // empty fields get no special treatment: only the two element counts are compared
        final int sizeA = Iterables.size(a);
        final int sizeB = Iterables.size(b);
        final int outcome = (sizeA == sizeB) ? 1 : -1;

        return new ConditionEval(cond, a, b, outcome);
    }

    /**
     * Checks if is empty.
     *
     * @param a the a
     * @return true, if the iterable is null or has no elements
     */
    protected boolean isEmpty(final Iterable<?> a) {
        return (a == null) || Iterables.isEmpty(a);
    }
}

View File

@ -1,35 +0,0 @@
package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Condition on the numbers contained in two titles. The evaluation carries 1 when both
 * first values are non-null and {@code checkNumbers} returns false for them, -1 in every
 * other case.
 * NOTE(review): {@code checkNumbers} is defined outside this class; it presumably reports
 * that the numbers in the two titles differ. Confirm against its implementation.
 *
 * @author claudio
 *
 */
@ConditionClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractCondition {

    /**
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the field definitions, forwarded to the parent class
     */
    public TitleVersionMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final String titleA = getFirstValue(a);
        final String titleB = getFirstValue(b);

        // checkNumbers is only consulted when both titles are present
        final boolean bothPresent = notNull(titleA) && notNull(titleB);
        final boolean satisfied = bothPresent && !checkNumbers(titleA, titleB);

        return new ConditionEval(cond, a, b, satisfied ? 1 : -1);
    }

    @Override
    public String toString() {
        final String simpleName = getClass().getSimpleName();
        return simpleName + ":" + super.toString();
    }
}

View File

@ -1,60 +0,0 @@
package eu.dnetlib.pace.condition;
import java.time.Year;
import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Condition on the year of a date field. Only the leading {@code limit} (4) characters of
 * each value are considered, reduced to their digits by {@code getNumbers}.
 * The evaluation carries 1 when both years have exactly {@code limit} characters and are
 * equal, or when at least one of the two years is empty (missing); it carries -1 otherwise.
 *
 * @author claudio
 */
@ConditionClass("yearMatch")
public class YearMatch extends AbstractCondition {

    /** Number of leading characters taken from the date, and expected length of a year. */
    private int limit = 4;

    /**
     * @param cond   the condition identifier, forwarded to the parent class
     * @param fields the field definitions, forwarded to the parent class
     */
    public YearMatch(final String cond, final List<FieldDef> fields) {
        super(cond, fields);
    }

    @Override
    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
        final String yearA = getNumbers(getFirstValue(a));
        final String yearB = getNumbers(getFirstValue(b));

        final boolean lengthMatch = checkLength(yearA) && checkLength(yearB);
        final boolean oneMissing = yearA.isEmpty() || yearB.isEmpty();

        // a missing year never prevents the match; otherwise both years must be well-formed and equal
        final boolean sameYear = lengthMatch && yearA.equals(yearB);
        final int outcome = (sameYear || oneMissing) ? 1 : -1;

        return new ConditionEval(cond, a, b, outcome);
    }

    /**
     * @return true if the string has exactly {@code limit} characters
     */
    protected boolean checkLength(final String s) {
        return s.length() == limit;
    }

    /**
     * @return the first {@code limit} characters of the field's string value, or the empty
     *         string when the field is null or empty
     */
    protected String getFirstValue(final Field value) {
        if ((value != null) && !value.isEmpty()) {
            return StringUtils.left(value.stringValue(), limit);
        }
        return "";
    }

    @Override
    public String toString() {
        final String simpleName = getClass().getSimpleName();
        return simpleName + ":" + super.toString();
    }
}

View File

@ -22,7 +22,11 @@ public interface Config {
*/
public List<FieldDef> model();
/**
 * Decision tree definition.
 *
 * @return the map representing the decision tree: node name as key, node definition as value
 */
public Map<String, TreeNodeDef> decisionTree();
/**
@ -32,20 +36,6 @@ public interface Config {
*/
public Map<String, FieldDef> modelMap();
/**
 * Strict pre-condition definitions (sufficient conditions).
 *
 * @return the list of sufficient conditions
 */
public List<ConditionAlgo> sufficientConditions();
/**
 * Pre-condition definitions (necessary conditions).
 *
 * @return the list of necessary conditions
 */
public List<ConditionAlgo> necessaryConditions();
/**
* Clusterings.
*

View File

@ -130,16 +130,6 @@ public class DedupConfig implements Config, Serializable {
return getPace().getModelMap();
}
// Sufficient conditions are the "strict" condition algorithms of the pace configuration.
@Override
public List<ConditionAlgo> sufficientConditions() {
return getPace().getStrictConditionAlgos();
}
// Necessary conditions are the (non-strict) condition algorithms of the pace configuration.
@Override
public List<ConditionAlgo> necessaryConditions() {
return getPace().getConditionAlgos();
}
@Override
public List<ClusteringDef> clusterings() {
return getPace().getClustering();

View File

@ -1,27 +1,20 @@
package eu.dnetlib.pace.config;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class PaceConfig implements Serializable {
private List<FieldDef> model;
private List<CondDef> sufficientConditions;
private List<CondDef> necessaryConditions;
private List<ClusteringDef> clustering;
private Map<String, TreeNodeDef> decisionTree;
@ -50,32 +43,6 @@ public class PaceConfig implements Serializable {
this.model = model;
}
/** Returns the definitions of the sufficient conditions. */
public List<CondDef> getSufficientConditions() {
    return this.sufficientConditions;
}
/** Replaces the definitions of the sufficient conditions. */
public void setSufficientConditions(final List<CondDef> sufficientConditions) {
    final List<CondDef> replacement = sufficientConditions;
    this.sufficientConditions = replacement;
}
/** Returns the definitions of the necessary conditions. */
public List<CondDef> getNecessaryConditions() {
    return this.necessaryConditions;
}
/** Builds the condition algorithms for the necessary conditions (marked {@code @JsonIgnore}). */
@JsonIgnore
public List<ConditionAlgo> getConditionAlgos() {
    final List<CondDef> necessary = getNecessaryConditions();
    return asConditionAlgos(necessary);
}
/** Builds the condition algorithms for the sufficient conditions (marked {@code @JsonIgnore}). */
@JsonIgnore
public List<ConditionAlgo> getStrictConditionAlgos() {
    final List<CondDef> sufficient = getSufficientConditions();
    return asConditionAlgos(sufficient);
}
/** Replaces the definitions of the necessary conditions. */
public void setNecessaryConditions(final List<CondDef> necessaryConditions) {
    final List<CondDef> replacement = necessaryConditions;
    this.necessaryConditions = replacement;
}
/** Returns the clustering definitions. */
public List<ClusteringDef> getClustering() {
    return this.clustering;
}
@ -108,18 +75,4 @@ public class PaceConfig implements Serializable {
this.modelMap = modelMap;
}
// helper: turns each condition definition into its algorithm, handing it the model
// fields whose name is listed by that definition; a null or empty input gives an empty list
private List<ConditionAlgo> asConditionAlgos(final List<CondDef> defs) {
    final List<ConditionAlgo> result = Lists.newArrayList();

    if (!CollectionUtils.isEmpty(defs)) {
        for (final CondDef def : defs) {
            final List<FieldDef> selected = Lists.newArrayList();
            for (final FieldDef candidate : getModel()) {
                if (def.getFields().contains(candidate.getName())) {
                    selected.add(candidate);
                }
            }
            result.add(def.conditionAlgo(selected));
        }
    }

    return result;
}
}

View File

@ -1,15 +0,0 @@
//package eu.dnetlib.pace.distance;
//
//import eu.dnetlib.pace.config.Config;
//import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.model.Document;
//
//public abstract class AbstractDistance<A> implements Distance<A> {
//
// protected abstract Document toDocument(A a);
//
// @Override
// public boolean between(final A a, final A b, final Config config) {
// return new PairwiseComparison(config).compare(toDocument(a), toDocument(b));
// }
//}

View File

@ -1,26 +0,0 @@
package eu.dnetlib.pace.distance;
import java.util.Map;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
/**
 * Base class for distance algorithms configured through a map of string parameters
 * and a weight.
 */
public abstract class ConfigurableDistanceAlgo extends AbstractPaceFunctions {

    /** The configuration parameters, as received by the constructor. */
    private final Map<String, String> params;

    /** The weight of the algorithm (the identifier keeps its historical spelling). */
    private final double weigth;

    /**
     * @param params the configuration parameters
     * @param weight the weight of the algorithm
     */
    public ConfigurableDistanceAlgo(final Map<String, String> params, final double weight) {
        this.weigth = weight;
        this.params = params;
    }

    /** @return the configuration parameters */
    public Map<String, String> getParams() {
        return this.params;
    }

    /** @return the weight of the algorithm */
    public double getWeigth() {
        return this.weigth;
    }
}

View File

@ -1,8 +0,0 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.config.Config;
/**
 * Binary relation between two objects of type {@code A}, evaluated under a given configuration.
 *
 * @param <A> the type of the compared objects
 */
public interface Distance<A> {

    /**
     * @param a      the first object
     * @param b      the second object
     * @param config the configuration driving the comparison
     * @return the boolean outcome of the comparison
     */
    boolean between(A a, A b, Config config);
}

View File

@ -1,17 +0,0 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.model.Field;
import java.util.Map;
/**
 * Each field is configured with a distance algorithm, which knows how to compute the
 * distance (0-1) between the values of that field in two objects.
 */
public interface DistanceAlgo {

    /**
     * @param a the field of the first object
     * @param b the field of the second object
     * @return the distance between the two fields
     */
    double distance(Field a, Field b);

    /**
     * @return the weight of this algorithm
     */
    double getWeight();
}

View File

@ -1,13 +0,0 @@
package eu.dnetlib.pace.distance;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Type-level annotation, retained at runtime, that associates a name with the annotated class.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface DistanceClass {

    /**
     * @return the name associated with the annotated class
     */
    String value();
}

View File

@ -1,12 +0,0 @@
//package eu.dnetlib.pace.distance;
//
//import eu.dnetlib.pace.model.Document;
//
//public class PaceDocumentDistance extends AbstractDistance<Document> {
//
// @Override
// protected Document toDocument(Document a) {
// return a;
// }
//
//}

View File

@ -1,125 +0,0 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.tree.support.MatchType;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List;
import java.util.Map;
/**
* The compare between two documents is given by the weighted mean of the field distances
*/
public class PairwiseComparison {
private static final Log log = LogFactory.getLog(PairwiseComparison.class);
private Config config;
public PairwiseComparison(final Config config) {
this.config = config;
}
public boolean compare(final MapDocument a, final MapDocument b) {
//verify sufficientConditions
if (verify(a, b, config.sufficientConditions()).result() > 0)
return true;
//verify necessaryConditions
if (verify(a, b, config.necessaryConditions()).result() < 0)
return false;
//evaluate the decision tree
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
}
private ConditionEvalMap verify(final Document a, final Document b, final List<ConditionAlgo> conditions) {
final ConditionEvalMap res = new ConditionEvalMap();
for (final ConditionAlgo cd : conditions) {
final ConditionEvalMap map = cd.verify(a, b);
res.mergeFrom(map);
// commented out shortcuts
/*
if (map.anyNegative()) {
return res;
}
*/
//if (strict && (res < 0)) return -1;
//cond += verify;
}
return res;
}
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
String current = "start";
double similarity;
while (MatchType.parse(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
similarity = currentNode.evaluate(doc1, doc2);
if (similarity == -1) {
current = currentNode.getUndefined();
}
else if (similarity>=currentNode.getThreshold()){
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
}
}
return MatchType.parse(current);
}
// private Field getValue(final Document d, final FieldDef fd) {
// final Field v = d.values(fd.getName());
// if (fd.getLength() > 0) {
//
// if (v instanceof FieldValueImpl) {
// ((FieldValueImpl) v).setValue(StringUtils.substring(v.stringValue(), 0, fd.getLength()));
// } else if (v instanceof FieldListImpl) {
// List<String> strings = ((FieldListImpl) v).stringList();
// strings = strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .collect(Collectors.toList());
// ((FieldListImpl) v).clear();
// ((FieldListImpl) v).addAll(strings.stream()
// .limit(fd.getSize() > 0 ? fd.getSize() : strings.size())
// .map(s -> StringUtils.substring(s, 0, fd.getLength()))
// .map(s -> new FieldValueImpl(v.getType(), v.getName(), s))
// .collect(Collectors.toList()));
// }
// }
//
// return v;
// }
//
// private double sumWeights(final Collection<FieldDef> fields) {
// double sum = 0.0;
// for (final FieldDef fd : fields) {
// sum += fd.getWeight();
// }
// return sum;
// }
}

View File

@ -1,114 +0,0 @@
package eu.dnetlib.pace.distance;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
/**
 * Base class of the distance algorithms that delegate the score computation to an
 * {@link AbstractStringDistance} of the SecondString library.
 */
public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {

    // Legacy note from the Scala original: a map of digit aliases translated superscript
    // digits (e.g. '¹', '²', '³', '⁴'..'⁹') to the plain digits '1'..'9'.
    // NOTE(review): the first range of that comment was corrupted by an encoding problem;
    // presumably it listed the subscript digits.

    /** The SecondString algorithm computing the raw score. */
    protected AbstractStringDistance ssalgo;

    /** The weight. */
    protected double weight = 0.0;

    private Map<String, Number> params;

    /**
     * Instantiates the algorithm from a parameter map; the weight is read from the
     * {@code "weight"} entry.
     *
     * @param params the parameters
     * @param ssalgo the SecondString algorithm
     */
    protected SecondStringDistanceAlgo(Map<String, Number> params, final AbstractStringDistance ssalgo) {
        this.params = params;
        final Number configuredWeight = params.get("weight");
        this.weight = configuredWeight.doubleValue();
        this.ssalgo = ssalgo;
    }

    /**
     * Instantiates the algorithm with an explicit weight.
     *
     * @param weight the weight
     * @param ssalgo the SecondString algorithm
     */
    protected SecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
        this.weight = weight;
        this.ssalgo = ssalgo;
    }

    /**
     * Instantiates the algorithm leaving the weight at its default (0.0).
     *
     * @param ssalgo the SecondString algorithm
     */
    protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo) {
        this.ssalgo = ssalgo;
    }

    /**
     * Maps the raw SecondString score to the value returned by this algorithm.
     *
     * @param d the raw score
     * @return the normalized score
     */
    protected abstract double normalize(double d);

    /**
     * Scores two strings with the SecondString algorithm and normalizes the result.
     *
     * @param a the first string
     * @param b the second string
     * @return the normalized score
     */
    public double distance(final String a, final String b) {
        final double rawScore = ssalgo.score(a, b);
        return normalize(rawScore);
    }

    /**
     * Scores two lists of strings by concatenating each of them and comparing the results.
     *
     * @param a the first list
     * @param b the second list
     * @return the normalized score
     */
    protected double distance(final List<String> a, final List<String> b) {
        final String joinedA = concat(a);
        final String joinedB = concat(b);
        return distance(joinedA, joinedB);
    }

    /**
     * Scores two fields: both must be of type String or both of type List.
     *
     * @throws IllegalArgumentException for any other combination of types
     */
    @Override
    public double distance(final Field a, final Field b) {
        if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) {
            return distance(a.stringValue(), b.stringValue());
        }
        if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) {
            return distance(toList(a), toList(b));
        }

        throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
    }

    /**
     * Returns the string values of a list field.
     *
     * @param list the field, which must be a {@link FieldList}
     * @return the values as strings
     */
    protected List<String> toList(final Field list) {
        final FieldList values = (FieldList) list;
        return values.stringList();
    }
}

View File

@ -1,39 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm that scores every pair of strings 1.0, whatever their content.
 */
@DistanceClass("AlwaysMatch")
public class AlwaysMatch extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public AlwaysMatch(final Map<String, Number> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /** @param weight the weight */
    public AlwaysMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm handed to the parent class
     */
    protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return always 1.0 */
    @Override
    public double distance(final String a, final String b) {
        return 1.0;
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,39 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm scoring 1.0 when the two strings are equal (case-sensitive) and 0 otherwise.
 */
@DistanceClass("ExactMatch")
public class ExactMatch extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public ExactMatch(Map<String, Number> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /** @param weight the weight */
    public ExactMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm handed to the parent class
     */
    protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return 1.0 if {@code a} equals {@code b}, 0 otherwise */
    @Override
    public double distance(final String a, final String b) {
        if (a.equals(b)) {
            return 1.0;
        }
        return 0.0;
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,44 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.io.Serializable;
import java.util.Map;
/**
 * Distance algorithm applying the SecondString JaroWinkler score to the two strings after
 * {@code cleanup}.
 */
@DistanceClass("JaroWinkler")
public class JaroWinkler extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public JaroWinkler(Map<String, Number> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /** @param weight the weight */
    public JaroWinkler(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return the normalized score of the two cleaned-up strings */
    @Override
    public double distance(String a, String b) {
        final String cleanA = cleanup(a);
        final String cleanB = cleanup(b);

        final double rawScore = ssalgo.score(cleanA, cleanB);
        return normalize(rawScore);
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,76 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
import java.util.Set;
/**
 * JaroWinkler distance between two names after normalization. The names are cleaned up,
 * normalized and stripped of stop words; keywords and cities are then extracted from both.
 * Unless the cities pass {@code sameCity} and the keyword comparison exceeds the
 * {@code "threshold"} parameter (default 0.5), the score is 0.0. Otherwise keywords and
 * cities are removed and the remaining text is scored (1.0 when nothing remains on both sides).
 * The optional {@code "windowSize"} parameter (default 4) is passed to the keyword and city
 * extraction.
 */
@DistanceClass("JaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {

    /** Optional tuning parameters; never null, so the defaults apply when none were given. */
    private final Map<String, Number> params;

    /** @param params the parameters; must contain the weight required by the parent class */
    public JaroWinklerNormalizedName(Map<String, Number> params){
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    /** @param weight the weight */
    public JaroWinklerNormalizedName(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
        // without this, distance() failed with a NullPointerException on params.getOrDefault
        this.params = java.util.Collections.emptyMap();
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
        // see the weight-only constructor: defaults are used when no parameter map is given
        this.params = java.util.Collections.emptyMap();
    }

    @Override
    public double distance(String a, String b) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        // looked up once instead of once per extraction call
        final int windowSize = params.getOrDefault("windowSize", 4).intValue();

        Set<String> keywords1 = getKeywords(ca, windowSize);
        Set<String> keywords2 = getKeywords(cb, windowSize);

        Set<String> cities1 = getCities(ca, windowSize);
        Set<String> cities2 = getCities(cb, windowSize);

        if (sameCity(cities1,cities2)) {

            if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {

                ca = removeKeywords(ca, keywords1);
                ca = removeKeywords(ca, cities1);
                cb = removeKeywords(cb, keywords2);
                cb = removeKeywords(cb, cities2);

                // nothing left to compare besides keywords and cities: full match
                if (ca.isEmpty() && cb.isEmpty())
                    return 1.0;
                else
                    return normalize(ssalgo.score(ca,cb));

            }
        }

        return 0.0;
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,44 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * JaroWinkler distance between two titles. After {@code cleanup}, when {@code checkNumbers}
 * returns true for the two titles the score is fixed at 0.5; otherwise it is the normalized
 * SecondString JaroWinkler score.
 */
@DistanceClass("JaroWinklerTitle")
public class JaroWinklerTitle extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public JaroWinklerTitle(Map<String, Number> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /** @param weight the weight */
    public JaroWinklerTitle(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    @Override
    public double distance(String a, String b) {
        final String cleanA = cleanup(a);
        final String cleanB = cleanup(b);

        if (checkNumbers(cleanA, cleanB)) {
            return 0.5;
        }
        return normalize(ssalgo.score(cleanA, cleanB));
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm backed by the SecondString Level2JaroWinkler score, returned unchanged.
 */
@DistanceClass("Level2JaroWinkler")
public class Level2JaroWinkler extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public Level2JaroWinkler(Map<String, Number> params) {
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /** @param w the weight */
    public Level2JaroWinkler(double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,49 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Level2JaroWinkler distance between two titles. After {@code cleanup}, when
 * {@code checkNumbers} returns true for the two titles the score is fixed at 0.5; otherwise
 * the titles go through {@code finalCleanup} and the raw SecondString score is returned.
 */
@DistanceClass("Level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public Level2JaroWinklerTitle(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /** @param w the weight */
    public Level2JaroWinklerTitle(final double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double distance(final String a, final String b) {
        final String cleanA = cleanup(a);
        final String cleanB = cleanup(b);

        if (checkNumbers(cleanA, cleanB)) {
            return 0.5;
        }

        final String finalA = finalCleanup(cleanA);
        final String finalB = finalCleanup(cleanB);

        // the raw score is returned directly, without passing through normalize
        return ssalgo.score(finalA, finalB);
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm backed by the SecondString Level2Levenstein score, normalized as
 * {@code 1 / (|score| + 1)^0.1}.
 */
@DistanceClass("Level2Levenstein")
public class Level2Levenstein extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public Level2Levenstein(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Level2Levenstein());
    }

    /** @param w the weight */
    public Level2Levenstein(double w) {
        super(w, new com.wcohen.ss.Level2Levenstein());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** @return {@code 1 / (|d| + 1)^0.1} */
    @Override
    protected double normalize(double d) {
        final double base = Math.abs(d) + 1;
        return 1 / Math.pow(base, 0.1);
    }
}

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm backed by the SecondString Levenstein score, normalized as
 * {@code 1 / (|score| + 1)^0.1}.
 */
@DistanceClass("Levenstein")
public class Levenstein extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public Levenstein(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /** @param w the weight */
    public Levenstein(double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** @return {@code 1 / (|d| + 1)^0.1} */
    @Override
    protected double normalize(double d) {
        final double base = Math.abs(d) + 1;
        return 1 / Math.pow(base, 0.1);
    }
}

View File

@ -1,57 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
/**
 * Levenstein distance between two titles. After {@code cleanup}, when {@code checkNumbers}
 * returns true for the two titles the score is fixed at 0.5; otherwise the titles go through
 * {@code finalCleanup} and the SecondString score is scaled by the length of the longer title.
 */
@DistanceClass("LevensteinTitle")
public class LevensteinTitle extends SecondStringDistanceAlgo {

    private static final Log log = LogFactory.getLog(LevensteinTitle.class);

    /** @param params the parameters, forwarded to the parent class */
    public LevensteinTitle(Map<String,Number> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    /** @param w the weight */
    public LevensteinTitle(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double distance(final String a, final String b) {
        final String ca = cleanup(a);
        final String cb = cleanup(b);

        final boolean check = checkNumbers(ca, cb);

        if (check) return 0.5;

        final String cca = finalCleanup(ca);
        final String ccb = finalCleanup(cb);

        return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
    }

    /**
     * Scales the absolute score by the length of the longer string.
     *
     * @param score the raw SecondString score
     * @param la    the length of the first string
     * @param lb    the length of the second string
     * @return {@code 1 - |score| / max(la, lb)}, or 1.0 when both strings are empty
     */
    private double normalize(final double score, final int la, final int lb) {
        final int longest = Math.max(la, lb);
        if (longest == 0) {
            // two empty strings are identical; the plain formula would divide by zero and yield NaN
            return 1.0;
        }
        return 1 - (Math.abs(score) / longest);
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** @return {@code 1 / (|d| + 1)^0.1} */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -1,58 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Levenstein distance between two titles, ignoring version numbers. Suitable for Software entities.
 * Digits and the match of {@code getRomans} are removed from the cleaned-up titles, stop words
 * are filtered out and the SecondString score is scaled by the length of the longer title.
 */
@DistanceClass("LevensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public LevensteinTitleIgnoreVersion(Map<String,Number> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    /** @param w the weight */
    public LevensteinTitleIgnoreVersion(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * @param w      the weight
     * @param ssalgo the SecondString algorithm used for scoring
     */
    protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    @Override
    public double distance(final String a, final String b) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        // getRomans receives the title as it was before the digits were stripped
        ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
        cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        final String cca = finalCleanup(ca);
        final String ccb = finalCleanup(cb);

        return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
    }

    /**
     * Scales the absolute score by the length of the longer string.
     *
     * @param score the raw SecondString score
     * @param la    the length of the first string
     * @param lb    the length of the second string
     * @return {@code 1 - |score| / max(la, lb)}, or 1.0 when both strings are empty
     */
    private double normalize(final double score, final int la, final int lb) {
        final int longest = Math.max(la, lb);
        if (longest == 0) {
            // two empty strings are identical; the plain formula would divide by zero and yield NaN
            return 1.0;
        }
        return 1 - (Math.abs(score) / longest);
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** @return {@code 1 / (|d| + 1)^0.1} */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -1,39 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
/**
 * Distance algorithm scoring 1.0 when the two strings are different and 0 when they are equal.
 */
@DistanceClass("MustBeDifferent")
public class MustBeDifferent extends SecondStringDistanceAlgo {

    /** @param params the parameters, forwarded to the parent class */
    public MustBeDifferent(Map<String,Number> params) {
        super(params, new com.wcohen.ss.Levenstein());
    }

    /** @param weight the weight */
    public MustBeDifferent(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight
     * @param ssalgo the SecondString algorithm handed to the parent class
     */
    protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return 0 if {@code a} equals {@code b}, 1.0 otherwise */
    @Override
    public double distance(final String a, final String b) {
        if (a.equals(b)) {
            return 0.0;
        }
        return 1.0;
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /** Identity: the score is returned unchanged. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,29 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import java.util.Map;
/**
 * Not all fields of a document need to participate in the compare measure. We model those fields as having a
 * NullDistanceAlgo: it always reports a distance of 0.0 and a weight of 0.0.
 */
@DistanceClass("Null")
public class NullDistanceAlgo implements DistanceAlgo {
/**
 * Creates the algorithm; the configuration map is accepted but never read.
 *
 * @param params
 *            ignored
 */
public NullDistanceAlgo(Map<String, Number> params){
}
/**
 * Ignores both fields.
 *
 * @return always 0.0
 */
@Override
public double distance(Field a, Field b) {
return 0.0;
}
/**
 * @return always 0.0
 */
@Override
public double getWeight() {
return 0.0;
}
}

View File

@ -1,60 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import java.util.Map;
/**
 * Jaro-Winkler based algorithm built on SortedSecondStringDistanceAlgo.
 */
@DistanceClass("SortedJaroWinkler")
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {

    /**
     * Instantiates a new sorted jaro winkler from a configuration map.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public SortedJaroWinkler(Map<String,Number> params){
        // This used to pass a Levenstein instance, so an algorithm created from the configuration
        // silently computed a different measure than one created with the weight constructor.
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler.
     *
     * @param weight
     *            the weight
     */
    public SortedJaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler with a caller-supplied string distance.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Returns the weight held by the parent class.
     *
     * @return the value of the inherited weight field
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Leaves the score untouched.
     *
     * @param d
     *            the raw score
     * @return the same value
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,60 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import java.util.Map;
/**
 * Level2 Jaro-Winkler based algorithm built on SortedSecondStringDistanceAlgo.
 */
@DistanceClass("SortedLevel2JaroWinkler")
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {

    /**
     * Creates the algorithm from a configuration map.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public SortedLevel2JaroWinkler(final Map<String, Number> params){
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the algorithm with an explicit weight.
     *
     * @param weight
     *            the weight
     */
    public SortedLevel2JaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Creates the algorithm with an explicit weight and a caller-supplied string distance.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the SecondString distance implementation
     */
    protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Returns the weight held by the parent class.
     *
     * @return the value of the inherited weight field
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Leaves the score untouched.
     *
     * @param d
     *            the raw score
     * @return the same value
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,48 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
/**
 * SecondString-backed algorithm that hands the values of a list field over in sorted order.
 */
public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {

    /**
     * Creates the algorithm with an explicit weight.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the SecondString distance implementation
     */
    protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Creates the algorithm reading the weight from the configuration map.
     *
     * @param params
     *            the configuration values; the "weight" entry must be present, since its value is
     *            dereferenced without a null check
     * @param ssalgo
     *            the SecondString distance implementation
     */
    protected SortedSecondStringDistanceAlgo(final Map<String, Number> params, final AbstractStringDistance ssalgo){
        super(params.get("weight").doubleValue(), ssalgo);
    }

    /**
     * Copies the string values of the field into a new list and sorts it in natural order.
     *
     * @param list
     *            the field to read; it is cast to FieldList, so any other Field implementation
     *            fails with a ClassCastException
     * @return a new, sorted list holding the field values
     */
    @Override
    protected List<String> toList(final Field list) {
        final List<String> sorted = Lists.newArrayList(((FieldList) list).stringList());
        Collections.sort(sorted);
        return sorted;
    }
}

View File

@ -1,99 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.lang.StringUtils;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import java.util.Map;
/**
 * Levenstein-backed algorithm that compares only the leftmost {@code limit} characters of two
 * string fields.
 */
@DistanceClass("SubStringLevenstein")
public class SubStringLevenstein extends SecondStringDistanceAlgo {

    /** Number of leading characters taken from each value before comparing. */
    protected int limit;

    /**
     * Creates the algorithm with an explicit weight.
     *
     * NOTE(review): limit is not assigned here and therefore stays at 0 - confirm this is intended.
     *
     * @param w
     *            the weight
     */
    public SubStringLevenstein(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Creates the algorithm from a configuration map.
     *
     * @param params
     *            the configuration values; the "limit" entry must be present, since its value is
     *            dereferenced without a null check
     */
    public SubStringLevenstein(Map<String, Number> params){
        super(params, new com.wcohen.ss.Levenstein());
        this.limit = params.get("limit").intValue();
    }

    /**
     * Creates the algorithm with an explicit weight and limit.
     *
     * @param w
     *            the weight
     * @param limit
     *            the number of leading characters to compare
     */
    public SubStringLevenstein(final double w, final int limit) {
        super(w, new com.wcohen.ss.Levenstein());
        this.limit = limit;
    }

    /**
     * Creates the algorithm with an explicit weight, limit and string distance.
     *
     * @param w
     *            the weight
     * @param limit
     *            the number of leading characters to compare
     * @param ssalgo
     *            the SecondString distance implementation
     */
    protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
        this.limit = limit;
    }

    /**
     * Compares the leading characters of two string fields.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return the result of the string comparison on the truncated values
     * @throws IllegalArgumentException
     *             when either field is not of type String
     */
    @Override
    public double distance(final Field a, final Field b) {
        final boolean bothStrings = a.getType().equals(Type.String) && b.getType().equals(Type.String);
        if (!bothStrings) {
            throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
        }
        final String headA = StringUtils.left(a.stringValue(), limit);
        final String headB = StringUtils.left(b.stringValue(), limit);
        return distance(headA, headB);
    }

    /**
     * Returns the weight held by the parent class.
     *
     * @return the value of the inherited weight field
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Maps a raw value onto (0, 1]: 1 for an input of 0, decreasing as the absolute value grows.
     *
     * @param d
     *            the raw value; only its absolute value is used
     * @return 1 divided by (|d| + 1) raised to the power 0.1
     */
    @Override
    protected double normalize(final double d) {
        final double shifted = Math.abs(d) + 1;
        return 1 / Math.pow(shifted, 0.1);
    }
}

View File

@ -1,59 +0,0 @@
package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
/**
 * Compares two URL fields: different hosts score 0.0, otherwise the score combines the "host"
 * and "path" parameters with the inherited string distance of the two paths.
 */
@DistanceClass("urlMatcher")
public class UrlMatcher extends Levenstein {

    /** Configuration values; the "host" and "path" entries are read on every comparison. */
    private Map<String, Number> params;

    /**
     * Creates the matcher from a configuration map.
     *
     * @param params
     *            the configuration values, also passed to the parent constructor
     */
    public UrlMatcher(Map<String, Number> params){
        super(params);
        this.params = params;
    }

    /**
     * Creates the matcher with an explicit weight.
     *
     * @param weight
     *            the weight, passed to the parent constructor
     * @param params
     *            the configuration values
     */
    public UrlMatcher(double weight, Map<String, Number> params) {
        super(weight);
        this.params = params;
    }

    /**
     * Replaces the configuration values.
     *
     * @param params
     *            the new configuration values
     */
    public void setParams(Map<String, Number> params) {
        this.params = params;
    }

    /**
     * Scores two URL fields.
     *
     * @param a
     *            the first field; its first value must be a well-formed URL
     * @param b
     *            the second field; its first value must be a well-formed URL
     * @return 0.0 when the hosts differ (ignoring case); half of the host parameter when either
     *         path is blank; otherwise the host parameter plus the path parameter times the
     *         inherited distance between the two paths
     * @throws IllegalStateException
     *             when either value is not a well-formed URL
     */
    @Override
    public double distance(Field a, Field b) {
        final URL urlA = asUrl(getFirstValue(a));
        final URL urlB = asUrl(getFirstValue(b));

        if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
            return 0.0;
        }

        // primitives instead of boxed Doubles: the values are only used in arithmetic
        final double hostW = params.get("host").doubleValue();
        final double pathW = params.get("path").doubleValue();

        if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
            return hostW * 0.5;
        }
        return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
    }

    /**
     * Parses the value as a URL.
     *
     * @param value
     *            the text to parse
     * @return the parsed URL
     * @throws IllegalStateException
     *             when the text is not a well-formed URL; the parsing failure is attached as cause
     */
    private URL asUrl(final String value) {
        try {
            return new URL(value);
        } catch (MalformedURLException e) {
            // should not happen as checked by pace typing; keep the cause so the reason is not lost
            throw new IllegalStateException("invalid URL: " + value, e);
        }
    }
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.model.Field;
/**
 * Mutable holder for the outcome of one condition: the condition name, the two fields involved and
 * the integer result.
 *
 * Created by claudio on 09/03/16.
 */
public class ConditionEval {
// name of the condition
private String cond;
// first field involved
private Field a;
// second field involved
private Field b;
// integer outcome of the condition
private int result;
/**
 * Creates a fully populated holder.
 *
 * @param cond
 *            the condition name
 * @param a
 *            the first field
 * @param b
 *            the second field
 * @param result
 *            the outcome of the condition
 */
public ConditionEval(final String cond, final Field a, final Field b, final int result) {
this.cond = cond;
this.a = a;
this.b = b;
this.result = result;
}
/** @return the first field */
public Field getA() {
return a;
}
/** @param a the first field */
public void setA(final Field a) {
this.a = a;
}
/** @return the second field */
public Field getB() {
return b;
}
/** @param b the second field */
public void setB(final Field b) {
this.b = b;
}
/** @return the outcome of the condition */
public int getResult() {
return result;
}
/** @param result the outcome of the condition */
public void setResult(final int result) {
this.result = result;
}
/** @return the condition name */
public String getCond() {
return cond;
}
/** @param cond the condition name */
public void setCond(final String cond) {
this.cond = cond;
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import java.util.HashMap;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
/**
 * Map of condition evaluations with helpers that summarise their integer results.
 *
 * Created by claudio on 09/03/16.
 */
public class ConditionEvalMap extends HashMap<String, ConditionEval> {

    /**
     * Copies every entry of the given map into this one, overwriting entries with the same key.
     *
     * @param map
     *            the entries to add
     * @return this map, to allow chaining
     */
    public ConditionEvalMap mergeFrom(ConditionEvalMap map) {
        putAll(map);
        return this;
    }

    /**
     * Tells whether at least one evaluation has a negative result.
     *
     * @return true when any stored evaluation has a result below zero; false otherwise, including
     *         when the map is empty
     */
    public boolean anyNegative() {
        // anyMatch, not allMatch: allMatch answered "are all negative" and was true for an empty map
        return values().stream()
                .anyMatch(ec -> ec.getResult() < 0);
    }

    /**
     * Tells whether the aggregated result is exactly zero.
     *
     * @return true when {@link #result()} returns 0
     */
    public boolean isZero() {
        return result() == 0;
    }

    /**
     * Aggregates the stored results.
     *
     * @return -1 as soon as a negative result is met, otherwise the sum of all results
     */
    public int result() {
        int res = 0;
        for(ConditionEval ec : values()) {
            final int verify = ec.getResult();
            if (verify < 0) return -1;
            res += verify;
        }
        return res;
    }
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
/**
 * Mutable holder for one field comparison: the field definition, the two field values and the
 * distance between them.
 *
 * Created by claudio on 09/03/16.
 */
public class DistanceEval {
// definition of the compared field
private FieldDef fieldDef;
// first field value
private Field a;
// second field value
private Field b;
// stays 0.0 until setDistance is called
private double distance = 0.0;
/**
 * Creates a holder; the distance starts at 0.0.
 *
 * @param fieldDef
 *            the definition of the compared field
 * @param a
 *            the first field value
 * @param b
 *            the second field value
 */
public DistanceEval(final FieldDef fieldDef, final Field a, final Field b) {
this.fieldDef = fieldDef;
this.a = a;
this.b = b;
}
/** @return the first field value */
public Field getA() {
return a;
}
/** @param a the first field value */
public void setA(final Field a) {
this.a = a;
}
/** @return the second field value */
public Field getB() {
return b;
}
/** @param b the second field value */
public void setB(final Field b) {
this.b = b;
}
/** @return the definition of the compared field */
public FieldDef getFieldDef() {
return fieldDef;
}
/** @param fieldDef the definition of the compared field */
public void setFieldDef(final FieldDef fieldDef) {
this.fieldDef = fieldDef;
}
/** @return the stored distance */
public double getDistance() {
return distance;
}
/** @param distance the distance to store */
public void setDistance(final double distance) {
this.distance = distance;
}
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.pace.condition.*;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import eu.dnetlib.pace.util.PaceResolver;
import org.codehaus.jackson.map.ObjectMapper;
/**
 * Definition of a condition: the name used to resolve its algorithm and the names of the fields
 * it applies to.
 */
public class CondDef implements Serializable {

    /** Name used to look the algorithm up in the resolver. */
    private String name;

    /** Names of the fields the condition applies to. */
    private List<String> fields;

    /** Creates an empty definition, to be filled through the setters. */
    public CondDef() {}

    /**
     * Resolves the algorithm registered under this definition's name.
     *
     * @param fields
     *            the field definitions handed to the resolver
     * @return the algorithm returned by the shared resolver
     */
    public ConditionAlgo conditionAlgo(final List<FieldDef> fields) {
        final String algoName = getName();
        return PaceConfig.resolver.getConditionAlgo(algoName, fields);
    }

    /** @return the condition name */
    public String getName() {
        return name;
    }

    /** @param name the condition name */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the names of the fields the condition applies to */
    public List<String> getFields() {
        return fields;
    }

    /** @param fields the names of the fields the condition applies to */
    public void setFields(final List<String> fields) {
        this.fields = fields;
    }

    /**
     * Serialises this definition with a Jackson ObjectMapper.
     *
     * @return the serialised form
     * @throws PaceException
     *             when the serialisation fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise " + this.getClass().getName(), e);
        }
    }
}

View File

@ -3,15 +3,10 @@ package eu.dnetlib.pace.model;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.util.PaceResolver;
import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
@ -26,16 +21,6 @@ public class FieldDef implements Serializable {
private Type type;
private boolean ignoreMissing;
public boolean isIgnoreMissing() {
return ignoreMissing;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
private boolean overrideMatch;
/**
@ -48,8 +33,6 @@ public class FieldDef implements Serializable {
*/
private int length = -1;
private Map<String, Number> params;
public FieldDef() {}
// def apply(s: String): Field[A]
@ -110,14 +93,6 @@ public class FieldDef implements Serializable {
this.length = length;
}
public Map<String, Number> getParams() {
return params;
}
public void setParams(final Map<String, Number> params) {
this.params = params;
}
public void setName(String name) {
this.name = name;
}

View File

@ -1,8 +1,7 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -24,7 +23,7 @@ public class AlwaysMatch extends AbstractComparator {
}
@Override
public double distance(final String a, final String b) {
public double compare(final Field a, final Field b) {
return 1.0;
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Case-insensitive exact match on DOIs, applied after removing the "http://dx.doi.org/" and "doi:"
 * markers from the values.
 *
 * @author claudio
 */
@ComparatorClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {

    /** Regular expression of the markers removed from a value; every occurrence is removed, not only a leading one. */
    public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public DoiExactMatch(final Map<String, Number> params) {
        super(params);
    }

    /**
     * Reads the value through the parent and strips the DOI markers from it.
     *
     * @param f
     *            the field to read
     * @return the parent's value with every match of {@link #PREFIX} removed
     */
    @Override
    protected String getValue(final Field f) {
        final String raw = super.getValue(f);
        return raw.replaceAll(PREFIX, "");
    }
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
/**
 * Case-insensitive exact match on the host part of URL values.
 */
@ComparatorClass("domainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase {

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public DomainExactMatch(final Map<String, Number> params) {
        super(params);
    }

    /**
     * Reads the value through the parent and reduces it to its host.
     *
     * @param f
     *            the field to read
     * @return the host of the URL, or the empty string when the value is not a well-formed URL
     */
    @Override
    protected String getValue(final Field f) {
        final String raw = super.getValue(f);
        final URL url;
        try {
            url = asUrl(raw);
        } catch (MalformedURLException e) {
            // a malformed URL is reported as an empty value rather than as an error
            return "";
        }
        return url.getHost();
    }

    /**
     * Parses the value as a URL.
     *
     * @param value
     *            the text to parse
     * @return the parsed URL
     * @throws MalformedURLException
     *             when the text is not a well-formed URL
     */
    private URL asUrl(final String value) throws MalformedURLException {
        return new URL(value);
    }
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator that checks whether two values are equal ignoring case.
 */
@ComparatorClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractComparator {

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public ExactMatchIgnoreCase(Map<String, Number> params) {
        super(params);
    }

    /**
     * Compares the values extracted from the two fields.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return -1 when either extracted value is empty, 1 when the values are equal ignoring case,
     *         0 otherwise
     */
    @Override
    public double compare(Field a, Field b) {
        final String left = getValue(a);
        final String right = getValue(b);

        if (left.isEmpty() || right.isEmpty()) {
            return -1;
        }
        if (left.equalsIgnoreCase(right)) {
            return 1;
        }
        return 0;
    }

    /**
     * Extracts the value to compare; subclasses override this to normalise it first.
     *
     * @param f
     *            the field to read
     * @return the result of getFirstValue for the field
     */
    protected String getValue(final Field f) {
        return getFirstValue(f);
    }
}

View File

@ -1,12 +1,9 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.io.Serializable;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())

View File

@ -1,9 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,7 +1,5 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.adaptor.Pid;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Comparator on persistent identifiers: the share of identifiers the two records have in common
 * is checked against a threshold.
 */
@ComparatorClass("pidMatch")
public class PidMatch extends AbstractComparator {

    private static final Log log = LogFactory.getLog(PidMatch.class);

    /** Configuration values; the optional "threshold" entry defaults to 0.5. */
    private Map<String, Number> params;

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, also passed to the parent constructor
     */
    public PidMatch(final Map<String, Number> params) {
        super(params);
        this.params = params;
    }

    /**
     * Compares the identifier sets of the two fields.
     *
     * @param a
     *            the first field; it is cast to FieldList
     * @param b
     *            the second field; it is cast to FieldList
     * @return -1 when either side yields no identifiers; otherwise 1 when the number of common
     *         identifiers divided by the number of distinct identifiers is strictly greater than
     *         the threshold, 0 when it is not
     */
    @Override
    public double compare(final Field a, final Field b) {
        final List<String> jsonA = ((FieldList) a).stringList();
        final List<String> jsonB = ((FieldList) b).stringList();

        final List<Pid> pidsA = Pid.fromOafJson(jsonA);
        final List<Pid> pidsB = Pid.fromOafJson(jsonB);

        if (pidsA.isEmpty() || pidsB.isEmpty()) {
            return -1;
        }

        final Set<String> keysA = toHashSet(pidsA);
        final Set<String> keysB = toHashSet(pidsB);

        final int incommon = Sets.intersection(keysA, keysB).size();
        final int simDiff = Sets.symmetricDifference(keysA, keysB).size();

        // intersection + symmetric difference = size of the union of the two sets
        final int union = incommon + simDiff;
        if (union == 0) {
            return 0.0;
        }

        final double ratio = (double) incommon / union;
        final double threshold = params.getOrDefault("threshold", 0.5).doubleValue();
        return ratio > threshold ? 1 : 0;
    }

    /**
     * Builds one key per identifier: its type followed by its normalised value.
     * NOTE(review): normalizePid is presumed to lower-case the value - confirm.
     *
     * @param pbl
     *            the identifiers to convert
     * @return a new HashSet holding the keys
     */
    private Set<String> toHashSet(List<Pid> pbl) {
        final Set<String> keys = new HashSet<>();
        for (final Pid pid : pbl) {
            keys.add(pid.getType() + normalizePid(pid.getValue()));
        }
        return keys;
    }
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Iterables;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator that checks whether two fields hold the same number of values.
 *
 * @author claudio
 */
@ComparatorClass("sizeMatch")
public class SizeMatch extends AbstractComparator {

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public SizeMatch(final Map<String, Number> params) {
        super(params);
    }

    /**
     * Compares the number of values of the two fields.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return -1 when either field reports itself empty, 1 when both fields iterate over the same
     *         number of elements, 0 otherwise
     */
    @Override
    public double compare(final Field a, final Field b) {
        if (a.isEmpty() || b.isEmpty()) {
            return -1;
        }
        final int countA = Iterables.size(a);
        final int countB = Iterables.size(b);
        return countA == countB ? 1 : 0;
    }

    /**
     * Null-safe emptiness check for any iterable.
     *
     * @param a
     *            the iterable to check, possibly null
     * @return true when the iterable is null or has no elements
     */
    protected boolean isEmpty(final Iterable<?> a) {
        if (a == null) {
            return true;
        }
        return Iterables.isEmpty(a);
    }
}

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -1,8 +1,6 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.algo.SortedSecondStringDistanceAlgo;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -2,8 +2,6 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;

View File

@ -0,0 +1,39 @@
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator on the numbers contained in two titles.
 * NOTE(review): the decision is delegated to checkNumbers, whose negated result means "match";
 * presumably it reports that the numbers differ - confirm.
 *
 * @author claudio
 *
 */
@ComparatorClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractComparator {

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public TitleVersionMatch(final Map<String, Number> params) {
        super(params);
    }

    /**
     * Compares the first values of the two fields.
     *
     * @param a
     *            the first field
     * @param b
     *            the second field
     * @return -1 when either title is empty; 1 when both titles pass notNull and checkNumbers
     *         returns false for them; 0 otherwise
     */
    @Override
    public double compare(final Field a, final Field b) {
        final String titleA = getFirstValue(a);
        final String titleB = getFirstValue(b);

        if (titleA.isEmpty() || titleB.isEmpty()) {
            return -1;
        }

        final boolean match = notNull(titleA) && notNull(titleB) && !checkNumbers(titleA, titleB);
        return match ? 1 : 0;
    }

    /**
     * @return the simple class name, a colon and the parent's string form
     */
    @Override
    public String toString() {
        final String prefix = getClass().getSimpleName();
        return prefix + ":" + super.toString();
    }
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils;
import java.util.Map;
/**
 * Comparator on the year of two date fields: 1 when both years have the expected length and are
 * equal, 0 when they are not, -1 (undefined) when either year is missing.
 *
 * @author claudio
 */
@ComparatorClass("yearMatch")
public class YearMatch extends AbstractComparator {

    /** Number of leading characters taken from a date value, and the length a year must have. */
    private int limit = 4;

    /**
     * Creates the comparator.
     *
     * @param params
     *            the configuration values, passed to the parent constructor
     */
    public YearMatch(final Map<String, Number> params) {
        super(params);
    }

    /**
     * Compares the years of the two fields.
     *
     * @param a
     *            the first date field
     * @param b
     *            the second date field
     * @return -1 when either extracted year is empty; 1 when both have the expected length and are
     *         equal; 0 otherwise
     */
    @Override
    public double compare(final Field a, final Field b) {
        // NOTE(review): getNumbers presumably keeps only the digits of the value - confirm
        final String valueA = getNumbers(getFirstValue(a));
        final String valueB = getNumbers(getFirstValue(b));

        if (valueA.isEmpty() || valueB.isEmpty()) {
            return -1;
        }

        // Past this point neither value is empty, so the former "one missing counts as a match"
        // branch could never fire and has been removed.
        final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);

        return lengthMatch && valueA.equals(valueB) ? 1 : 0;
    }

    /**
     * Checks that the text has exactly the expected year length.
     *
     * @param s
     *            the text to check
     * @return true when its length equals the limit
     */
    protected boolean checkLength(final String s) {
        return s.length() == limit;
    }

    /**
     * Reads the leading characters of the field value.
     *
     * @param value
     *            the field to read, possibly null
     * @return the leftmost {@code limit} characters of the field's string value, or the empty
     *         string when the field is null or empty
     */
    protected String getFirstValue(final Field value) {
        return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
    }

    /**
     * @return the simple class name, a colon and the parent's string form
     */
    @Override
    public String toString() {
        return getClass().getSimpleName() + ":" + super.toString();
    }
}

View File

@ -19,6 +19,10 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
private Map<String, Number> params;
protected AbstractComparator(Map<String, Number> params) {
this.params = params;
}
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
this.params = params;
this.weight = 1.0;
@ -49,7 +53,9 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the d
* @return the double
*/
protected abstract double normalize(double d);
protected double normalize(double d) {
return d;
}
/**
* Distance.

View File

@ -4,6 +4,10 @@ import eu.dnetlib.pace.model.Field;
public interface Comparator {
/**
 * Compares two field values.
 *
 * @param a
 *            the first field
 * @param b
 *            the second field
 * @return -1 when no decision can be taken (missing field); otherwise a similarity degree of 0
 *         or more, whose scale depends on the algorithm
 */
public double compare(Field a, Field b);
}

View File

@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
@ -36,12 +35,10 @@ public class TreeNodeDef implements Serializable {
public TreeNodeDef() {
}
public double evaluate(MapDocument doc1, MapDocument doc2) {
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) {
DescriptiveStatistics stats = new DescriptiveStatistics();
double sumWeights = 0.0; //for the weighted mean
int missCount = 0; //counter for the number of misses
TreeNodeStats stats = new TreeNodeStats();
stats.setFieldsCount(fields.size());
for (FieldConf fieldConf : fields) {
@ -49,40 +46,20 @@ public class TreeNodeDef implements Serializable {
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
if (result >= 0.0) { //if the field is not missing
stats.addValue(weight * result);
sumWeights += weight; //sum weights, to be used in case of weighted mean
}
else { //if the field is missing
missCount += 1;
if (!fieldConf.isIgnoreMissing()){ //if the miss has not to be ignored
stats.addValue(weight * 0);
sumWeights += weight;
if (result == -1) { //if the field is missing
stats.incrementMissCount();
if (!fieldConf.isIgnoreMissing()) {
stats.incrementWeightsSum(weight);
}
}
else { //if the field is not missing
stats.incrementScoresSum(weight * result);
stats.incrementWeightsSum(weight);
}
}
//global ignoremissing (if one of the field is missing, return undefined)
if (!ignoreMissing && missCount>0) {
return -1;
}
switch (aggregation){
case AVG:
return stats.getMean();
case SUM:
return stats.getSum();
case MAX:
return stats.getMax();
case MIN:
return stats.getMin();
case WEIGHTED_MEAN:
return stats.getSum()/sumWeights;
default:
return 0.0;
}
return stats;
}
private Comparator comparator(final FieldConf field){

View File

@ -0,0 +1,86 @@
package eu.dnetlib.pace.tree.support;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import java.io.Serializable;
/**
 * Accumulates what is observed while a tree node is evaluated: the weighted scores, the number of
 * missing fields, the number of fields and the sum of the weights.
 */
public class TreeNodeStats implements Serializable {

    /** Collected weighted scores. */
    private DescriptiveStatistics stats;

    /** Number of fields reported as missing. */
    private int missCount = 0;

    /** Number of fields of the node. */
    private int fieldsCount = 0;

    /** Sum of the weights, used as denominator of the weighted mean. */
    private double weightsSum = 0.0;

    /** Creates empty statistics. */
    public TreeNodeStats(){
        this.stats = new DescriptiveStatistics();
    }

    /**
     * Creates statistics with preset counters and no collected score.
     *
     * @param missCount
     *            the number of missing fields
     * @param fieldsCount
     *            the number of fields
     * @param weightsSum
     *            the sum of the weights
     */
    public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) {
        // delegate so that the score container exists: it used to stay null here, which made
        // incrementScoresSum and getFinalScore fail with a NullPointerException
        this();
        this.missCount = missCount;
        this.fieldsCount = fieldsCount;
        this.weightsSum = weightsSum;
    }

    /** @return the container of the collected scores */
    public DescriptiveStatistics getStats() {
        return stats;
    }

    /** @param stats the container of the collected scores */
    public void setStats(DescriptiveStatistics stats) {
        this.stats = stats;
    }

    /** @return the number of missing fields */
    public int getMissCount() {
        return missCount;
    }

    /** @param missCount the number of missing fields */
    public void setMissCount(int missCount) {
        this.missCount = missCount;
    }

    /** @return the number of fields */
    public int getFieldsCount() {
        return fieldsCount;
    }

    /** @param fields the number of fields */
    public void setFieldsCount(int fields) {
        this.fieldsCount = fields;
    }

    /** @return the sum of the weights */
    public double getWeightsSum() {
        return weightsSum;
    }

    /** @param weightsSum the sum of the weights */
    public void setWeightsSum(double weightsSum) {
        this.weightsSum = weightsSum;
    }

    /**
     * Adds to the sum of the weights.
     *
     * @param delta
     *            the weight to add
     */
    public void incrementWeightsSum(double delta){
        this.weightsSum += delta;
    }

    /** Counts one more missing field. */
    public void incrementMissCount(){
        this.missCount += 1;
    }

    /**
     * Records one weighted score.
     *
     * @param delta
     *            the weighted score to record
     */
    public void incrementScoresSum(double delta){
        this.stats.addValue(delta);
    }

    /**
     * Aggregates the recorded scores.
     *
     * @param aggregation
     *            the aggregation to apply
     * @return the mean, sum, maximum or minimum of the recorded scores, or their sum divided by the
     *         sum of the weights for WEIGHTED_MEAN; 0.0 for any other aggregation
     */
    public double getFinalScore(AggType aggregation){
        switch (aggregation){
            case AVG:
                return stats.getMean();
            case SUM:
                return stats.getSum();
            case MAX:
                return stats.getMax();
            case MIN:
                return stats.getMin();
            case WEIGHTED_MEAN:
                return stats.getSum()/weightsSum;
            default:
                return 0.0;
        }
    }
}

View File

@ -0,0 +1,58 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
/**
* The compare between two documents is given by the weighted mean of the field distances
*/
public class TreeProcessor {
private static final Log log = LogFactory.getLog(TreeProcessor.class);
private Config config;
public TreeProcessor(final Config config) {
this.config = config;
}
public boolean compare(final MapDocument a, final MapDocument b) {
//evaluate the decision tree
return evaluateTree(a, b, config.decisionTree()) == MatchType.MATCH;
}
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2, final Map<String, TreeNodeDef> decisionTree){
String current = "start";
while (MatchType.parse(current)==MatchType.UNDEFINED) {
TreeNodeDef currentNode = decisionTree.get(current);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
current = currentNode.getUndefined();
}
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
current = currentNode.getPositive();
}
else {
current = currentNode.getNegative();
}
}
return MatchType.parse(current);
}
}

View File

@ -5,7 +5,7 @@ import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.distance.PairwiseComparison;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
@ -150,14 +150,10 @@ public class BlockProcessor {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final PairwiseComparison pairwiseComparison = new PairwiseComparison(dedupConf);
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
emitOutput(pairwiseComparison.compare(pivot, curr), idPivot, idCurr, context);
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
// final ScoreResult sr = similarity(algo, pivot, curr);
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
// emitOutput(sr, idPivot, idCurr, context);
// i++;
}
}
}

View File

@ -2,31 +2,21 @@ package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class PaceResolver implements Serializable {
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections CONDITION_RESOLVER = new Reflections("eu.dnetlib.pace.condition");
public static final Reflections DISTANCE_RESOLVER = new Reflections("eu.dnetlib.pace.compare.algo");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<ConditionAlgo>> conditionAlgos;
private final Map<String, Class<DistanceAlgo>> distanceAlgos;
private final Map<String, Class<Comparator>> comparators;
public PaceResolver() {
@ -35,14 +25,6 @@ public class PaceResolver implements Serializable {
.filter(ClusteringFunction.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
this.conditionAlgos = CONDITION_RESOLVER.getTypesAnnotatedWith(ConditionClass.class).stream()
.filter(ConditionAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
this.distanceAlgos = DISTANCE_RESOLVER.getTypesAnnotatedWith(DistanceClass.class).stream()
.filter(DistanceAlgo.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
@ -56,22 +38,6 @@ public class PaceResolver implements Serializable {
}
}
/**
 * Creates a new instance of the distance algorithm registered under the given name.
 *
 * The class is looked up in {@code distanceAlgos} and instantiated reflectively through
 * its constructor taking a single {@code Map} argument.
 *
 * @param name name under which the algorithm class is registered
 * @param params parameters passed to the algorithm constructor
 * @return a newly created algorithm instance
 * @throws PaceException if no class is registered under {@code name}, or if the class
 *         cannot be instantiated through its {@code Map} constructor
 */
public DistanceAlgo getDistanceAlgo(String name, Map<String, Number> params) throws PaceException {
    final Class<DistanceAlgo> algoClass = distanceAlgos.get(name);
    // Map.get returns null for an unknown name; report it as a PaceException instead of
    // letting the reflective call below fail with a NullPointerException.
    if (algoClass == null) {
        throw new PaceException("No distance algorithm registered with name: " + name);
    }
    try {
        return algoClass.getDeclaredConstructor(Map.class).newInstance(params);
    } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
        throw new PaceException(name + " not found ", e);
    }
}
/**
 * Creates a new instance of the condition algorithm registered under the given name.
 *
 * The class is looked up in {@code conditionAlgos} and instantiated reflectively through
 * its constructor taking a {@code String} and a {@code List}.
 *
 * @param name name under which the condition class is registered; also passed as the
 *        first constructor argument
 * @param fields field definitions passed as the second constructor argument
 * @return a newly created condition instance
 * @throws PaceException if no class is registered under {@code name}, or if the class
 *         cannot be instantiated through its {@code (String, List)} constructor
 */
public ConditionAlgo getConditionAlgo(String name, List<FieldDef> fields) throws PaceException {
    final Class<ConditionAlgo> algoClass = conditionAlgos.get(name);
    // Map.get returns null for an unknown name; report it as a PaceException instead of
    // letting the reflective call below fail with a NullPointerException.
    if (algoClass == null) {
        throw new PaceException("No condition algorithm registered with name: " + name);
    }
    try {
        return algoClass.getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
    } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
        throw new PaceException(name + " not found ", e);
    }
}
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);

View File

@ -1,7 +1,7 @@
package eu.dnetlib.pace.distance;
package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
import org.junit.Before;
import org.junit.Test;

View File

@ -1,7 +0,0 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.AbstractPaceTest;
/**
 * Empty test case for the condition package: it declares no members of its own and only
 * extends {@code AbstractPaceTest}.
 */
public class ConditionTest extends AbstractPaceTest {
}

Some files were not shown because too many files have changed in this diff Show More