forked from D-Net/dnet-hadoop
Implementation of the new AND/OR aggregations in the tree node processing
This commit is contained in:
parent
20fcfe6328
commit
b21b1b8f61
|
@ -41,7 +41,7 @@ public class CityMatch extends AbstractComparator {
|
||||||
else {
|
else {
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
return -1; //undefined if one of the two has no cities
|
return -1; //undefined if one of the two has no cities
|
||||||
return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0;
|
return commonElementsPercentage(codes1, codes2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,9 @@ public class ExactMatch extends AbstractComparator {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
|
return -1.0; //return -1 if a field is missing
|
||||||
|
}
|
||||||
return a.equals(b) ? 1.0 : 0;
|
return a.equals(b) ? 1.0 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,8 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
|
//TODO: change this implementation; it only needs to remove cities and keywords
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class JsonListMatch extends AbstractComparator {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
return (double)incommon / (incommon + simDiff);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -41,7 +41,7 @@ public class KeywordMatch extends AbstractComparator {
|
||||||
else {
|
else {
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
return -1; //undefined if one of the two has no keywords
|
return -1; //undefined if one of the two has no keywords
|
||||||
return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0;
|
return commonElementsPercentage(codes1, codes2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ public class StringListMatch extends AbstractComparator {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
return (double)incommon / (incommon + simDiff);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -9,10 +9,8 @@ public enum AggType {
|
||||||
SUM,
|
SUM,
|
||||||
MAX,
|
MAX,
|
||||||
MIN,
|
MIN,
|
||||||
NC, //necessary condition
|
AND, //used for necessary conditions
|
||||||
SC, //sufficient condition
|
OR; //used for sufficient conditions
|
||||||
AND,
|
|
||||||
OR;
|
|
||||||
|
|
||||||
public static AggType getEnum(String value) {
|
public static AggType getEnum(String value) {
|
||||||
|
|
||||||
|
|
|
@ -11,20 +11,30 @@ import java.io.Serializable;
|
||||||
public class FieldStats implements Serializable {
|
public class FieldStats implements Serializable {
|
||||||
|
|
||||||
private double weight; //weight for the field (to be used in the aggregation)
|
private double weight; //weight for the field (to be used in the aggregation)
|
||||||
|
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition)
|
||||||
private double result; //the result of the comparison
|
private double result; //the result of the comparison
|
||||||
private Field a;
|
private Field a;
|
||||||
private Field b;
|
private Field b;
|
||||||
|
|
||||||
private boolean countIfUndefined;
|
private boolean countIfUndefined;
|
||||||
|
|
||||||
public FieldStats(double weight, double result, boolean countIfUndefined, Field a, Field b) {
|
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) {
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
|
this.threshold = threshold;
|
||||||
this.result = result;
|
this.result = result;
|
||||||
this.countIfUndefined = countIfUndefined;
|
this.countIfUndefined = countIfUndefined;
|
||||||
this.a = a;
|
this.a = a;
|
||||||
this.b = b;
|
this.b = b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public double getThreshold() {
|
||||||
|
return threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setThreshold(double threshold) {
|
||||||
|
this.threshold = threshold;
|
||||||
|
}
|
||||||
|
|
||||||
public double getWeight() {
|
public double getWeight() {
|
||||||
return weight;
|
return weight;
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,7 +47,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
|
||||||
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
|
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -84,6 +84,32 @@ public class TreeNodeStats implements Serializable {
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//returns 1.0 if at least one field result reaches its threshold, 0.0 otherwise
|
||||||
|
public double or(){
|
||||||
|
for (FieldStats fieldStats : this.results.values()) {
|
||||||
|
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
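//returns 0.0 if any defined field result is below its threshold, or if an undefined result (-1) belongs to a field with countIfUndefined set; 1.0 otherwise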
|
||||||
|
public double and() {
|
||||||
|
for (FieldStats fieldStats : this.results.values()) {
|
||||||
|
|
||||||
|
if (fieldStats.getResult() == -1) {
|
||||||
|
if (fieldStats.isCountIfUndefined())
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
public double getFinalScore(AggType aggregation){
|
public double getFinalScore(AggType aggregation){
|
||||||
|
|
||||||
switch (aggregation){
|
switch (aggregation){
|
||||||
|
@ -91,16 +117,16 @@ public class TreeNodeStats implements Serializable {
|
||||||
return scoreSum()/fieldsCount();
|
return scoreSum()/fieldsCount();
|
||||||
case SUM:
|
case SUM:
|
||||||
return scoreSum();
|
return scoreSum();
|
||||||
case SC:
|
|
||||||
case OR:
|
|
||||||
case MAX:
|
case MAX:
|
||||||
return max();
|
return max();
|
||||||
case NC:
|
|
||||||
case AND:
|
|
||||||
case MIN:
|
case MIN:
|
||||||
return min();
|
return min();
|
||||||
case W_MEAN:
|
case W_MEAN:
|
||||||
return weightedScoreSum()/weightSum();
|
return weightedScoreSum()/weightSum();
|
||||||
|
case OR:
|
||||||
|
return or();
|
||||||
|
case AND:
|
||||||
|
return and();
|
||||||
default:
|
default:
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class TreeProcessor{
|
||||||
TreeNodeDef currentNode = config.decisionTree().get(current);
|
TreeNodeDef currentNode = config.decisionTree().get(current);
|
||||||
//throw an exception if the node doesn't exist
|
//throw an exception if the node doesn't exist
|
||||||
if (currentNode == null)
|
if (currentNode == null)
|
||||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
throw new PaceException("Missing tree node: " + current);
|
||||||
|
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
treeStats.addNodeStats(current, stats);
|
treeStats.addNodeStats(current, stats);
|
||||||
|
|
|
@ -18,13 +18,9 @@ import java.util.function.Predicate;
|
||||||
|
|
||||||
public class MapDocumentUtil {
|
public class MapDocumentUtil {
|
||||||
|
|
||||||
|
|
||||||
private static final ObjectMapper mapper = new ObjectMapper();
|
|
||||||
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
||||||
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
|
public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
|
||||||
MapDocument m = new MapDocument();
|
MapDocument m = new MapDocument();
|
||||||
m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
|
m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
|
||||||
|
|
|
@ -100,7 +100,6 @@ that
|
||||||
the
|
the
|
||||||
their
|
their
|
||||||
theirs
|
theirs
|
||||||
them
|
|
||||||
themselves
|
themselves
|
||||||
then
|
then
|
||||||
there
|
there
|
||||||
|
|
|
@ -1,14 +1,3 @@
|
||||||
0
|
|
||||||
1
|
|
||||||
2
|
|
||||||
3
|
|
||||||
4
|
|
||||||
5
|
|
||||||
6
|
|
||||||
7
|
|
||||||
8
|
|
||||||
9
|
|
||||||
_
|
|
||||||
a
|
a
|
||||||
actualmente
|
actualmente
|
||||||
acuerdo
|
acuerdo
|
||||||
|
@ -637,7 +626,6 @@ todavia
|
||||||
todavía
|
todavía
|
||||||
todo
|
todo
|
||||||
todos
|
todos
|
||||||
total
|
|
||||||
trabaja
|
trabaja
|
||||||
trabajais
|
trabajais
|
||||||
trabajamos
|
trabajamos
|
||||||
|
|
|
@ -211,7 +211,6 @@ encore
|
||||||
enfin
|
enfin
|
||||||
entre
|
entre
|
||||||
envers
|
envers
|
||||||
environ
|
|
||||||
es
|
es
|
||||||
essai
|
essai
|
||||||
est
|
est
|
||||||
|
|
|
@ -62,17 +62,16 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
||||||
|
|
||||||
assertEquals(0.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
||||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
||||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
||||||
assertEquals(0.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
||||||
assertEquals(0.0, keywordMatch.distance("University College London", "University of London", conf));
|
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
|
||||||
assertEquals(0.0, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
||||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -65,36 +65,23 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void asMapDocumentTest() throws Exception {
|
public void asMapDocumentTest() {
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||||
|
|
||||||
final String json = readFromClasspath("pub2.json");
|
final String json = readFromClasspath("organization.json");
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||||
|
|
||||||
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
||||||
|
|
||||||
|
|
||||||
System.out.println(mapDocument.getFieldMap().values().stream().map(Field::isEmpty).count());
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJPath() {
|
public void testJPath() {
|
||||||
final String json = readFromClasspath("pub2.json");
|
final String json = readFromClasspath("organization.json");
|
||||||
|
|
||||||
final String jpath ="$.pid";
|
|
||||||
|
|
||||||
|
|
||||||
final List<String> jPathList = MapDocumentUtil.getJPathList(jpath, json, Type.JSON);
|
|
||||||
|
|
||||||
System.out.println("jPathList = " + jPathList);
|
|
||||||
|
|
||||||
|
final String jpath ="$.id";
|
||||||
|
|
||||||
|
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"wf" : {
|
"wf" : {
|
||||||
"threshold" : "0.9",
|
"threshold" : "0.99",
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "legalname",
|
||||||
|
@ -8,7 +8,9 @@
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true",
|
||||||
|
"maxIterations": "20",
|
||||||
|
"idPath": "$.id"
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
@ -23,59 +25,110 @@
|
||||||
{
|
{
|
||||||
"field": "gridid",
|
"field": "gridid",
|
||||||
"comparator": "exactMatch",
|
"comparator": "exactMatch",
|
||||||
"weight": 1.0,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1.0,
|
"threshold": 1,
|
||||||
"aggregation": "MAX",
|
"aggregation": "SC",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "layer2",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer2",
|
"undefined": "layer2",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "false"
|
||||||
},
|
},
|
||||||
"layer2": {
|
"layer2": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "websiteurl",
|
"field": "websiteurl",
|
||||||
"comparator": "domainExactMatch",
|
"comparator": "domainExactMatch",
|
||||||
"weight": 1.0,
|
"weight": 1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "country",
|
"field": "country",
|
||||||
"comparator": "exactMatch",
|
"comparator": "exactMatch",
|
||||||
"weight": 1.0,
|
"weight": 1,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "numbersMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "romansMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1.0,
|
"threshold": 1,
|
||||||
"aggregation": "MIN",
|
"aggregation": "NC",
|
||||||
"positive": "layer3",
|
"positive": "layer3",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer3",
|
"undefined": "layer3",
|
||||||
"ignoreUndefined": "false"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"layer3": {
|
"layer3": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "legalname",
|
"field": "legalname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "cityMatch",
|
||||||
"weight": 0.9,
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.1,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "layer4",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer4": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "keywordMatch",
|
||||||
|
"weight": 1.0,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {
|
"params": {
|
||||||
"windowSize": 4,
|
"windowSize": "4"
|
||||||
"threshold": 0.7
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.7,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "layer5",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer5",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"layer5": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
|
"weight": 0.9,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"field": "legalshortname",
|
"field": "legalshortname",
|
||||||
"comparator": "jaroWinklerNormalizedName",
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
"weight": 0.1,
|
"weight": 0.1,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -90,9 +143,9 @@
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
|
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
|
||||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
||||||
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" },
|
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
|
||||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
|
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid=='grid.ac')].value"}
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {
|
||||||
"legalname" : []
|
"legalname" : []
|
||||||
|
@ -192,7 +245,7 @@
|
||||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
||||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
||||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
||||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
||||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
||||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
||||||
|
@ -202,7 +255,10 @@
|
||||||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
||||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
||||||
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"]
|
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
||||||
|
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
||||||
|
"key::107" : ["agricultural forestry", "af", "a f", "a&f"],
|
||||||
|
"key::108" : ["agricultural mechanical", "am", "a m", "a&m"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1 @@
|
||||||
|
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000000985"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0004 0478 6426"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000126"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000190"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100000205"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005822"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005823"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"100005824"},{"qualifier":{"classid":"OrgRef","classname":"OrgRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"70243"},{"qualifier":{"classid":"Wikidata","classname":"Wikidata","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"Q503577"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.239119.1"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"USDoC"},"websiteurl":{"value":"http://www.commerce.gov/"},"country":{"classid":"US","classname":"United States","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Departamento de Comercio de Estados Unidos"},{"value":"Département du commerce des États-unis"},{"value":"United States Department of Commerce"},{"value":"United States 
Department of Commerce and Labor"}],"legalname":{"value":"United States Department of Commerce"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::051dc42607887282d1939f094e5906f5"}
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"wf" : {
|
"wf" : {
|
||||||
"threshold" : "0.9",
|
"threshold" : "0.99",
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "legalname",
|
||||||
|
@ -8,7 +8,8 @@
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true",
|
||||||
|
"maxIterations": "20"
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
@ -18,21 +19,124 @@
|
||||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
"start": {
|
||||||
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
"fields": [
|
||||||
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
|
{
|
||||||
|
"field": "gridid",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1,
|
||||||
|
"aggregation": "SC",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer2",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"layer2": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "websiteurl",
|
||||||
|
"comparator": "domainExactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "country",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1,
|
||||||
|
"aggregation": "NC",
|
||||||
|
"positive": "layer3",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer3",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer3": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "cityMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4",
|
||||||
|
"threshold": "0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1.0,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "layer4",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer4": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "keywordMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4",
|
||||||
|
"threshold": "0.7"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1.0,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "layer5",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "layer5",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"layer5": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "legalname",
|
||||||
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
|
"weight": 0.9,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
"windowSize": "4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "legalshortname",
|
||||||
|
"comparator": "jaroWinklerNormalizedName",
|
||||||
|
"weight": 0.1,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.9,
|
||||||
|
"aggregation": "W_MEAN",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
|
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
|
||||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
|
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
|
||||||
{ "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" },
|
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
|
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
|
||||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
|
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {
|
||||||
"legalname" : []
|
"legalname" : []
|
||||||
},
|
},
|
||||||
"synonyms": {
|
"synonyms": {}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue