diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java index ead0503..c1a0284 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java @@ -33,13 +33,13 @@ public class SparkTest { public static void main(String[] args) { final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]")); - final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/authors.json");//"/eu/dnetlib/pace/orgs2.json"); + final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/organization.to.fix.json"); final JavaRDD dataRDD = context.textFile(dataset.getPath()); counter = new SparkCounter(context); //read the configuration from the classpath - final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/authors.test.pace.conf"));//"/eu/dnetlib/pace/organization.test2.pace.conf")); + final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/org.curr.conf")); BlockProcessor.constructAccumulator(config); BlockProcessor.accumulators.forEach(acc -> { @@ -57,16 +57,26 @@ public class SparkTest { RDD> vertexes = mapDocs.mapToPair(t -> new Tuple2( (long) t._1().hashCode(), t._2())).rdd(); //create relations between documents - final JavaPairRDD relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id + JavaPairRDD> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id //Clustering: from to List .flatMapToPair(a -> { final MapDocument currentDocument = a._2(); return getGroupingKeys(config, currentDocument).stream() .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator(); - }).groupByKey() //group documents basing on the key - //create relations by comparing only elements in the same group - 
.flatMapToPair(it -> { + }).groupByKey();//group documents basing on the key + + //print blocks + blocks.foreach(b -> { + String print = b._1() + ": "; + for (MapDocument doc : b._2()) { + print += doc.getIdentifier() + " "; + } + System.out.println(print); + }); + + //create relations by comparing only elements in the same group + final JavaPairRDD relationRDD = blocks.flatMapToPair(it -> { final SparkReporter reporter = new SparkReporter(counter); new BlockProcessor(config).process(it._1(), it._2(), reporter); return reporter.getReport().iterator(); @@ -85,12 +95,22 @@ public class SparkTest { counter.getAccumulators().values().forEach(it-> System.out.println(it.getGroup()+" "+it.getName()+" -->"+it.value())); - connectedComponents.foreach(cc -> System.out.println("cc = " + cc.toString() + " size =" + cc.getDocs().size())); - nonDeduplicated.foreach(cc -> System.out.println("nd = " + cc.toString())); + //print deduped + connectedComponents.foreach(cc -> { + System.out.println("cc = " + cc.getId()); + for (MapDocument doc: cc.getDocs()) { + System.out.println(doc.getIdentifier() + "; ln: " + doc.getFieldMap().get("legalname").stringValue() + "; sn: " + doc.getFieldMap().get("legalshortname").stringValue()); + } + }); + //print nondeduped + nonDeduplicated.foreach(cc -> { + System.out.println("nd = " + cc.getId()); + System.out.println(cc.getDocs().iterator().next().getFieldMap().get("legalname").stringValue() + "; sn: " + cc.getDocs().iterator().next().getFieldMap().get("legalshortname").stringValue()); + }); - //print ids -// ccs.foreach(cc -> System.out.println(cc.getId())); -// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); + //print ids +//// ccs.foreach(cc -> System.out.println(cc.getId())); +//// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); } diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf 
b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf new file mode 100644 index 0000000..fd4fbbe --- /dev/null +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -0,0 +1,36 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "exactMatch", "fields" : [ "country" ] }, + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + ], + "model" : [ + { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : 
"organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { } + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json new file mode 100644 index 0000000..b56a10f --- /dev/null +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json @@ -0,0 +1,24 @@ +{"dateoftransformation": "2018-09-13", "originalId": ["opendoar____::Fonds_zur_F\u00f6rderung_der_wissenschaftlichen_Forschung_(Austrian_Science_Fund)"], "collectedfrom": [{"value": "OpenDOAR", "key": "10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "FWF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.fwf.ac.at/"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Fonds zur F\u00f6rderung der wissenschaftlichen Forschung (Austrian Science Fund)"}, "country": {"classid": "AT", "classname": "Austria", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2015-08-24", "type": 20, "id": "20|opendoar____::77e7cd67c60d0c18aa835ea6ea58122c"} +{"dateoftransformation": "2018-12-15", "originalId": ["corda__h2020::998735960"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020", "key": "10|openaire____::a55eb91348674d853191f4f4fd73d078"}], "organization": 
{"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "FWF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.fwf.ac.at"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "FONDS ZUR F\u00d6RDERUNG DER WISSENSCHAFTLICHEN FORSCHUNG"}, "country": {"classid": "AT", "classname": "Austria", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-03-12", "type": 20, "id": "20|corda__h2020::83f579158b682262181b9a8ffdfa1124"} +{"dateoftransformation": "2018-11-20", "originalId": ["corda_______::998735960"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse", "key": "10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}], "organization": {"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "FWF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.fwf.ac.at"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "FONDS ZUR F\u00d6RDERUNG DER WISSENSCHAFTLICHEN FORSCHUNG"}, "country": {"classid": "AT", "classname": "Austria", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}}}, "dateofcollection": "2018-03-12", "type": 20, "id": "20|corda_______::83f579158b682262181b9a8ffdfa1124"} +{"dateoftransformation": "2018-09-27", "originalId": ["re3data_____::9f4430cdb5474d6db4bf84834533a7c9"], "collectedfrom": [{"value": "Registry of Research Data 
Repository", "key": "10|openaire____::21f8a223b9925c2f87c404096080b046"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "FWF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "https://www.fwf.ac.at/en/"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Fonds zur F\u00f6rderung der wissenschaftlichen Forschung"}, "country": {"classid": "AT", "classname": "Austria", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-09-27", "type": 20, "id": "20|re3data_____::a3ac0376cc2a582357d821cec70a3e5b"} +{"dateoftransformation": "2018-12-15", "originalId": ["corda__h2020::999861936"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020", "key": "10|openaire____::a55eb91348674d853191f4f4fd73d078"}], "organization": {"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "UNITO"}, "ecresearchorganization": {"value": "true"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.unito.it"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "UNIVERSITA DEGLI STUDI DI TORINO"}, "country": {"classid": "IT", "classname": "Italy", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "true"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-03-12", "type": 20, "id": "20|corda__h2020::ef77a7bbe5796b0b47aa60947a5c6f41"} +{"dateoftransformation": "2018-11-20", 
"originalId": ["corda_______::999861936"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse", "key": "10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}], "organization": {"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "UNITO"}, "ecresearchorganization": {"value": "true"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.unito.it"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "UNIVERSITA DEGLI STUDI DI TORINO"}, "country": {"classid": "IT", "classname": "Italy", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "true"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-03-12", "type": 20, "id": "20|corda_______::ef77a7bbe5796b0b47aa60947a5c6f41"} +{"dateoftransformation": "2018-09-13", "originalId": ["nih_________::UNIVERSITA_DI_TORINO"], "collectedfrom": [{"value": "NIH - National Institutes of Health", "key": "10|openaire____::9e9e8c76d739212c63eff362e321ba33"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecresearchorganization": {"value": "false"}, "ecenterprise": {"value": "false"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "UNIVERSITA DI TORINO"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-07-11", "type": 20, "id": "20|nih_________::fdd37fcef9df7c69ae7d620bf21ab272"} +{"dateoftransformation": "2018-09-19", "originalId": ["doajarticles::Universit\u00e0_degli_Studi_di_Torino"], "collectedfrom": [{"value": "DOAJ-Articles", "key": 
"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "Universit\u00e0 degli Studi di Torino"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Universit\u00e0 degli Studi di Torino"}, "country": {"classid": "IT", "classname": "Italy", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-09-19", "type": 20, "id": "20|doajarticles::f7ef827f8fe1d870b6464ef1affc9605"} +{"dateoftransformation": "2018-11-12", "originalId": ["opendoar____::Universit\u00e0_degli_Studi_di_Torino"], "collectedfrom": [{"value": "OpenDOAR", "key": "10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecresearchorganization": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.unito.it/"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Universit\u00e0 degli Studi di Torino"}, "country": {"classid": "IT", "classname": "Italy", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-11-12", "type": 20, "id": "20|opendoar____::f7ef827f8fe1d870b6464ef1affc9605"} +{"collectedfrom": [{"value": "GRID - Global Research Identifier Database", "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"}], 
"organization": {"metadata": {"legalshortname": {"value": "RPF"}, "websiteurl": {"value": "http://www.research.org.cy/EN/index.html/"}, "country": {"classid": "CY", "classname": "Cyprus", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "legalname": {"value": "RPF"}}}, "pid": [{"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.14751.36"}], "id": "20|grid________::4f35352983a82950563eadfea49dc867", "type": 20} +{"collectedfrom": [{"value": "GRID - Global Research Identifier Database", "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"}], "organization": {"metadata": {"legalshortname": {"value": "RPF"}, "websiteurl": {"value": "http://www.research.org.cy/EN/index.html/"}, "country": {"classid": "CY", "classname": "Cyprus", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "legalname": {"value": "Research Promotion Foundation"}}}, "pid": [{"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.14751.36"}], "id": "20|grid________::a42b3c67ea94b54ee941fb42fefd51d6", "type": 20} +{"dateoftransformation": "2018-08-08", "originalId": ["corda__h2020::999946035"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020", "key": "10|openaire____::a55eb91348674d853191f4f4fd73d078"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "RPF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.research.org.cy"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "IDRYMA PROOTHISIS EREVNAS"}, "country": {"classid": "CY", "classname": "Cyprus", "schemename": 
"dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-01-21", "type": 20, "id": "20|corda__h2020::a16918f80d830bf2b6daa5ec304f0e31"} +{"dateoftransformation": "2018-08-08", "originalId": ["corda_______::999946035"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse", "key": "10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "RPF"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.research.org.cy"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "RESEARCH PROMOTION FOUNDATION"}, "country": {"classid": "CY", "classname": "Cyprus", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2015-09-10", "type": 20, "id": "20|corda_______::a16918f80d830bf2b6daa5ec304f0e31"} +{"collectedfrom": [{"value": "GRID - Global Research Identifier Database", "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"}], "organization": {"metadata": {"legalshortname": {"value": "DFG"}, "websiteurl": {"value": "http://www.dfg.de/en/"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "legalname": {"value": "Deutsche Forschungsgemeinschaft"}}}, "pid": [{"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.424150.6"}], "id": "20|grid________::7d83de934ecd5091d83334f752cef22c", "type": 20} +{"dateoftransformation": "2018-08-08", "originalId": 
["corda_______::999547462"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse", "key": "10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}], "organization": {"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "ecnonprofit": {"value": "true"}, "websiteurl": {"value": "http://www.dfg.de"}, "ecnutscode": {"value": "false"}, "legalname": {"value": "DEUTSCHE FORSCHUNGSGEMEINSCHAFT"}}}, "dateofcollection": "2015-09-10", "type": 20, "id": "20|corda_______::3f41cfb7d56cfea69f3ce9792b822eb4"} +{"dateoftransformation": "2018-09-28", "originalId": ["dfgf________::DFG"], "collectedfrom": [{"value": "", "key": ""}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "DFG"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Deutsche Forschungsgemeinschaft"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-09-28", "type": 20, "id": "20|dfgf________::3bbe57698e353a2acaa03306316658bb"} +{"dateoftransformation": "2018-09-28", "originalId": ["dfgf________::DFGF"], "collectedfrom": [{"value": "", "key": ""}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "DFG"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Deutsche 
Forschungsgemeinschaft"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2018-09-28", "type": 20, "id": "20|dfgf________::14a2847759c496334d510ff8fafbd464"} +{"dateoftransformation": "2018-06-04", "originalId": ["re3data_____::bf9c8e5c69ff222e3ee2ff0fc4d2b289"], "collectedfrom": [{"value": "Registry of Research Data Repository", "key": "10|openaire____::21f8a223b9925c2f87c404096080b046"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "German Research Foundation"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.dfg.de/"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "Deutsche Forschungsgemeinschaft"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-01-07", "type": 20, "id": "20|re3data_____::fbb08ab5e8cf8cd1056f61b84ddf05dd"} +{"originalId": ["https://academic.microsoft.com/#/detail/87707601"], "pid": [{"qualifier": {"classid": "urn", "classname": "urn", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "http://en.wikipedia.org/wiki/Deutsche_Forschungsgemeinschaft"}, {"qualifier": {"classid": "mag_id", "classname": "Microsoft Academic Graph Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "https://academic.microsoft.com/#/detail/87707601"}, {"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.424150.6"}], "collectedfrom": [{"value": "Microsoft Academic Graph", "key": 
"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}], "organization": {"metadata": {"websiteurl": {"value": "http://www.dfg.de/"}, "legalname": {"value": "Deutsche Forschungsgemeinschaft"}}}, "type": 20, "id": "20|microsoft___::e2edddabcc31b692b4ca7b89456e750a"} +{"dateoftransformation": "2018-08-08", "originalId": ["corda__h2020::999547462"], "collectedfrom": [{"value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020", "key": "10|openaire____::a55eb91348674d853191f4f4fd73d078"}], "organization": {"metadata": {"eclegalbody": {"value": "true"}, "eclegalperson": {"value": "true"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "DFG"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "true"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.dfg.de"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "DEUTSCHE FORSCHUNGSGEMEINSCHAFT"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-01-21", "type": 20, "id": "20|corda__h2020::3f41cfb7d56cfea69f3ce9792b822eb4"} +{"dateoftransformation": "2018-06-04", "originalId": ["re3data_____::64ef0759fcfccf84cca028ba3c21aa23"], "collectedfrom": [{"value": "Registry of Research Data Repository", "key": "10|openaire____::21f8a223b9925c2f87c404096080b046"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "Deutsche Forschungsgemeinschaft"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.dfg.de/en/index.jsp"}, "ecnutscode": {"value": "false"}, 
"ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "German Research Foundation"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-01-07", "type": 20, "id": "20|re3data_____::e029b7e0de6cafc0c7126615c65458f0"} +{"dateoftransformation": "2018-06-04", "originalId": ["re3data_____::37e3bba353f88b4649d459c698483f6e"], "collectedfrom": [{"value": "Registry of Research Data Repository", "key": "10|openaire____::21f8a223b9925c2f87c404096080b046"}], "organization": {"metadata": {"eclegalbody": {"value": "false"}, "eclegalperson": {"value": "false"}, "ecinternationalorganization": {"value": "false"}, "legalshortname": {"value": "Deutsche Forschungsgemeinschaft"}, "ecresearchorganization": {"value": "false"}, "ecnonprofit": {"value": "false"}, "ecenterprise": {"value": "false"}, "websiteurl": {"value": "http://www.dfg.de/en/index.jsp"}, "ecnutscode": {"value": "false"}, "ecinternationalorganizationeurinterests": {"value": "false"}, "legalname": {"value": "German Research Association"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "echighereducation": {"value": "false"}, "ecsmevalidated": {"value": "false"}}}, "dateofcollection": "2016-01-07", "type": 20, "id": "20|re3data_____::2080dc170e6cd7c6c06f403f8a08c1be"} +{"collectedfrom": [{"value": "GRID - Global Research Identifier Database", "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"}], "organization": {"metadata": {"legalshortname": {"value": "DFG"}, "websiteurl": {"value": "http://www.dfg.de/en/"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "legalname": {"value": "DFG"}}}, "pid": [{"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", 
"schemeid": "dnet:pid_types"}, "value": "grid.424150.6"}], "id": "20|grid________::085fd89ec6f3f92c354e0bc027de2a58", "type": 20} +{"collectedfrom": [{"value": "GRID - Global Research Identifier Database", "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"}], "organization": {"metadata": {"legalshortname": {"value": "DFG"}, "websiteurl": {"value": "http://www.dfg.de/en/"}, "country": {"classid": "DE", "classname": "Germany", "schemename": "dnet:countries", "schemeid": "dnet:countries"}, "legalname": {"value": "German Research Foundation"}}}, "pid": [{"qualifier": {"classid": "grid", "classname": "grid", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "grid.424150.6"}], "id": "20|grid________::f0d88189673738d2a565aff99eeb59a2", "type": 20} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/tree/TreeProcessingTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/tree/TreeProcessingTest.java deleted file mode 100644 index f232752..0000000 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/tree/TreeProcessingTest.java +++ /dev/null @@ -1,93 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.tree.support.MatchType; -import eu.dnetlib.pace.util.BlockProcessor; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; - -import static org.junit.Assert.assertTrue; - -public class TreeProcessingTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(TreeProcessingTest.class); - - private DedupConfig config; - - @Before - public void setup(){ - config = getAuthorsTestConf(); - } - - @Test - public void testOrcidMatch (){ - - final MapDocument authorA = author("id1", "1", "john", "smith", "smith, john", new 
Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5")); - final MapDocument authorB = author("id2", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID2", "pubDOI2", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5")); - final MapDocument authorC = author("id1", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0001", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5")); - - log.info("Author 1 = " + authorA); - log.info("Author 2 = " + authorB); - log.info("Author 3 = " + authorC); - - MatchType matchType1 = new BlockProcessor(config).navigateTree(authorA, authorB); - MatchType matchType2 = new BlockProcessor(config).navigateTree(authorA, authorC); - - log.info("1 vs 2 Match Type = " + matchType1); - log.info("1 vs 3 Match Type = " + matchType2); - - assertTrue(matchType1 == MatchType.ORCID_MATCH); - assertTrue(matchType2 == MatchType.NO_MATCH); - } - - @Test - public void testCoauthorsMatch() { - final MapDocument authorA = author("id1", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - final MapDocument authorB = author("id2", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID2", "pubDOI2", 1, "", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - - log.info("Author 1 = " + authorA); - log.info("Author 2 = " + authorB); - - MatchType matchType = new BlockProcessor(config).navigateTree(authorA, authorB); - - log.info("Match Type = " + matchType); - - assertTrue(matchType == MatchType.COAUTHORS_MATCH); - } - - @Test - public void testTopicsMatch() { - final MapDocument authorA = 
author("id1", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - final MapDocument authorB = author("id2", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID2", "pubDOI2", 1, "", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5")); - - log.info("Author 1 = " + authorA); - log.info("Author 2 = " + authorB); - - MatchType matchType = new BlockProcessor(config).navigateTree(authorA, authorB); - - log.info("Match Type = " + matchType); - - assertTrue(matchType == MatchType.TOPICS_MATCH); - } - - @Test - public void testNoMatch() { - - final MapDocument authorA = author("id1", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - final MapDocument authorB = author("id1", "1", "john", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "0000-0000-0000-0000", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - final MapDocument authorC = author("id2", "1", "jesus f.", "smith", "smith, john", new Double[]{0.0,0.5,0.0,0.5}, "pubID1", "pubDOI1", 1, "", Arrays.asList("coauthor1", "coauthor2", "coauthor3", "coauthor4", "coauthor5", "coauthor6")); - - MatchType matchType1 = new BlockProcessor(config).navigateTree(authorA,authorB); - MatchType matchType2 = new BlockProcessor(config).navigateTree(authorA,authorC); - - assertTrue(matchType1 == MatchType.NO_MATCH); //same identifier - assertTrue(matchType2 == MatchType.NO_MATCH); //not similar firstname - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java 
index f9192ad..1782b87 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,16 +1,15 @@ package eu.dnetlib.pace.clustering; -import java.util.*; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import com.google.common.collect.Sets; - import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { protected Map params; @@ -26,7 +25,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::normalize) - .map(s -> filterStopWords(s, stopwords)) + .map(s -> filterAllStopWords(s)) .map(this::doApply) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 7572411..977c5c4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -27,7 +27,12 @@ import java.util.regex.Pattern; */ public abstract class AbstractPaceFunctions { - protected static Set stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); + 
protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); + protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); + protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); + protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); @@ -42,8 +47,9 @@ public abstract class AbstractPaceFunctions { } protected String cleanup(final String s) { - final String s1 = nfd(s); - final String s2 = fixAliases(s1); + final String s0 = s.toLowerCase(); + final String s1 = fixAliases(s0); + final String s2 = nfd(s1); final String s3 = s2.replaceAll("–", " "); final String s4 = s3.replaceAll("&", " "); final String s5 = s4.replaceAll(""", " "); @@ -140,6 +146,18 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } + protected String filterAllStopWords(String s) { + + s = filterStopWords(s, stopwords_en); + s = filterStopWords(s, stopwords_de); + s = filterStopWords(s, stopwords_it); + s = filterStopWords(s, stopwords_fr); + s = filterStopWords(s, stopwords_pt); + s = filterStopWords(s, stopwords_es); + + return s; + } + protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { final Set newset = Sets.newLinkedHashSet(); for (final String s : set) { @@ -192,15 +210,7 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } - public String normalizeCities(String s1, Map cityMap){ - //TODO change normalization mode - - for (String city : cityMap.keySet()) - s1 = s1.replaceAll(" " + city + " ", " " + cityMap.get(city) + " "); - return s1; - } - - public String normalizeCities2 (String s1, Map cityMap, int windowSize){ + public String keywordsToCode(String s1, Map translationMap, int windowSize){ List tokens = Arrays.asList(s1.split(" 
")); @@ -213,9 +223,8 @@ public abstract class AbstractPaceFunctions { for (int i = 0; i<=tokens.size()-length; i++){ String candidate = Joiner.on(" ").join(tokens.subList(i, i + length)); - if (cityMap.containsKey(candidate)) { - s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + cityMap.get(candidate) + " "); - return s1; + if (translationMap.containsKey(candidate)) { + s1 = (" " + s1 + " ").replaceAll(" " + candidate + " ", " " + translationMap.get(candidate) + " "); } } length-=1; @@ -229,9 +238,20 @@ public abstract class AbstractPaceFunctions { final String regexKey = "\\bkey::[0-9]*\\b"; final String regexCity = "\\bcity::[0-9]*\\b"; return s.replaceAll(regexKey, "").replaceAll(regexCity, "").trim(); - } + public double keywordsCompare(String s1, String s2){ + + List keywords1 = getKeywords(s1); + List keywords2 = getKeywords(s2); + int longer = (keywords1.size()>keywords2.size())?keywords1.size():keywords2.size(); + + if (getKeywords(s1).isEmpty() || getKeywords(s2).isEmpty()) + return 1.0; + else + return (double)CollectionUtils.intersection(getKeywords(s1),getKeywords(s2)).size()/(double)longer; + } + //check if 2 strings have same keywords public boolean sameKeywords(String s1, String s2){ //at least 1 keyword in common diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index ebaa0ea..fea74af 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -47,27 +47,25 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { cb = removeStopwords(cb); //replace keywords with codes - ca = translate(ca, translationMap); - cb = translate(cb, translationMap); + String codesA = keywordsToCode(ca, translationMap, params.getOrDefault("windowSize", 
4).intValue()); + String codesB = keywordsToCode(cb, translationMap, params.getOrDefault("windowSize",4).intValue()); //replace cities with codes -// String norm = normalizeCities(" " + ca + " ||| " + cb + " ", cityMap); -// ca = norm.split("\\|\\|\\|")[0].trim(); -// cb = norm.split("\\|\\|\\|")[1].trim(); + codesA = keywordsToCode(codesA, cityMap, params.getOrDefault("windowSize", 4).intValue()); + codesB = keywordsToCode(codesB, cityMap, params.getOrDefault("windowSize", 4).intValue()); - ca = normalizeCities2(ca, cityMap, params.getOrDefault("windowSize", 4).intValue()); - cb = normalizeCities2(cb, cityMap, params.getOrDefault("windowSize", 4).intValue()); - - if (sameCity(ca,cb)){ - if (sameKeywords(ca,cb)){ - ca = removeCodes(ca); - cb = removeCodes(cb); - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return normalize(ssalgo.score(ca,cb)); - } + //if two names have same city + if (sameCity(codesA,codesB)){ + if (keywordsCompare(codesA, codesB)>params.getOrDefault("threshold", 0.5).doubleValue()) { + ca = removeCodes(codesA); + cb = removeCodes(codesB); + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca,cb)); + } } + return 0.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 1cd0eb3..01da9c2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -9,8 +9,6 @@ import eu.dnetlib.pace.distance.eval.ScoreResult; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; -import eu.dnetlib.pace.model.TreeNodeDef; -import eu.dnetlib.pace.tree.support.MatchType; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -45,67 +43,13 @@ public class 
BlockProcessor { if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); //process(q, context); + process(simplifyQueue(q, key, context), context); - //process the decision tree if it is specified, otherwise go with conditions and distance algos - if (!dedupConf.getPace().getDecisionTree().isEmpty()){ - processPersons(q, context); - } - else { - process(simplifyQueue(q, key, context), context); - } } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); } } - private void processPersons(final Queue queue, final Reporter context) { - - while (!queue.isEmpty()) { - - final MapDocument pivot = queue.remove(); //take first element of the queue - final String idPivot = pivot.getIdentifier(); - - //compare the first element with all the others - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); - - //check if pivot and current element are similar by processing the tree - if (navigateTree(pivot, curr)!=MatchType.NO_MATCH) - writeSimilarity(context, idPivot, idCurr); - } - } - } - - public MatchType navigateTree(final MapDocument doc1, final MapDocument doc2){ - - final Map decisionTree = dedupConf.getPace().getDecisionTree(); - - String current = "start"; - - while (MatchType.getEnum(current)==MatchType.UNDEFINED) { - - TreeNodeDef currentNode = decisionTree.get(current); - //throw an exception if the node doesn't exist - if (currentNode == null) - throw new PaceException("The Tree Node doesn't exist: " + current); - - double similarity = currentNode.evaluate(doc1, doc2); - - if (similarity == -1) { - current = currentNode.getUndefined(); - } - else if (similarity>=currentNode.getThreshold()){ - current = currentNode.getPositive(); - } - else { - current = currentNode.getNegative(); - } - - } - - return MatchType.getEnum(current); - } - private Queue prepare(final Iterable documents) { final Queue queue = new PriorityQueue<>(100, new 
MapDocumentComparator(dedupConf.getWf().getOrderField())); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv index 720b823..5f70a37 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/city_map.csv @@ -8595,4 +8595,4 @@ city::890299;Harare;Arare;Charare;Gorad Kharareh;HRE;Harare;Hararensis Urbs;Hara city::890422;Gweru;GWE;Gveru;Gwelo;Gweru;Гверу; city::893697;Chinhoyi;Chinhoyi;Chinkhoi;Chinkhoji;Chinoyi;Cinhojis;Sinoia;qi nuo yi;Činhojis;Чинхойи;Чинхої;Чинхоји;چینہوئی;奇諾伊; city::894701;Bulawayo;BUQ;Bulavajas;Bulavajo;Bulavejo;Bulawayo;Gorad Bulavajo;bu la wa yue;bullawayo;burawayo;Булавайо;Булавајо;Булавейо;Горад Булаваё;ブラワヨ;布拉瓦约;불라와요; -city::1106542;Chitungwiza;Chitungviza;Chitungwiza;Citungviza;Gorad Chytungviza;chytwngwyza;Čitungviza;Горад Чытунгвіза;Читунгвиза;Читунгвіза;چیتونگویزا; +city::1106542;Chitungwiza;Chitungviza;Chitungwiza;Citungviza;Gorad Chytungviza;chytwngwyza;Čitungviza;Горад Чытунгвіза;Читунгвиза;Читунгвіза;چیتونگویزا; \ No newline at end of file diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt index 2003b42..2ce975b 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_it.txt @@ -611,7 +611,6 @@ terzo th ti titolo -torino tra tranne tre diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index cd14796..55d114c 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -1,11 +1,11 @@ 
-key::1;university;università;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο +key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;Uniwersytet;университет;universiteit;πανεπιστήμιο key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::5;hospital;ospedale;hôpital;hospital;hospital;Krankenhaus;szpital;больница;ziekenhuis;νοσοκομείο -key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα +key::6;research;ricerca;recherche;investigacion;pesquisa;Forschung;badania;исследования;onderzoek;έρευνα;erevna;erevnas key::7;college;collegio;université;colegio;faculdade;Hochschule;Szkoła Wyższa;Высшая школа;universiteit;κολλέγιο -key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα +key::8;foundation;fondazione;fondation;fundación;fundação;Stiftung;Fundacja;фонд;stichting;ίδρυμα;idryma key::9;center;centro;centre;centro;centro;zentrum;centrum;центр;centrum;κέντρο key::10;national;nazionale;national;nationale;nationaux;nationales;nacional;nacional;national;krajowy;национальный;nationaal;nationale;εθνικό key::11;association;associazione;association;asociación;associação;Verein;verband;stowarzyszenie;ассоциация;associatie @@ -44,4 +44,60 @@ key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρω key::44;academic;accademico;académique;universitaire;акадеческий 
academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί key::45;institution;istituzione;institution;институциональный;instelling;ινστιτούτο key::46;division;divisione;division;отделение;divisie;τμήμα -key::47;committee;comitato;comité;комитет;commissie;επιτροπή \ No newline at end of file +key::47;committee;comitato;comité;комитет;commissie;επιτροπή +key::48;promotion;promozione;продвижение;proothisis;forderung +key::49;medical;medicine;clinical;medicina;clinici;médico;medicina;clínica;médico;medicina;clínica;medizinisch;Medizin;klinisch;medisch;geneeskunde;klinisch;ιατρικός;ιατρική;ιατρικό;ιατρικά;κλινικός;κλινική;κλινικό;κλινικά;tıbbi;tıp;klinik;orvosi;orvostudomány;klinikai;zdravniški;medicinski;klinični;meditsiini;kliinik;kliiniline; +key::50;technology;technological;tecnologia;tecnologie;tecnología;tecnológico;tecnologia;tecnológico;Technologie;technologisch;technologie;technologisch;τεχνολογία;τεχνολογικός;τεχνολογική;τεχνολογικό;teknoloji;teknolojik;technológia;technológiai;tehnologija;tehnološki;tehnoloogia;tehnoloogiline; +key::51;science;scientific;scienza;scientifiche;scienze;ciencia;científico;ciência;científico;Wissenschaft;wissenschaftlich;wetenschap;wetenschappelijk;επιστήμη;επιστημονικός;επιστημονική;επιστημονικό;επιστημονικά;bilim;bilimsel;tudomány;tudományos;znanost;znanstveni;teadus;teaduslik; +key::52;engineering;ingegneria;ingeniería;engenharia;Ingenieurwissenschaft;ingenieurswetenschappen;bouwkunde;μηχανικός;μηχανική;μηχανικό;mühendislik;mérnöki;Inženirstvo;inseneeria;inseneri; +key::53;management;gestione;gestionale;gestionali;gestión;administración;gestão;administração;Verwaltung;management;διαχείριση;yönetim;menedzsment;vodstvo;upravljanje;management;juhtkond;juhtimine;haldus; +key::54;energy;energia;energía;energia;Energie;energie;ενέργεια;enerji;energia;energija;energia; 
+key::55;agricultural;agriculture;agricoltura;agricole;agrícola;agricultura;agrícola;agricultura;landwirtschaftlich;Landwirtschaft;landbouwkundig;landbouw;αγροτικός;αγροτική;αγροτικό;γεωργικός;γεωργική;γεωργικό;γεωργία;tarımsal;tarım;mezőgazdasági;mezőgazdaság;poljedelski;poljedelstvo;põllumajandus;põllumajanduslik; +key::56;information;informazione;información;informação;Information;informatie;πληροφορία;bilgi;információ;informacija;informatsioon; +key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινωνικός;κοινωνική;κοινωνικό;κοινωνικά;sosyal;szociális;družbeni;sotsiaal;sotsiaalne; +key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;; +key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri; +key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline; +key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu; +key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu; +key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid; +key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus; 
+key::65;therapeutics;terapeutica;terapéutica;terapêutica;therapie;θεραπευτική;tedavibilimi;gyógykezelés;terapevtika;terapeutiline;ravi; +key::66;oncology;oncologia;oncologico;oncología;oncologia;Onkologie;oncologie;ογκολογία;onkoloji;onkológia;onkologija;onkoloogia; +key::67;natural;naturali;naturale;natural;natural;natürlich;natuurlijk;φυσικός;φυσική;φυσικό;φυσικά;doğal;természetes;naraven;loodus; +key::68;educational;educazione;pedagogia;educacional;educativo;educacional;pädagogisch;educatief;εκπαιδευτικός;εκπαιδευτική;εκπαιδευτικό;εκπαιδευτικά;eğitimsel;oktatási;izobraževalen;haridus;hariduslik; +key::69;biomedical;biomedica;biomédico;biomédico;biomedizinisch;biomedisch;βιοιατρικός;βιοιατρική;βιοιατρικό;βιοιατρικά;biyomedikal;orvosbiológiai;biomedicinski;biomeditsiiniline; +key::70;veterinary;veterinaria;veterinarie;veterinaria;veterinária;tierärtzlich;veterinair;veeartsenijlkunde;κτηνιατρικός;κτηνιατρική;κτηνιατρικό;κτηνιατρικά;veteriner;állatorvosi;veterinar;veterinarski;veterinaaria; +key::71;chemistry;chimica;química;química;Chemie;chemie;scheikunde;χημεία;kimya;kémia;kemija;keemia; +key::72;security;sicurezza;seguridad;segurança;Sicherheit;veiligheid;ασφάλεια;güvenlik;biztonsági;varnost;turvalisus;julgeolek; +key::73;biotechnology;biotecnologia;biotecnologie;biotecnología;biotecnologia;Biotechnologie;biotechnologie;βιοτεχνολογία;biyoteknoloji;biotechnológia;biotehnologija;biotehnoloogia; +key::74;military;militare;militari;militar;militar;Militär;militair;leger;στρατιωτικός;στρατιωτική;στρατιωτικό;στρατιωτικά;askeri;katonai;vojaški;vojni;militaar; +key::75;theological;teologia;teologico;teológico;teológica;theologisch;theologisch;θεολογικός;θεολογική;θεολογικό;θεολογικά;teolojik;teológiai;teološki;teoloogia;usuteadus;teoloogiline; +key::76;electronics;elettronica;electrónica;eletrônicos;Elektronik;elektronica;ηλεκτρονική;elektronik;elektronika;elektronika;elektroonika;
+key::77;forestry;forestale;forestali;silvicultura;forestal;floresta;Forstwirtschaft;bosbouw;δασοκομία;δασολογία;ormancılık;erdészet;gozdarstvo;metsandus; +key::78;maritime;marittima;marittime;marittimo;marítimo;marítimo;maritiem;ναυτικός;ναυτική;ναυτικό;ναυτικά;ναυτιλιακός;ναυτιλιακή;ναυτιλιακό;ναυτιλιακά;θαλάσσιος;θαλάσσια;θαλάσσιο;denizcilik;tengeri;morski;mere;merendus; +key::79;sports;sport;deportes;esportes;Sport;sport;sportwetenschappen;άθληση;γυμναστικήδραστηριότητα;spor;sport;šport;sport;spordi; +key::80;surgery;chirurgia;chirurgiche;cirugía;cirurgia;Chirurgie;chirurgie;heelkunde;εγχείρηση;επέμβαση;χειρουργικήεπέμβαση;cerrahi;sebészet;kirurgija;kirurgia; +key::81;cultural;culturale;culturali;cultura;cultural;cultural;kulturell;cultureel;πολιτιστικός;πολιτιστική;πολιτιστικό;πολιτισμικός;πολιτισμική;πολιτισμικό;kültürel;kultúrális;kulturni;kultuuri;kultuuriline; +key::82;computerscience;informatica;ordenador;computadora;informática;computación;cienciasdelacomputación;ciênciadacomputação;Computer;computer;υπολογιστής;ηλεκτρονικόςυπολογιστής;bilgisayar;számítógép;računalnik;arvuti; +key::83;finance;financial;finanza;finanziarie;finanza;financiero;finanças;financeiro;Finanzen;finanziell;financiën;financieel;χρηματοοικονομικά;χρηματοδότηση;finanse;finansal;pénzügy;pénzügyi;finance;finančni;finants;finantsiline; +key::84;communication;comunicazione;comunicación;comunicação;Kommunikation;communication;επικοινωνία;iletişim;kommunikáció;komuniciranje;kommunikatsioon; +key::85;justice;giustizia;justicia;justiça;Recht;Justiz;justitie;gerechtigheid;δικαιοσύνη;υπουργείοδικαιοσύνης;δίκαιο;adalet;igazságügy;pravo;õigus; +key::86;aerospace;aerospaziale;aerospaziali;aeroespacio;aeroespaço;Luftfahrt;luchtvaart;ruimtevaart;αεροπορικός;αεροπορική;αεροπορικό;αεροναυπηγικός;αεροναυπηγική;αεροναυπηγικό;αεροναυπηγικά;havacılıkveuzay;légtér;zrakoplovstvo;atmosfäär;kosmos;
+key::87;dermatology;dermatologia;dermatología;dermatologia;Dermatologie;dermatologie;δερματολογία;dermatoloji;bőrgyógyászat;dermatológia;dermatologija;dermatoloogia; +key::88;architecture;architettura;arquitectura;arquitetura;Architektur;architectuur;αρχιτεκτονική;mimarlık;építészet;arhitektura;arhitektuur; +key::89;mathematics;matematica;matematiche;matemáticas;matemáticas;Mathematik;wiskunde;mathematica;μαθηματικά;matematik;matematika;matematika;matemaatika; +key::90;language;lingue;linguistica;linguistiche;lenguaje;idioma;língua;idioma;Sprache;taal;taalkunde;γλώσσα;dil;nyelv;jezik;keel; +key::91;neuroscience;neuroscienza;neurociencia;neurociência;Neurowissenschaft;neurowetenschappen;νευροεπιστήμη;nörobilim;idegtudomány;nevroznanost;neuroteadused; +key::92;automation;automazione;automatización;automação;Automatisierung;automatisering;αυτοματοποίηση;otomasyon;automatizálás;avtomatizacija;automatiseeritud; +key::93;pediatric;pediatria;pediatriche;pediatrico;pediátrico;pediatría;pediátrico;pediatria;pädiatrisch;pediatrische;παιδιατρική;pediatrik;gyermekgyógyászat;pediatrija;pediaatria; +key::94;photonics;fotonica;fotoniche;fotónica;fotônica;Photonik;fotonica;φωτονική;fotonik;fotonika;fotonika;fotoonika; +key::95;mechanics;meccanica;meccaniche;mecánica;mecânica;Mechanik;Maschinenbau;mechanica;werktuigkunde;μηχανικής;mekanik;gépészet;mehanika;mehaanika; +key::96;psychiatrics;psichiatria;psichiatrica;psichiatriche;psiquiatría;psiquiatria;Psychiatrie;psychiatrie;ψυχιατρική;psikiyatrik;pszihiátria;psihiatrija;psühhaatria; +key::97;psychology;psicologia;psicología;psicologia;Psychologie;psychologie;ψυχολογία;psikoloji;pszihológia;psihologija;psühholoogia;
+key::98;automotive;industriaautomobilistica;industriadelautomóvil;automotriz;industriaautomotriz;automotivo;Automobilindustrie;autoindustrie;αυτοκίνητος;αυτοκίνητη;αυτοκίνητο;αυτοκινούμενος;αυτοκινούμενη;αυτοκινούμενο;αυτοκινητιστικός;αυτοκινητιστική;αυτοκινητιστικό;otomotiv;autóipari;samogiben;avtomobilskaindustrija;auto-; +key::99;neurology;neurologia;neurologiche;neurología;neurologia;Neurologie;neurologie;zenuwleer;νευρολογία;nöroloji;neurológia;ideggyógyászat;nevrologija;neuroloogia; +key::100;geology;geologia;geologiche;geología;geologia;Geologie;geologie;aardkunde;γεωλογία;jeoloji;geológia;földtudomány;geologija;geoloogia; +key::101;microbiology;microbiologia;micro-biologia;microbiologiche;microbiología;microbiologia;Mikrobiologie;microbiologie;μικροβιολογία;mikrobiyoloji;mikrobiológia;mikrobiologija;mikrobioloogia; +key::102;informatics;informatica;informática;informática;informatica; +key::103;forschungsgemeinschaft;comunita ricerca;research community;research foundation;research association diff --git a/dnet-pace-core/src/test/java/DedupTestIT.java b/dnet-pace-core/src/test/java/DedupTestIT.java new file mode 100644 index 0000000..f9f1ed3 --- /dev/null +++ b/dnet-pace-core/src/test/java/DedupTestIT.java @@ -0,0 +1,4 @@ +public class DedupTestIT { + + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 883dde5..c92c6fe 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -10,6 +10,7 @@ import java.util.HashMap; import java.util.Map; import static junit.framework.Assert.assertEquals; +import static junit.framework.Assert.assertTrue; public class DistanceAlgoTest extends AbstractPaceFunctions { @@ -61,5 +62,45 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { assertEquals(result, 1.0); } + @Test +
public void testJaroWinklerNormalizedName3() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); + + System.out.println("result = " + result); + assertEquals(result, 0.0); + } + + @Test + public void testJaroWinklerNormalizedName4() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); + + System.out.println("result = " + result); + assertEquals(result, 1.0); + } + + @Test + public void testJaroWinklerNormalizedName5() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); + + System.out.println("result = " + result); + assertEquals(result, 1.0); + } + + @Test + public void testJaroWinklerNormalizedName6() { + + final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); + double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); + + System.out.println("result = " + result); + assertTrue(result> 0.9); + + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java deleted file mode 100644 index 240a5d6..0000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/tree/ComparatorTest.java +++ /dev/null @@ -1,144 +0,0 @@ -package eu.dnetlib.pace.tree; - -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; 
-import eu.dnetlib.pace.model.FieldValueImpl; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - -import static junit.framework.Assert.assertEquals; -import static junit.framework.Assert.assertTrue; - -//test class for comparators (to be used into the tree nodes) -public class ComparatorTest extends AbstractPaceTest { - - private Map params; - - @Before - public void setup() { - params = new HashMap<>(); - //to put all the needed parameters - params.put("minCoauthors", 5); - params.put("maxCoauthors", 200); - - } - - @Test - public void testCoauthorsMatch() { - - final CoauthorsMatch coauthorsMatch = new CoauthorsMatch(params); - - Field a = createFieldList(Arrays.asList("la bruzzo, sandro", "atzori, claudio", "artini, michele", "de bonis, michele", "bardi, alessia", "dell'amico, andrea", "baglioni, miriam"), "coauthors"); - Field b = createFieldList(Arrays.asList("la bruzzo, sandro"), "coauthors"); - - double result1 = coauthorsMatch.compare(a, b); - double result2 = coauthorsMatch.compare(a, a); - - System.out.println("a = " + a); - System.out.println("b = " + b); - - System.out.println("a vs b = " + result1); - System.out.println("a vs a = " + result2); - - assertEquals(result1, -1.0); - assertEquals(result2, 7.0); - } - - @Test - public void testExactMatch() { - - final ExactMatch exactMatch = new ExactMatch(params); - - Field a = new FieldValueImpl(Type.String, "doi", "10.1000/0000000000"); - Field b = new FieldValueImpl(Type.String, "doi", "10.1033/0000000000"); - Field c = new FieldValueImpl(Type.String, "doi", ""); - - double result1 = exactMatch.compare(a,a); - double result2 = exactMatch.compare(a,b); - double result3 = exactMatch.compare(a,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs a = " + result1); - System.out.println("a vs b = " + result2); - System.out.println("a vs c = " + 
result3); - - assertEquals(result1, 1.0); - assertEquals(result2, 0.0); - assertEquals(result3, -1.0); - - } - - @Test - public void testSimilarMatch() { - - final SimilarMatch similarMatch = new SimilarMatch(params); - - Field a = new FieldValueImpl(Type.String, "firstname", "sandro"); - Field b = new FieldValueImpl(Type.String, "firstname", "s."); - Field c = new FieldValueImpl(Type.String, "firstname", "stefano"); - - double result1 = similarMatch.compare(a,b); - double result2 = similarMatch.compare(a,c); - double result3 = similarMatch.compare(b,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs b = " + result1); - System.out.println("a vs c = " + result2); - System.out.println("b vs c = " + result3); - - assertEquals(result1, 1.0); - assertEquals(result3, 1.0); - assertTrue(result2<0.7); - - } - - @Test - public void testTopicsMatch() { - - final TopicsMatch topicsMatch = new TopicsMatch(params); - - Field a = createFieldList(Arrays.asList("0.0", "1.0", "0.0"), "topics"); - Field b = createFieldList(Arrays.asList("0.0", "0.0", "1.0"), "topics"); - Field c = createFieldList(Arrays.asList("0.5", "0.5", "0.0"), "topics"); - - double result1 = topicsMatch.compare(a,a); - double result2 = topicsMatch.compare(a,c); - double result3 = topicsMatch.compare(b,c); - - System.out.println("a = " + a); - System.out.println("b = " + b); - System.out.println("c = " + c); - - System.out.println("a vs a = " + result1); - System.out.println("a vs c = " + result2); - System.out.println("b vs c = " + result3); - - assertEquals(result1, 1.0); - assertEquals(result2, 0.5); - assertEquals(result3, 0.0); - - } - - @Test - public void testUndefinedNode() { - - final UndefinedNode undefinedNode = new UndefinedNode(); - double result = undefinedNode.compare(new FieldListImpl(),new FieldListImpl()); - - assertEquals(result, 0.0); - } - - -}