From 9bdcb02179496aa2f3c989adfe330160deba16f3 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 14 Jan 2020 15:01:03 +0200 Subject: [PATCH] minor changes and update of the configuration for publications --- .../dedup/SparkCreateConnectedComponent.java | 2 +- .../eu/dnetlib/dedup/SparkCreateSimRels.java | 40 ++++----------- .../dnetlib/dedup/SparkCreateDedupTest.java | 28 +++------- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 51 ++++++++++++------- .../dnetlib/dedup/conf/pub_dt.curr.conf.json | 9 ++-- 5 files changed, 54 insertions(+), 76 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 16e112c252..e0549dfbf7 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -52,7 +52,7 @@ public class SparkCreateConnectedComponent { final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); + final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, 20).toJavaRDD(); final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> c.getDocIds() .stream() diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index 48d442d046..831e45daf4 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,7 +1,6 @@ package eu.dnetlib.dedup; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; +import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.pace.config.DedupConfig; @@ -12,8 +11,6 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import scala.Tuple2; @@ -44,30 +41,23 @@ public class SparkCreateSimRels { final String inputPath = parser.get("sourcePath"); final String entity = parser.get("entity"); final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); +// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final long total = sc.textFile(inputPath + "/" + entity).count(); - JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) - .map(s->{ + JavaPairRDD mapDocument = sc.textFile(inputPath + "/" + entity) + .mapToPair(s->{ MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s); - return new Tuple2<>(d.getIdentifier(), d);}) - .mapToPair((PairFunction, Object, MapDocument>) t -> new Tuple2((long) t._1().hashCode(), t._2())); - - - - - JavaPairRDD mapDocument = vertexes.mapToPair((PairFunction, String, MapDocument>) item -> new Tuple2(item._2().getIdentifier(), item._2())); + return new Tuple2<>(d.getIdentifier(), d);}); //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc,mapDocument, dedupConf); - + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); +// JavaPairRDD> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf); //create relations by comparing only elements in the same group final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - +// final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, dedupConf); final JavaRDD isSimilarToRDD = dedupRels.map(simRel -> { final Relation r = new Relation(); @@ -79,17 +69,5 @@ public class SparkCreateSimRels { spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); - - - - - - - - - } - - - -} +} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index f294b10fe8..f93703e377 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -18,11 +18,11 @@ import java.util.List; public class SparkCreateDedupTest { String configuration; - String entity = "publication"; + String entity = "organization"; @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json")); + configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); } @@ -31,7 +31,7 @@ public class SparkCreateDedupTest { public void createSimRelsTest() throws Exception { SparkCreateSimRels.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", entity, "-c", ArgumentApplicationParser.compressArgument(configuration), "-t", "/tmp/dedup", @@ -44,7 +44,7 @@ public class SparkCreateDedupTest { SparkCreateConnectedComponent.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", entity, "-c", ArgumentApplicationParser.compressArgument(configuration), "-t", "/tmp/dedup", @@ -56,7 +56,7 @@ public class SparkCreateDedupTest { public void dedupRecordTest() throws Exception { SparkCreateDedupRecord.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", entity, "-c", ArgumentApplicationParser.compressArgument(configuration), "-d", "/tmp/dedup", @@ -64,26 +64,10 @@ public class SparkCreateDedupTest { } @Test - public void printCC() throws Exception { + public void printConfiguration() throws Exception { System.out.println(ArgumentApplicationParser.compressArgument(configuration)); } - -// [20|grid________::6031f94bef015a37783268ec1e75f17f, 20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46] -// [20|grid________::672e1e5cef49e68f124d3da5225a7357, 20|grid________::7a402604c3853c7a0af14f88f56bf7e1] -// [20|grid________::2fc05b35e11d915b220a66356053eae2, 20|grid________::b02fb3176eb38f6c572722550c07e7ab] -// [20|grid________::bc86248ab2b8d7955dcaf592ba342262, 20|corda_______::45a8ec964029278fb938805182e247a8] -// [20|doajarticles::74551f800ad1c81a6cd31c5162887b7f, 20|rcuk________::86dc9a83df05a58917f38ca09f814617] -// [20|nsf_________::5e837d8e6444cc298db314ea54ad2f4a, 20|snsf________::7b54715f0ec5c6a0a44672f45d98be8d] -// [20|corda__h2020::7ee7e57bad06b92c1a568dd61e10ba8c, 20|snsf________::2d4a2695221a3ce0c749ee34e064c0b3] -// [20|corda_______::25220a523550176dac9e5432dac43596, 20|grid________::9782f16a46650cbbfaaa2315109507d1] -// [20|nih_________::88c3b664dcc7af9e827f94ac964cd66c, 20|grid________::238d3ac0a7d119d5c8342a647f5245f5] -// [20|rcuk________::0582c20fcfb270f9ec1b19b0f0dcd881, 20|nsf_________::9afa48ddf0bc2cd4f3c41dc41daabcdb] -// [20|rcuk________::fbc445f8d24e569bc8b640dba86ae978, 20|corda_______::5a8a4094f1b68a88fc56e65cea7ebfa0] -// [20|rcuk________::7485257cd5caaf6316ba8062feea801d, 20|grid________::dded811e5f5a4c9f7ca8f9955e52ade7] -// [20|nih_________::0576dd270d29d5b7c23dd15a827ccdb9, 20|corda_______::10ca69f6a4a121f75fdde1feee226ce0] -// [20|corda__h2020::0429f6addf10e9b2939d65c6fb097ffd, 20|grid________::6563ec73057624d5ccc0cd050b302181] - @Test public void testHashCode() { final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f"; diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 43003c2e09..2d09055626 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -7,9 +7,9 @@ "queueMaxSize" : "2000", "groupMaxSize" : "50", "slidingWindowSize" : "200", + "idPath":"$.id", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true", - "idPath": "$.id", "maxIterations": "20" }, "pace" : { @@ -31,7 +31,7 @@ } ], "threshold": 1, - "aggregation": "SC", + "aggregation": "AVG", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "layer2", @@ -52,10 +52,24 @@ "weight": 1, "countIfUndefined": "true", "params": {} + }, + { + "field": "legalname", + "comparator": "numbersMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "legalname", + "comparator": "romansMatch", + "weight": 1, + "countIfUndefined": "true", + "params": {} } ], "threshold": 1, - "aggregation": "NC", + "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", @@ -69,12 +83,11 @@ "weight": 1.0, "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.0" + "windowSize": "4" } } ], - "threshold": 1.0, + "threshold": 0.7, "aggregation": "W_MEAN", "positive": "layer4", "negative": "NO_MATCH", @@ -87,19 +100,18 @@ "field": "legalname", "comparator": "keywordMatch", "weight": 1.0, - "countIfUndefined": "false", + "countIfUndefined": "true", "params": { - "windowSize": "4", - "threshold": "0.7" + "windowSize": "4" } } ], - "threshold": 1.0, - "aggregation": "W_MEAN", + "threshold": 0.9, + "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", "undefined": "layer5", - "ignoreUndefined": "false" + "ignoreUndefined": "true" }, "layer5": { "fields": [ @@ -133,19 +145,20 @@ { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], - "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"], "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], "key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], @@ -233,7 +246,7 @@ "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], - "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], @@ -243,7 +256,11 @@ "key::102": ["informatics","informatica","informática","informática","informatica",""], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"], - "key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"] + "key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"], + "key::106" : ["seminary", "seminario", "seminaire", "seminar"], + "key::107" : ["agricultural forestry", "af", "a f"], + "key::108" : ["agricultural mechanical", "am", "a m"], + "key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"] } } } \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json index 18b048e9e8..6ca0ecd531 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json @@ -66,14 +66,13 @@ "weight": 1.0, "countIfUndefined": "false", "params": { - "threshold": "0.5", "jpath_value": "$.value", "jpath_classid": "$.qualifier.classid" } } ], - "threshold": 1.0, - "aggregation": "MAX", + "threshold": 0.5, + "aggregation": "AVG", "positive": "MATCH", "negative": "layer2", "undefined": "layer2", @@ -97,7 +96,7 @@ } ], "threshold": 1.0, - "aggregation": "NC", + "aggregation": "AND", "positive": "layer3", "negative": "NO_MATCH", "undefined": "layer3", @@ -114,7 +113,7 @@ } ], "threshold": 0.99, - "aggregation": "SUM", + "aggregation": "AVG", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "NO_MATCH",