minor changes and update of the configuration for publications

This commit is contained in:
miconis 2020-01-14 15:01:03 +02:00
parent dd21db7036
commit 9bdcb02179
5 changed files with 54 additions and 76 deletions

View File

@ -52,7 +52,7 @@ public class SparkCreateConnectedComponent {
final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class));
final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, 20).toJavaRDD();
final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
c.getDocIds()
.stream()

View File

@ -1,7 +1,6 @@
package eu.dnetlib.dedup;
import eu.dnetlib.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor;
import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
@ -12,8 +11,6 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
@ -44,30 +41,23 @@ public class SparkCreateSimRels {
final String inputPath = parser.get("sourcePath");
final String entity = parser.get("entity");
final String targetPath = parser.get("targetPath");
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final long total = sc.textFile(inputPath + "/" + entity).count();
JavaPairRDD<Object, MapDocument> vertexes = sc.textFile(inputPath + "/" + entity)
.map(s->{
JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity)
.mapToPair(s->{
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s);
return new Tuple2<>(d.getIdentifier(), d);})
.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()));
JavaPairRDD<String, MapDocument> mapDocument = vertexes.mapToPair((PairFunction<Tuple2<Object, MapDocument>, String, MapDocument>) item -> new Tuple2<String, MapDocument>(item._2().getIdentifier(), item._2()));
return new Tuple2<>(d.getIdentifier(), d);});
//create blocks for deduplication
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc,mapDocument, dedupConf);
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
// JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf);
//create relations by comparing only elements in the same group
final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
// final JavaPairRDD<String,String> dedupRels = Deduper.computeRelations(sc, blocks, dedupConf);
final JavaRDD<Relation> isSimilarToRDD = dedupRels.map(simRel -> {
final Relation r = new Relation();
@ -79,17 +69,5 @@ public class SparkCreateSimRels {
spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
}
}
}

View File

@ -18,11 +18,11 @@ import java.util.List;
public class SparkCreateDedupTest {
String configuration;
String entity = "publication";
String entity = "organization";
@Before
public void setUp() throws IOException {
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json"));
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
}
@ -31,7 +31,7 @@ public class SparkCreateDedupTest {
public void createSimRelsTest() throws Exception {
SparkCreateSimRels.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration),
"-t", "/tmp/dedup",
@ -44,7 +44,7 @@ public class SparkCreateDedupTest {
SparkCreateConnectedComponent.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration),
"-t", "/tmp/dedup",
@ -56,7 +56,7 @@ public class SparkCreateDedupTest {
public void dedupRecordTest() throws Exception {
SparkCreateDedupRecord.main(new String[] {
"-mt", "local[*]",
"-s", "/home/sandro/betadump",
"-s", "/Users/miconis/dumps",
"-e", entity,
"-c", ArgumentApplicationParser.compressArgument(configuration),
"-d", "/tmp/dedup",
@ -64,26 +64,10 @@ public class SparkCreateDedupTest {
}
@Test
public void printCC() throws Exception {
public void printConfiguration() throws Exception {
System.out.println(ArgumentApplicationParser.compressArgument(configuration));
}
// [20|grid________::6031f94bef015a37783268ec1e75f17f, 20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46]
// [20|grid________::672e1e5cef49e68f124d3da5225a7357, 20|grid________::7a402604c3853c7a0af14f88f56bf7e1]
// [20|grid________::2fc05b35e11d915b220a66356053eae2, 20|grid________::b02fb3176eb38f6c572722550c07e7ab]
// [20|grid________::bc86248ab2b8d7955dcaf592ba342262, 20|corda_______::45a8ec964029278fb938805182e247a8]
// [20|doajarticles::74551f800ad1c81a6cd31c5162887b7f, 20|rcuk________::86dc9a83df05a58917f38ca09f814617]
// [20|nsf_________::5e837d8e6444cc298db314ea54ad2f4a, 20|snsf________::7b54715f0ec5c6a0a44672f45d98be8d]
// [20|corda__h2020::7ee7e57bad06b92c1a568dd61e10ba8c, 20|snsf________::2d4a2695221a3ce0c749ee34e064c0b3]
// [20|corda_______::25220a523550176dac9e5432dac43596, 20|grid________::9782f16a46650cbbfaaa2315109507d1]
// [20|nih_________::88c3b664dcc7af9e827f94ac964cd66c, 20|grid________::238d3ac0a7d119d5c8342a647f5245f5]
// [20|rcuk________::0582c20fcfb270f9ec1b19b0f0dcd881, 20|nsf_________::9afa48ddf0bc2cd4f3c41dc41daabcdb]
// [20|rcuk________::fbc445f8d24e569bc8b640dba86ae978, 20|corda_______::5a8a4094f1b68a88fc56e65cea7ebfa0]
// [20|rcuk________::7485257cd5caaf6316ba8062feea801d, 20|grid________::dded811e5f5a4c9f7ca8f9955e52ade7]
// [20|nih_________::0576dd270d29d5b7c23dd15a827ccdb9, 20|corda_______::10ca69f6a4a121f75fdde1feee226ce0]
// [20|corda__h2020::0429f6addf10e9b2939d65c6fb097ffd, 20|grid________::6563ec73057624d5ccc0cd050b302181]
@Test
public void testHashCode() {
final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";

View File

@ -7,9 +7,9 @@
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"idPath": "$.id",
"maxIterations": "20"
},
"pace" : {
@ -31,7 +31,7 @@
}
],
"threshold": 1,
"aggregation": "SC",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "layer2",
@ -52,10 +52,24 @@
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"comparator": "numbersMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"comparator": "romansMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1,
"aggregation": "NC",
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
@ -69,12 +83,11 @@
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4",
"threshold": "0.0"
"windowSize": "4"
}
}
],
"threshold": 1.0,
"threshold": 0.7,
"aggregation": "W_MEAN",
"positive": "layer4",
"negative": "NO_MATCH",
@ -87,19 +100,18 @@
"field": "legalname",
"comparator": "keywordMatch",
"weight": 1.0,
"countIfUndefined": "false",
"countIfUndefined": "true",
"params": {
"windowSize": "4",
"threshold": "0.7"
"windowSize": "4"
}
}
],
"threshold": 1.0,
"aggregation": "W_MEAN",
"threshold": 0.9,
"aggregation": "AVG",
"positive": "layer5",
"negative": "NO_MATCH",
"undefined": "layer5",
"ignoreUndefined": "false"
"ignoreUndefined": "true"
},
"layer5": {
"fields": [
@ -133,19 +145,20 @@
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"}
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
@ -233,7 +246,7 @@
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
@ -243,7 +256,11 @@
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "statale", "etat", "zustand", "estado"]
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f"],
"key::108" : ["agricultural mechanical", "am", "a m"],
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
}
}
}

View File

@ -66,14 +66,13 @@
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"threshold": "0.5",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
@ -97,7 +96,7 @@
}
],
"threshold": 1.0,
"aggregation": "NC",
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
@ -114,7 +113,7 @@
}
],
"threshold": 0.99,
"aggregation": "SUM",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",