From f32eae5ce9e130c0d1ddd893696d20a773b4f33f Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 18 Mar 2020 14:27:49 +0100 Subject: [PATCH 01/17] implementation of the spark action for the simrel creation --- dhp-workflows/dhp-dedup/pom.xml | 6 +- .../eu/dnetlib/dedup/SparkCreateSimRels2.java | 142 +++++++ .../dnetlib/dhp/dedup/dedup_parameters.json | 27 +- .../dhp/dedup/oozie_app/DuplicateScanWf.xml | 88 ++++ .../dnetlib/dedup/SparkCreateDedupTest.java | 26 +- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 14 +- .../eu/dnetlib/dedup/conf/pub.curr.conf.json | 168 ++++++-- .../dnetlib/dedup/conf/pub_dt.curr.conf.json | 386 ------------------ pom.xml | 16 + 9 files changed, 426 insertions(+), 447 deletions(-) create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml delete mode 100644 dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 0721af25dd..cc27952fa4 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -82,8 +82,10 @@ com.fasterxml.jackson.core jackson-core - - + + eu.dnetlib + dnet-actionmanager-common + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java new file mode 100644 index 0000000000..3fa7be3f78 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java @@ -0,0 +1,142 @@ +package eu.dnetlib.dedup; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; +import eu.dnetlib.actionmanager.actions.AtomicAction; +import eu.dnetlib.actionmanager.common.Agent; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +public class SparkCreateSimRels2 implements Serializable { + + final static String CONF_SEPARATOR = "@@@"; + + private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class); + + public static List decompressConfs(String compressedConfs){ + + return Arrays.stream(compressedConfs.split(CONF_SEPARATOR)) + .map(ArgumentApplicationParser::decompressValue) + .map(DedupConfig::load) + .collect(Collectors.toList()); + } + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); + + parser.parseArgument(args); + + new 
SparkCreateSimRels2().run(parser, decompressConfs(parser.get("dedupConf"))); + } + + private void run(ArgumentApplicationParser parser, List dedupConfs) { + + //read oozie parameters + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String rawSetName = parser.get("rawSet"); + final String agentId = parser.get("agentId"); + final String agentName = parser.get("agentName"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //create empty sequenceFile for the accumulation + JavaRDD> simRel = sc.emptyRDD(); + + //for each dedup configuration + for (DedupConfig dedupConf: dedupConfs) { + final String entity = dedupConf.getWf().getEntityType(); + + JavaPairRDD mapDocument = sc.textFile(sourcePath + "/" + entity) + .mapToPair(s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + //create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + + //create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2())); + + //create atomic actions + JavaRDD> newSimRels = relationsRDD + .mapToPair(rel -> + new Tuple2<>( + createActionId(rel.getSource(), rel.getTarget(), entity), //TODO update the type, maybe take it from the configuration? + new AtomicAction(rawSetName, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), new ObjectMapper().writeValueAsString(rel).getBytes()))) + .map(aa -> new Tuple2<>(aa._1(), transformAction(aa._2()))); + + simRel = simRel.union(newSimRels); + + } + + String targetDirectory = targetPath + "/" + rawSetName; + +// simRel.map(s -> s._1().toString()).saveAsTextFile(targetDirectory); + + simRel.mapToPair(r -> r) + .saveAsHadoopFile(targetDirectory, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + + } + + } + + public Text createActionId(String source, String target, String type) { + String id = source + "@" + type + "@" + target; + + return new Text(id); + } + + public Text transformAction(AtomicAction aa) throws JsonProcessingException { + + ObjectMapper mapper = new ObjectMapper(); + + return new Text(mapper.writeValueAsString(aa)); + } + + public Relation createSimRel(String source, String target){ + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass("isSimilarTo"); + return r; + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels2.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) +// .enableHiveSupport() + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json index 8ba8515d0e..9bdddef8a8 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json @@ -8,26 +8,43 @@ { "paramName": "s", "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", + 
"paramDescription": "the base path of the raw graph", "paramRequired": true }, { "paramName": "e", "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", + "paramDescription": "the type of entity to be deduped (directory in the sourcePath)", "paramRequired": true }, { "paramName": "c", "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, + "paramDescription": "list of dedup configuration to be used", "paramRequired": true }, { "paramName": "t", "paramLongName": "targetPath", - "paramDescription": "target path to save dedup result", + "paramDescription": "target base path to save dedup result (actions)", + "paramRequired": true + }, + { + "paramName": "rs", + "paramLongName": "rawSet", + "paramDescription": "the raw set to be saved (directory in the targetPath)", + "paramRequired": true + }, + { + "paramName": "ai", + "paramLongName": "agentId", + "paramDescription": "the agent identifier", + "paramRequired": true + }, + { + "paramName": "an", + "paramLongName": "agentName", + "paramDescription": "the agent name", "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml new file mode 100644 index 0000000000..1dede2c70d --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml @@ -0,0 +1,88 @@ + + + + sourcePath + the raw graph base path + + + entity + the entity that should be processed + + + dedupConf + the (list of) dedup Configuration(s) + + + targetPath + the output base path + + + rawSet + the output directory in the targetPath + + + agentId + the agent identifier + + + agentName + the agent name + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Similarity Relations + eu.dnetlib.dedup.SparkCreateSimRels2 + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --sourcePath${sourcePath} + --targetPath${targetPath} + --entity${entity} + --dedupConf${dedupConf} + --rawSet${rawSet} + --agentId${agentId} + --agentName${agentName} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index f93703e377..12bba7c1e9 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -13,17 +13,20 @@ import org.junit.Test; import java.io.File; import java.io.IOException; +import java.util.HashSet; import java.util.List; +import java.util.Set; public class SparkCreateDedupTest { String configuration; - String entity = "organization"; + String 
configuration2; + String entity = "publication"; @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); - + configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org1.curr.conf.json")); + configuration2 = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org2.curr.conf.json")); } @Test @@ -38,6 +41,21 @@ public class SparkCreateDedupTest { }); } + @Test + @Ignore + public void createSimRelsTest2() throws Exception { + SparkCreateSimRels2.main(new String[] { + "-mt", "local[*]", + "-s", "/Users/miconis/dumps", + "-e", entity, + "-c", ArgumentApplicationParser.compressArgument(configuration) + "@@@" + ArgumentApplicationParser.compressArgument(configuration2), + "-t", "/tmp/dedup", + "-rs", "rawset_test", + "-ai", "agentId", + "-an", "agentName" + }); + } + @Test @Ignore public void createCCTest() throws Exception { @@ -79,8 +97,6 @@ public class SparkCreateDedupTest { System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); System.out.println( s2.hashCode()); System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); - } - } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 2d09055626..31b200c725 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -87,8 +87,8 @@ } } ], - "threshold": 0.7, - "aggregation": "W_MEAN", + "threshold": 0.1, + "aggregation": "AVG", "positive": "layer4", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -106,7 +106,7 @@ } } ], - "threshold": 0.9, + "threshold": 0.7, "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", @@ -129,7 +129,9 @@ "comparator": "jaroWinklerNormalizedName", "weight": 0.1, "countIfUndefined": "false", - "params": {} + "params": { + "windowSize": 4 + } } ], "threshold": 0.9, @@ -145,14 +147,14 @@ { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": 
["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json index 3e861fb715..d471ccb89d 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json @@ -1,42 +1,134 @@ { - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "subEntityType" : "resulttype", - "subEntityValue" : "publication", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "100", - "maxChildren" : "100", - "idPath": "$.id", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], - "includeChildren" : "true" + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "2000", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "200", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" }, - "pace" : { + "pace": { "clustering" : [ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], - "strictConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + } + ], + "threshold": 0.5, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "AND", + 
"positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.pid[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.pid", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } ], - "conditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } - ], - "model" : [ - { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" }, - { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 }, - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 }, - { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" } - ], - "synonyms": {}, - "blacklists" : { - "title" : [ + "blacklists": { + "title": [ "^Inside Front Cover$", "(?i)^Poster presentations$", "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", @@ -48,7 +140,6 @@ "^Cartas? ao editor Letters? to the Editor$", "^Note from the Editor$", "^Anesthesia Abstract$", - "^Annual report$", "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", "(?i)^Graph and Table of Infectious Diseases?$", @@ -68,14 +159,12 @@ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", "^Gushi hakubutsugaku$", - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", "^Intestinal spirocha?etosis$", "^Treatment of Rodent Ulcer$", "(?i)^\\W*Cloud Computing\\W*$", "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", "^Free Communications, Poster Presentations: Session [A-F]$", - "^“The Historical Aspects? 
of Quackery\\.?”$", "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", @@ -96,10 +185,8 @@ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", "^Aus der AGMB$", - "^Znanstveno-stručni prilozi$", "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", @@ -136,7 +223,6 @@ "(?i)^RUBRIKA UREDNIKA$", "^A Matching Model of the Academic Publication Market$", "^Yōgaku kōyō$", - "^Internetový marketing$", "^Internet marketing$", "^Chūtō kokugo dokuhon$", @@ -169,21 +255,17 @@ "^Information System Assessment and Proposal for ICT Modification$", "^Stresové zatížení pracovníků ve vybrané profesi$", "^Stress load in a specific job$", - "^Sunday: Poster Sessions, Pt.*$", "^Monday: Poster Sessions, Pt.*$", "^Wednesday: Poster Sessions, Pt.*", "^Tuesday: Poster Sessions, Pt.*$", - "^Analýza reklamy$", "^Analysis of advertising$", - "^Shōgaku shūshinsho$", "^Shōgaku sansū$", "^Shintei joshi kokubun$", "^Taishō joshi kokubun dokuhon$", "^Joshi kokubun$", - "^Účetní uzávěrka a účetní závěrka v ČR$", "(?i)^The \"?Causes\"? of Cancer$", "^Normas para la publicación de artículos$", @@ -202,7 +284,6 @@ "^Abdominal [Aa]ortic [Aa]neurysms.*$", "^Pseudomyxoma peritonei$", "^Kazalo autora$", - "(?i)^uvodna riječ$", "^Motivace jako způsob vedení lidí$", "^Motivation as a leadership$", @@ -275,6 +356,7 @@ "(?i)^.*authors['’′]? reply\\.?$", "(?i)^.*authors['’′]? 
response\\.?$" ] - } + }, + "synonyms": {} } } \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json deleted file mode 100644 index 6ca0ecd531..0000000000 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json +++ /dev/null @@ -1,386 +0,0 @@ -{ - "wf": { - "threshold": "0.99", - "dedupRun": "001", - "entityType": "result", - "subEntityType": "resulttype", - "subEntityValue": "publication", - "orderField": "title", - "queueMaxSize": "2000", - "groupMaxSize": "100", - "maxChildren": "100", - "slidingWindowSize": "200", - "rootBuilder": [ - "result", - "resultProject_outcome_isProducedBy", - "resultResult_publicationDataset_isRelatedTo", - "resultResult_similarity_isAmongTopNSimilarDocuments", - "resultResult_similarity_hasAmongTopNSimilarDocuments", - "resultOrganization_affiliation_isAffiliatedWith", - "resultResult_part_hasPart", - "resultResult_part_isPartOf", - "resultResult_supplement_isSupplementTo", - "resultResult_supplement_isSupplementedBy", - "resultResult_version_isVersionOf" - ], - "includeChildren": "true", - "maxIterations": 20, - "idPath": "$.id" - }, - "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - }, - { - "name": "lowercase", - "fields": [ - "doi" - ], - "params": {} - } - ], - "decisionTree": { - "start": { - "fields": [ - { - "field": "pid", - "comparator": "jsonListMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": { - "jpath_value": "$.value", - "jpath_classid": "$.qualifier.classid" - } - } - ], - "threshold": 0.5, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "layer2", - "undefined": "layer2", - "ignoreUndefined": "true" - }, - "layer2": { - "fields": [ - { - "field": "title", - "comparator": "titleVersionMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - }, - { - "field": "authors", - "comparator": "sizeMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - } - ], - "threshold": 1.0, - "aggregation": "AND", - "positive": "layer3", - "negative": "NO_MATCH", - "undefined": "layer3", - "ignoreUndefined": "false" - }, - "layer3": { - "fields": [ - { - "field": "title", - "comparator": "levensteinTitle", - "weight": 1.0, - "countIfUndefined": "true", - "params": {} - } - ], - "threshold": 0.99, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "NO_MATCH", - "undefined": "NO_MATCH", - "ignoreUndefined": "true" - } - }, - "model": [ - { - "name": "doi", - "type": "String", - "path": "$.pid[?(@.qualifier.classid == 'doi')].value" - }, - { - "name": "pid", - "type": "JSON", - "path": "$.pid", - "overrideMatch": "true" - }, - { - "name": "title", - "type": "String", - "path": "$.title[?(@.qualifier.classid == 'main title')].value", - "length": 250, - "size": 5 - }, - { - "name": "authors", - "type": "List", - "path": "$.author[*].fullname", - "size": 200 - }, - { - "name": "resulttype", - "type": "String", - "path": "$.resulttype.classid" - } - ], - "blacklists": { - "title": [ - "^Inside Front Cover$", - "(?i)^Poster presentations$", - "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", - "^Problems with perinatal pathology\\.?$", - "(?i)^Cases? 
of Puerperal Convulsions$", - "(?i)^Operative Gyna?ecology$", - "(?i)^Mind the gap\\!?\\:?$", - "^Chronic fatigue syndrome\\.?$", - "^Cartas? ao editor Letters? to the Editor$", - "^Note from the Editor$", - "^Anesthesia Abstract$", - "^Annual report$", - "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", - "(?i)^Graph and Table of Infectious Diseases?$", - "^Presentation$", - "(?i)^Reviews and Information on Publications$", - "(?i)^PUBLIC HEALTH SERVICES?$", - "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", - "(?i)^Adrese autora$", - "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", - "(?i)^Acknowledgement to Referees$", - "(?i)^Behçet's disease\\.?$", - "(?i)^Isolation and identification of restriction endonuclease.*$", - "(?i)^CEREBROVASCULAR DISEASES?.?$", - "(?i)^Screening for abdominal aortic aneurysms?\\.?$", - "^Event management$", - "(?i)^Breakfast and Crohn's disease.*\\.?$", - "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", - "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", - "^Gushi hakubutsugaku$", - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", - "^Intestinal spirocha?etosis$", - "^Treatment of Rodent Ulcer$", - "(?i)^\\W*Cloud Computing\\W*$", - "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", - "^Free Communications, Poster Presentations: Session [A-F]$", - "^“The Historical Aspects? of Quackery\\.?”$", - "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", - "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", - "(?i)^Case Report$", - "^Boletín Informativo$", - "(?i)^Glioblastoma Multiforme$", - "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", - "^Zaměstnanecké výhody$", - "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", - "(?i)^Carotid body tumours?\\.?$", - "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", - "^Avant-propos$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", - "(?i)^PUBLIC HEALTH VERSUS THE STATE$", - "^Viñetas de Cortázar$", - "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? 
bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", - "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", - "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", - "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", - "^Aus der AGMB$", - "^Znanstveno-stručni prilozi$", - "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", - "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", - "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", - "^Finanční analýza podniku$", - "^Financial analysis( of business)?$", - "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", - "^Jikken nihon shūshinsho$", - "(?i)^CORONER('|s)(s|') INQUESTS$", - "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", - "(?i)^Consultants' contract(s)?$", - "(?i)^Upute autorima$", - "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", - "^Joshi shin kokubun$", - "^Kōtō shōgaku dokuhon nōson'yō$", - "^Jinjō shōgaku shōka$", - "^Shōgaku shūjichō$", - "^Nihon joshi dokuhon$", - "^Joshi shin dokuhon$", - "^Chūtō kanbun dokuhon$", - "^Wabun dokuhon$", - "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", - "(?i)^cardiac rehabilitation$", - "(?i)^Analytical summary$", - "^Thesaurus resolutionum Sacrae Congregationis Concilii$", - "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", - "^Prikazi i osvrti$", - "^Rodinný dům s provozovnou$", - "^Family house with an establishment$", - "^Shinsei chūtō shin kokugun$", - "^Pulmonary alveolar proteinosis(\\.?)$", - "^Shinshū kanbun$", - "^Viñeta(s?) de Rodríguez$", - "(?i)^RUBRIKA UREDNIKA$", - "^A Matching Model of the Academic Publication Market$", - "^Yōgaku kōyō$", - "^Internetový marketing$", - "^Internet marketing$", - "^Chūtō kokugo dokuhon$", - "^Kokugo dokuhon$", - "^Antibiotic Cover for Dental Extraction(s?)$", - "^Strategie podniku$", - "^Strategy of an Enterprise$", - "(?i)^respiratory disease(s?)(\\.?)$", - "^Award(s?) for Gallantry in Civil Defence$", - "^Podniková kultura$", - "^Corporate Culture$", - "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", - "^Pracovní motivace$", - "^Work Motivation$", - "^Kaitei kōtō jogaku dokuhon$", - "^Konsolidovaná účetní závěrka$", - "^Consolidated Financial Statements$", - "(?i)^intracranial tumour(s?)$", - "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", - "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", - "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", - "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", - "^Úroveň motivačního procesu jako způsobu vedení lidí$", - "^The level of motivation process as a leadership$", - "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) 
Hospitals$", - "(?i)^news and events$", - "(?i)^NOVOSTI I DOGAĐAJI$", - "^Sansū no gakushū$", - "^Posouzení informačního systému firmy a návrh změn$", - "^Information System Assessment and Proposal for ICT Modification$", - "^Stresové zatížení pracovníků ve vybrané profesi$", - "^Stress load in a specific job$", - "^Sunday: Poster Sessions, Pt.*$", - "^Monday: Poster Sessions, Pt.*$", - "^Wednesday: Poster Sessions, Pt.*", - "^Tuesday: Poster Sessions, Pt.*$", - "^Analýza reklamy$", - "^Analysis of advertising$", - "^Shōgaku shūshinsho$", - "^Shōgaku sansū$", - "^Shintei joshi kokubun$", - "^Taishō joshi kokubun dokuhon$", - "^Joshi kokubun$", - "^Účetní uzávěrka a účetní závěrka v ČR$", - "(?i)^The \"?Causes\"? of Cancer$", - "^Normas para la publicación de artículos$", - "^Editor('|s)(s|') [Rr]eply$", - "^Editor(’|s)(s|’) letter$", - "^Redaktoriaus žodis$", - "^DISCUSSION ON THE PRECEDING PAPER$", - "^Kōtō shōgaku shūshinsho jidōyō$", - "^Shōgaku nihon rekishi$", - "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", - "^Préface$", - "^Occupational [Hh]ealth [Ss]ervices.$", - "^In Memoriam Professor Toshiyuki TAKESHIMA$", - "^Účetní závěrka ve vybraném podniku.*$", - "^Financial statements in selected company$", - "^Abdominal [Aa]ortic [Aa]neurysms.*$", - "^Pseudomyxoma peritonei$", - "^Kazalo autora$", - "(?i)^uvodna riječ$", - "^Motivace jako způsob vedení lidí$", - "^Motivation as a leadership$", - "^Polyfunkční dům$", - "^Multi\\-funkcional building$", - "^Podnikatelský plán$", - "(?i)^Podnikatelský záměr$", - "(?i)^Business Plan$", - "^Oceňování nemovitostí$", - "^Marketingová komunikace$", - "^Marketing communication$", - "^Sumario Analítico$", - "^Riječ uredništva$", - "^Savjetovanja i priredbe$", - "^Índice$", - "^(Starobosanski nadpisi).*$", - "^Vzdělávání pracovníků v organizaci$", - "^Staff training in organization$", - "^(Life Histories of North American Geometridae).*$", - "^Strategická analýza podniku$", - "^Strategic Analysis of an Enterprise$", - "^Sadržaj$", - "^Upute suradnicima$", - "^Rodinný dům$", - "(?i)^Fami(l)?ly house$", - "^Upute autorima$", - "^Strategic Analysis$", - "^Finanční analýza vybraného podniku$", - "^Finanční analýza$", - "^Riječ urednika$", - "(?i)^Content(s?)$", - "(?i)^Inhalt$", - "^Jinjō shōgaku shūshinsho jidōyō$", - "(?i)^Index$", - "^Chūgaku kokubun kyōkasho$", - "^Retrato de una mujer$", - "^Retrato de un hombre$", - "^Kōtō shōgaku dokuhon$", - "^Shotōka kokugo$", - "^Shōgaku dokuhon$", - "^Jinjō shōgaku kokugo dokuhon$", - "^Shinsei kokugo dokuhon$", - "^Teikoku dokuhon$", - "^Instructions to Authors$", - "^KİTAP TAHLİLİ$", - "^PRZEGLĄD PIŚMIENNICTWA$", - "(?i)^Presentación$", - "^İçindekiler$", - "(?i)^Tabl?e of contents$", - "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", - "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", - "^Editorial( Board)?$", - "(?i)^Editorial \\(English\\)$", - "^Editörden$", - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. 
de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*", - "(?i)^.*authors['’′]? reply\\.?$", - "(?i)^.*authors['’′]? response\\.?$" - ] - }, - "synonyms": {} - } -} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 0310a3f44e..fe158d9fcc 100644 --- a/pom.xml +++ b/pom.xml @@ -345,6 +345,22 @@ + + + eu.dnetlib + dnet-actionmanager-common + [6.0.0,7.0.0) + + + commons-httpclient + commons-httpclient + + + eu.dnetlib + dnet-openaireplus-mapping-utils + + + From 679b5869e5f9b2af2ed470d10f3862784049c937 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 18 Mar 2020 17:41:56 +0100 Subject: [PATCH 02/17] implementation of the lookup procedure to take dedup conf from the resource profiles --- .../eu/dnetlib/dedup/SparkCreateSimRels2.java | 104 ++++++++++++------ .../dnetlib/dhp/dedup/dedup_parameters.json | 38 +++---- .../dhp/dedup/oozie_app/DuplicateScanWf.xml | 33 +++--- .../dnetlib/dedup/SparkCreateDedupTest.java | 15 ++- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 1 + 5 files changed, 113 insertions(+), 78 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java index 3fa7be3f78..3892bc2b0c 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java @@ -2,8 +2,13 @@ package eu.dnetlib.dedup; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.actionmanager.actions.AtomicAction; +import eu.dnetlib.actionmanager.common.Agent; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; @@ -17,47 +22,38 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; import scala.Tuple2; -import eu.dnetlib.actionmanager.actions.AtomicAction; -import eu.dnetlib.actionmanager.common.Agent; import java.io.Serializable; -import java.util.Arrays; +import java.io.StringReader; +import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; public class SparkCreateSimRels2 implements Serializable { - final static String CONF_SEPARATOR = "@@@"; - private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class); - public static List decompressConfs(String compressedConfs){ - - return Arrays.stream(compressedConfs.split(CONF_SEPARATOR)) - 
.map(ArgumentApplicationParser::decompressValue) - .map(DedupConfig::load) - .collect(Collectors.toList()); - } - public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - new SparkCreateSimRels2().run(parser, decompressConfs(parser.get("dedupConf"))); + new SparkCreateSimRels2().run(parser); } - private void run(ArgumentApplicationParser parser, List dedupConfs) { + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { //read oozie parameters - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("targetPath"); - final String rawSetName = parser.get("rawSet"); + final String rawGraphBasePath = parser.get("rawGraphBasePath"); + final String rawSet = parser.get("rawSet"); final String agentId = parser.get("agentId"); final String agentName = parser.get("agentName"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); try (SparkSession spark = getSparkSession(parser)) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -66,10 +62,11 @@ public class SparkCreateSimRels2 implements Serializable { JavaRDD> simRel = sc.emptyRDD(); //for each dedup configuration - for (DedupConfig dedupConf: dedupConfs) { + for (DedupConfig dedupConf: getConfigurations(isLookUpUrl, actionSetId)) { final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); - JavaPairRDD mapDocument = sc.textFile(sourcePath + "/" + entity) + JavaPairRDD mapDocument = sc.textFile(rawGraphBasePath + "/" + subEntity) .mapToPair(s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); return new Tuple2<>(d.getIdentifier(), d); @@ -88,25 +85,35 @@ public class SparkCreateSimRels2 implements Serializable { .mapToPair(rel -> new Tuple2<>( createActionId(rel.getSource(), rel.getTarget(), entity), //TODO update the type, maybe take it from the configuration? 
- new AtomicAction(rawSetName, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), new ObjectMapper().writeValueAsString(rel).getBytes()))) + new AtomicAction(rawSet, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), new ObjectMapper().writeValueAsString(rel).getBytes()))) .map(aa -> new Tuple2<>(aa._1(), transformAction(aa._2()))); simRel = simRel.union(newSimRels); } - String targetDirectory = targetPath + "/" + rawSetName; - -// simRel.map(s -> s._1().toString()).saveAsTextFile(targetDirectory); - simRel.mapToPair(r -> r) - .saveAsHadoopFile(targetDirectory, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); } } - public Text createActionId(String source, String target, String type) { + public Text createActionId(String source, String target, String entity) { + + String type = ""; + + switch(entity){ + case "result": + type = "resultResult_dedupSimilarity_isSimilarTo"; + break; + case "organization": + type = "organizationOrganization_dedupSimilarity_isSimilarTo"; + break; + default: + break; + } + String id = source + "@" + type + "@" + target; return new Text(id); @@ -135,8 +142,43 @@ public class SparkCreateSimRels2 implements Serializable { .appName(SparkCreateSimRels2.class.getSimpleName()) .master(parser.get("master")) .config(conf) -// .enableHiveSupport() + .enableHiveSupport() .getOrCreate(); } + public List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + log.info("loading dedup orchestration: " + xquery); + + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } + + return configurations; + + } + + private DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = + isLookUpService.getResourceProfileByQuery(String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + log.debug("loaded dedup configuration from IS profile: " + conf); + final DedupConfig dedupConfig = DedupConfig.load(conf); + dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } + } diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json index 9bdddef8a8..1582739d4d 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json @@ -6,33 +6,27 @@ "paramRequired": true }, { - "paramName": "s", - 
"paramLongName": "sourcePath", + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "address for the LookUp", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "rawGraphBasePath", "paramDescription": "the base path of the raw graph", "paramRequired": true }, { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped (directory in the sourcePath)", - "paramRequired": true - }, - { - "paramName": "c", - "paramLongName": "dedupConf", - "paramDescription": "list of dedup configuration to be used", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "target base path to save dedup result (actions)", - "paramRequired": true - }, - { - "paramName": "rs", + "paramName": "o", "paramLongName": "rawSet", - "paramDescription": "the raw set to be saved (directory in the targetPath)", + "paramDescription": "the raw set to be saved (full path)", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml index 1dede2c70d..5daa12ce55 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml @@ -1,19 +1,11 @@ - sourcePath + rawGraphBasePath the raw graph base path - entity - the entity that should be processed - - - dedupConf - the (list of) dedup Configuration(s) - - - targetPath + actionSetBasePath the output base path @@ -28,6 +20,14 @@ agentName the agent name + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + sparkDriverMemory memory for driver process @@ -72,13 +72,12 @@ spark.sql.warehouse.dir="/user/hive/warehouse" -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - --rawSet${rawSet} - --agentId${agentId} - --agentName${agentName} + --i${rawGraphBasePath} + --o${rawSet} + --ai${agentId} + --an${agentName} + --la${isLookUpUrl} + --asi${actionSetId} diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index 12bba7c1e9..abb00d27cb 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -20,13 +20,11 @@ import java.util.Set; public class SparkCreateDedupTest { String configuration; - String configuration2; - String entity = "publication"; + String entity = "organization"; @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org1.curr.conf.json")); - configuration2 = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org2.curr.conf.json")); + configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); } @Test @@ -48,11 +46,12 @@ public class SparkCreateDedupTest { "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, - "-c", 
ArgumentApplicationParser.compressArgument(configuration) + "@@@" + ArgumentApplicationParser.compressArgument(configuration2), - "-t", "/tmp/dedup", - "-rs", "rawset_test", + "-c", ArgumentApplicationParser.compressArgument(configuration), + "-rs", "/tmp/dedup/rawset_test", "-ai", "agentId", - "-an", "agentName" + "-an", "agentName", + "-asi", "dedup-similarity-result-levenstein", + "-la", "lookupurl", }); } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 31b200c725..726f2b8997 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -3,6 +3,7 @@ "threshold" : "0.99", "dedupRun" : "001", "entityType" : "organization", + "subEntityValue": "organization", "orderField" : "legalname", "queueMaxSize" : "2000", "groupMaxSize" : "50", From 4e82a24af29bec443f30c37cbc0ed56d3051c874 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 19 Mar 2020 15:01:07 +0100 Subject: [PATCH 03/17] minor changes and implementation of the create connected components action --- .../java/eu/dnetlib/dedup/DedupUtility.java | 81 +++++++------ .../dedup/SparkCreateConnectedComponent.java | 9 +- .../dedup/SparkCreateConnectedComponent2.java | 100 +++++++++++++++++ .../eu/dnetlib/dedup/SparkCreateSimRels.java | 4 +- .../eu/dnetlib/dedup/SparkCreateSimRels2.java | 106 ++++++------------ .../dhp/dedup/createCC_parameters.json | 38 +++++++ ...ers.json => createSimRels_parameters.json} | 12 +- .../dhp/dedup/oozie_app/DuplicateScanWf.xml | 26 ++--- 8 files changed, 231 insertions(+), 145 deletions(-) create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{dedup_parameters.json => createSimRels_parameters.json} (77%) diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java index 3bed74f862..94a328533b 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -4,6 +4,9 @@ import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; @@ -20,9 +23,14 @@ import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; import scala.Tuple2; import java.io.IOException; +import java.io.StringReader; import java.io.StringWriter; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; @@ -54,38 +62,6 @@ public class DedupUtility { return accumulators; } - public static JavaRDD 
loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } - - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } - - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); - - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - - } - - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); } @@ -150,12 +126,12 @@ public class DedupUtility { return String.format("%s/%s", basePath, entityType); } - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s_simRel", basePath, entityType); + public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) { + return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); } - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s_mergeRel", basePath, entityType); + public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } private static Double sim(Author a, Author b) { @@ -216,4 +192,37 @@ public class DedupUtility { return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } + + public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } + + return configurations; + + } + + private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = + isLookUpService.getResourceProfileByQuery(String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + final DedupConfig dedupConfig = DedupConfig.load(conf); + 
dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 16e112c252..bdfd2c572b 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -1,8 +1,5 @@ package eu.dnetlib.dedup; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dedup.graph.ConnectedComponent; import eu.dnetlib.dedup.graph.GraphProcessor; @@ -29,7 +26,7 @@ import java.util.List; public class SparkCreateConnectedComponent { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); parser.parseArgument(args); final SparkSession spark = SparkSession .builder() @@ -50,7 +47,7 @@ public class SparkCreateConnectedComponent { s -> new Tuple2(getHashcode(s), s) ); - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath, "",entity)).as(Encoders.bean(Relation.class)); final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> @@ -70,7 +67,7 @@ public class SparkCreateConnectedComponent { tmp.add(r); return tmp.stream(); }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity)); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,"",entity)); } public static long getHashcode(final String id) { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java new file mode 100644 index 0000000000..ad3f6efc02 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dedup; + +import com.google.common.hash.Hashing; +import eu.dnetlib.dedup.graph.ConnectedComponent; +import eu.dnetlib.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import 
org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.graphx.Edge; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.List; + +public class SparkCreateConnectedComponent2 { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateConnectedComponent2().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + final JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) + s -> new Tuple2(getHashcode(s), s) + ); + + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); + final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); + final Dataset mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction) c -> + c.getDocIds() + .stream() + .flatMap(id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + tmp.add(r); + return tmp.stream(); + }).iterator()).rdd(), Encoders.bean(Relation.class)); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity)); + } + } + } + + public static long getHashcode(final String id) { + return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels2.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index 831e45daf4..543dae8e9b 100644 --- 
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,6 +1,5 @@ package eu.dnetlib.dedup; -import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.pace.config.DedupConfig; @@ -10,7 +9,6 @@ import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import scala.Tuple2; @@ -29,7 +27,7 @@ import java.util.List; public class SparkCreateSimRels { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); parser.parseArgument(args); final SparkSession spark = SparkSession .builder() diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java index 3892bc2b0c..4f5458a24a 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java @@ -22,6 +22,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -39,7 +40,7 @@ public class SparkCreateSimRels2 implements Serializable { private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class); public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); parser.parseArgument(args); new SparkCreateSimRels2().run(parser); @@ -48,12 +49,11 @@ public class SparkCreateSimRels2 implements Serializable { private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { //read oozie parameters - final String rawGraphBasePath = parser.get("rawGraphBasePath"); + final String graphBasePath = parser.get("graphBasePath"); final String rawSet = parser.get("rawSet"); - final String agentId = parser.get("agentId"); - final String agentName = parser.get("agentName"); final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); try (SparkSession spark = getSparkSession(parser)) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -62,11 +62,11 @@ public class SparkCreateSimRels2 implements Serializable { 
JavaRDD> simRel = sc.emptyRDD(); //for each dedup configuration - for (DedupConfig dedupConf: getConfigurations(isLookUpUrl, actionSetId)) { + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { final String entity = dedupConf.getWf().getEntityType(); final String subEntity = dedupConf.getWf().getSubEntityValue(); - JavaPairRDD mapDocument = sc.textFile(rawGraphBasePath + "/" + subEntity) + JavaPairRDD mapDocument = sc.textFile(graphBasePath + "/" + subEntity) .mapToPair(s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); return new Tuple2<>(d.getIdentifier(), d); @@ -78,59 +78,54 @@ public class SparkCreateSimRels2 implements Serializable { //create relations by comparing only elements in the same group final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2())); + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); + + //save the simrel in the workingdir + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); //create atomic actions JavaRDD> newSimRels = relationsRDD - .mapToPair(rel -> - new Tuple2<>( - createActionId(rel.getSource(), rel.getTarget(), entity), //TODO update the type, maybe take it from the configuration? - new AtomicAction(rawSet, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), new ObjectMapper().writeValueAsString(rel).getBytes()))) - .map(aa -> new Tuple2<>(aa._1(), transformAction(aa._2()))); + .map(this::createSequenceFileRow); simRel = simRel.union(newSimRels); - } simRel.mapToPair(r -> r) .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - } } - public Text createActionId(String source, String target, String entity) { - - String type = ""; - - switch(entity){ - case "result": - type = "resultResult_dedupSimilarity_isSimilarTo"; - break; - case "organization": - type = "organizationOrganization_dedupSimilarity_isSimilarTo"; - break; - default: - break; - } - - String id = source + "@" + type + "@" + target; - - return new Text(id); - } - - public Text transformAction(AtomicAction aa) throws JsonProcessingException { + public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException { ObjectMapper mapper = new ObjectMapper(); - return new Text(mapper.writeValueAsString(aa)); + String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget(); + //TODO do be replaced by the new implementation of AtomicAction + AtomicAction aa = new AtomicAction("rawSet", new Agent("agentId", "agentName", Agent.AGENT_TYPE.service), relation.getSource(), relation.getRelClass(), relation.getTarget(), new ObjectMapper().writeValueAsString(relation).getBytes()); + + return new Tuple2<>( + new Text(id), + new Text(mapper.writeValueAsString(aa)) + ); } - public Relation createSimRel(String source, String target){ + public Relation createSimRel(String source, String target, String entity){ final Relation r = new Relation(); r.setSource(source); r.setTarget(target); - r.setRelClass("isSimilarTo"); + + switch(entity){ + case "result": + r.setRelClass("resultResult_dedupSimilarity_isSimilarTo"); + break; + case "organization": + r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo"); + break; + default: + 
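                    A quick sketch, not part of this changeset, of the row shape produced by the createSimRel/createSequenceFileRow helpers introduced in this hunk for a pair of publication identifiers; the ids are invented placeholders and JsonProcessingException is assumed to be handled by the caller:

                        Relation rel = new SparkCreateSimRels2().createSimRel("pubId1", "pubId2", "result");
                        // rel.getRelClass() -> "resultResult_dedupSimilarity_isSimilarTo"
                        Tuple2<Text, Text> row = new SparkCreateSimRels2().createSequenceFileRow(rel);
                        // row._1() -> "pubId1@resultResult_dedupSimilarity_isSimilarTo@pubId2"
                        // row._2() -> the JSON-serialised AtomicAction wrapping the Relation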
r.setRelClass("isSimilarTo"); + break; + } return r; } @@ -146,39 +141,4 @@ public class SparkCreateSimRels2 implements Serializable { .getOrCreate(); } - public List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); - - final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); - log.info("loading dedup orchestration: " + xquery); - - String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); - - final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); - - final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); - final List configurations = new ArrayList<>(); - - for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { - configurations.add(loadConfig(isLookUpService, actionSetId, o)); - } - - return configurations; - - } - - private DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) - throws ISLookUpException { - final Element s = (Element) o; - final String configProfileId = s.attributeValue("id"); - final String conf = - isLookUpService.getResourceProfileByQuery(String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); - log.debug("loaded dedup configuration from IS profile: " + conf); - final DedupConfig dedupConfig = DedupConfig.load(conf); - dedupConfig.getWf().setConfigurationId(actionSetId); - return dedupConfig; - } - } diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json new file mode 100644 index 0000000000..bcd2ff974c --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "rawSet", + "paramDescription": "the raw set to be saved (full path)", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url for the lookup service", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path for the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json similarity index 77% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json index 1582739d4d..83a0301593 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ 
b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json @@ -30,15 +30,9 @@ "paramRequired": true }, { - "paramName": "ai", - "paramLongName": "agentId", - "paramDescription": "the agent identifier", - "paramRequired": true - }, - { - "paramName": "an", - "paramLongName": "agentName", - "paramDescription": "the agent name", + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml index 5daa12ce55..5ab6c9e47f 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml @@ -1,25 +1,13 @@ - rawGraphBasePath + graphBasePath the raw graph base path - - actionSetBasePath - the output base path - rawSet the output directory in the targetPath - - agentId - the agent identifier - - - agentName - the agent name - isLookUpUrl the address of the lookUp service @@ -28,6 +16,10 @@ actionSetId id of the actionSet + + workingPath + path for the working directory + sparkDriverMemory memory for driver process @@ -42,15 +34,15 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + - + @@ -74,8 +66,6 @@ -mtyarn-cluster --i${rawGraphBasePath} --o${rawSet} - --ai${agentId} - --an${agentName} --la${isLookUpUrl} --asi${actionSetId} From 6e0fb8efa0d26c99f1e2414915c2392c631b9e7e Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 19 Mar 2020 15:08:03 +0100 Subject: [PATCH 04/17] minor changes --- .../main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java | 4 ++++ .../dnetlib/dhp/schema/action/AtomicActionDeserializer.java | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java create mode 100644 dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java new file mode 100644 index 0000000000..74bb3bc7b7 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -0,0 +1,4 @@ +package eu.dnetlib.dhp.schema.action; + +public class AtomicAction { +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java new file mode 100644 index 0000000000..86fe2c4211 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -0,0 +1,4 @@ +package eu.dnetlib.dhp.schema.action; + +public class AtomicActionDeserializer { +} From 6d879e2ee18b39638aef80fe19526991be1d3390 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 19 Mar 2020 15:10:42 +0100 Subject: [PATCH 05/17] integration of the new AtomicAction class --- .../dhp/schema/action/AtomicAction.java | 38 ++++++++++++++++++- .../action/AtomicActionDeserializer.java | 29 +++++++++++++- 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java index 
74bb3bc7b7..f42d431a99 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -1,4 +1,38 @@ package eu.dnetlib.dhp.schema.action; -public class AtomicAction { -} +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +import java.io.Serializable; + +@JsonDeserialize(using = AtomicActionDeserializer.class) +public class AtomicAction implements Serializable { + + private Class clazz; + + private T payload; + + public AtomicAction() { + } + + public AtomicAction(Class clazz, T payload) { + this.clazz = clazz; + this.payload = payload; + } + + public Class getClazz() { + return clazz; + } + + public void setClazz(Class clazz) { + this.clazz = clazz; + } + + public T getPayload() { + return payload; + } + + public void setPayload(T payload) { + this.payload = payload; + } +} \ No newline at end of file diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java index 86fe2c4211..c09c264d43 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -1,4 +1,29 @@ package eu.dnetlib.dhp.schema.action; -public class AtomicActionDeserializer { -} +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +import java.io.IOException; + +public class AtomicActionDeserializer extends JsonDeserializer { + + @Override + public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException { + JsonNode node = jp.getCodec().readTree(jp); + String classTag = node.get("clazz").asText(); + JsonNode payload = node.get("payload"); + ObjectMapper mapper = new ObjectMapper(); + + try { + final Class clazz = Class.forName(classTag); + return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } +} \ No newline at end of file From e16e644faff7d30222dbfd358421941a6e56e411 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 20 Mar 2020 13:01:56 +0100 Subject: [PATCH 06/17] implementation of the workflow for entity update and for relations update --- .../java/eu/dnetlib/dedup/DedupUtility.java | 4 + .../dedup/SparkCreateConnectedComponent.java | 100 +++++++---- .../dedup/SparkCreateConnectedComponent2.java | 100 ----------- .../dnetlib/dedup/SparkCreateDedupRecord.java | 56 ++++-- .../eu/dnetlib/dedup/SparkCreateSimRels.java | 152 +++++++++++----- .../eu/dnetlib/dedup/SparkCreateSimRels2.java | 144 --------------- .../dnetlib/dedup/SparkPropagateRelation.java | 169 ++++++++++++++++++ .../eu/dnetlib/dedup/SparkUpdateEntity.java | 121 +++++++++++++ .../dedup/createDedupRecord_parameters.json | 32 ++++ .../dedup/oozie_app/BuildRootRecordsWf.xml | 129 +++++++++++++ .../dhp/dedup/oozie_app/DuplicateScanWf.xml | 5 +- .../dhp/dedup/oozie_app/UpdateRelationsWf.xml | 68 +++++++ .../dedup/propagateRelation_parameters.json | 26 +++ 
.../dhp/dedup/updateEntity_parameters.json | 38 ++++ .../dnetlib/dedup/SparkCreateDedupTest.java | 31 ++-- 15 files changed, 809 insertions(+), 366 deletions(-) delete mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java delete mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java index 94a328533b..ca390743e1 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -122,6 +122,10 @@ public class DedupUtility { }); } + public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); + } + public static String createEntityPath(final String basePath, final String entityType) { return String.format("%s/%s", basePath, entityType); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index bdfd2c572b..411913cdf0 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -5,9 +5,11 @@ import eu.dnetlib.dedup.graph.ConnectedComponent; import eu.dnetlib.dedup.graph.GraphProcessor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -18,6 +20,7 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; import scala.Tuple2; import java.util.ArrayList; @@ -26,51 +29,72 @@ import java.util.List; public class SparkCreateConnectedComponent { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); + final ArgumentApplicationParser 
parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + new SparkCreateConnectedComponent().run(parser); + } - final JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) - s -> new Tuple2(getHashcode(s), s) - ); + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath, "",entity)).as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); - final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> - c.getDocIds() - .stream() - .flatMap(id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,"",entity)); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + final JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) + s -> new Tuple2(getHashcode(s), s) + ); + + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); + final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); + final Dataset mergeRelation = 
spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction) c -> + c.getDocIds() + .stream() + .flatMap(id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + tmp.add(r); + return tmp.stream(); + }).iterator()).rdd(), Encoders.bean(Relation.class)); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity)); + } + } } public static long getHashcode(final String id) { return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java deleted file mode 100644 index ad3f6efc02..0000000000 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent2.java +++ /dev/null @@ -1,100 +0,0 @@ -package eu.dnetlib.dedup; - -import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; -import scala.Tuple2; - -import java.util.ArrayList; -import java.util.List; - -public class SparkCreateConnectedComponent2 { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json"))); - parser.parseArgument(args); - - new SparkCreateConnectedComponent2().run(parser); - } - - private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { - - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - - try (SparkSession spark = getSparkSession(parser)) { - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - - for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { - final String entity = dedupConf.getWf().getEntityType(); - final String subEntity = dedupConf.getWf().getSubEntityValue(); - - final 
JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) - s -> new Tuple2(getHashcode(s), s) - ); - - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); - final Dataset mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction) c -> - c.getDocIds() - .stream() - .flatMap(id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity)); - } - } - } - - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); - } - - private static SparkSession getSparkSession(ArgumentApplicationParser parser) { - SparkConf conf = new SparkConf(); - - return SparkSession - .builder() - .appName(SparkCreateSimRels2.class.getSimpleName()) - .master(parser.get("master")) - .config(conf) - .enableHiveSupport() - .getOrCreate(); - } -} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index db23065263..77c8e04e99 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -3,37 +3,57 @@ package eu.dnetlib.dedup; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.pace.config.DedupConfig; import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; public class SparkCreateDedupRecord { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession + + new SparkCreateDedupRecord().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = 
parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + String subEntity = dedupConf.getWf().getSubEntityValue(); + + final JavaRDD dedupRecord = + DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity), DedupUtility.createEntityPath(graphBasePath, subEntity), OafEntityType.valueOf(subEntity), dedupConf); + dedupRecord.map(r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }).saveAsTextFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity)); + } + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession .builder() .appName(SparkCreateDedupRecord.class.getSimpleName()) .master(parser.get("master")) + .config(conf) + .enableHiveSupport() .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaRDD dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); - dedupRecord.map(r-> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json"); - - } - } + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index 543dae8e9b..4f25d620bf 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,71 +1,135 @@ package eu.dnetlib.dedup; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; import scala.Tuple2; +import java.io.Serializable; import java.util.List; +public class 
SparkCreateSimRels implements Serializable { -/** - * This Spark class creates similarity relations between entities, saving result - * - * param request: - * sourcePath - * entityType - * target Path - */ -public class SparkCreateSimRels { + private static final Log log = LogFactory.getLog(SparkCreateSimRels.class); public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession + + new SparkCreateSimRels().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + //read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String rawSet = parser.get("rawSet"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //create empty sequenceFile for the accumulation + JavaRDD> simRel = sc.emptyRDD(); + + //for each dedup configuration + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + JavaPairRDD mapDocument = sc.textFile(graphBasePath + "/" + subEntity) + .mapToPair(s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + //create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + + //create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); + + //save the simrel in the workingdir + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); + + //create atomic actions + JavaRDD> newSimRels = relationsRDD + .map(this::createSequenceFileRow); + + simRel = simRel.union(newSimRels); + } + + simRel.mapToPair(r -> r) + .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + } + + } + + public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException { + + ObjectMapper mapper = new ObjectMapper(); + + String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget(); + AtomicAction aa = new AtomicAction<>(Relation.class, relation); + + return new Tuple2<>( + new Text(id), + new Text(mapper.writeValueAsString(aa)) + ); + } + + public Relation createSimRel(String source, String target, String entity){ + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + + switch(entity){ + case "result": + r.setRelClass("resultResult_dedupSimilarity_isSimilarTo"); + break; + case "organization": + r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo"); + break; + default: + r.setRelClass("isSimilarTo"); + break; + } + return r; + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + 
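                    A minimal round-trip sketch, not part of this changeset, for the new eu.dnetlib.dhp.schema.action.AtomicAction used by createSequenceFileRow above; it assumes Jackson's default serialisation of the clazz field to its fully-qualified class name, which is what AtomicActionDeserializer resolves back via Class.forName. The ids are placeholders.

                        ObjectMapper mapper = new ObjectMapper();
                        Relation rel = new Relation();
                        rel.setSource("sourceId");   // placeholder id
                        rel.setTarget("targetId");   // placeholder id
                        rel.setRelClass("resultResult_dedupSimilarity_isSimilarTo");

                        String json = mapper.writeValueAsString(new AtomicAction<>(Relation.class, rel));
                        // json -> {"clazz":"eu.dnetlib.dhp.schema.oaf.Relation","payload":{...}}
                        AtomicAction decoded = mapper.readValue(json, AtomicAction.class); // routed through AtomicActionDeserializer
                        Relation payload = (Relation) decoded.getPayload();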
SparkConf conf = new SparkConf(); + + return SparkSession .builder() .appName(SparkCreateSimRels.class.getSimpleName()) .master(parser.get("master")) + .config(conf) + .enableHiveSupport() .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final long total = sc.textFile(inputPath + "/" + entity).count(); - - JavaPairRDD mapDocument = sc.textFile(inputPath + "/" + entity) - .mapToPair(s->{ - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s); - return new Tuple2<>(d.getIdentifier(), d);}); - - //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); -// JavaPairRDD> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf); - - //create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); -// final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, dedupConf); - - final JavaRDD isSimilarToRDD = dedupRels.map(simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); - - spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); - } -} \ No newline at end of file + +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java deleted file mode 100644 index 4f5458a24a..0000000000 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java +++ /dev/null @@ -1,144 +0,0 @@ -package eu.dnetlib.dedup; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.actionmanager.actions.AtomicAction; -import eu.dnetlib.actionmanager.common.Agent; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import scala.Tuple2; - -import java.io.Serializable; -import java.io.StringReader; -import java.util.ArrayList; 
-import java.util.List; - -public class SparkCreateSimRels2 implements Serializable { - - private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class); - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); - parser.parseArgument(args); - - new SparkCreateSimRels2().run(parser); - } - - private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { - - //read oozie parameters - final String graphBasePath = parser.get("graphBasePath"); - final String rawSet = parser.get("rawSet"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); - - try (SparkSession spark = getSparkSession(parser)) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - //create empty sequenceFile for the accumulation - JavaRDD> simRel = sc.emptyRDD(); - - //for each dedup configuration - for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { - final String entity = dedupConf.getWf().getEntityType(); - final String subEntity = dedupConf.getWf().getSubEntityValue(); - - JavaPairRDD mapDocument = sc.textFile(graphBasePath + "/" + subEntity) - .mapToPair(s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); - - //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); - - //create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - - JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); - - //save the simrel in the workingdir - spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); - - //create atomic actions - JavaRDD> newSimRels = relationsRDD - .map(this::createSequenceFileRow); - - simRel = simRel.union(newSimRels); - } - - simRel.mapToPair(r -> r) - .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - } - - } - - public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException { - - ObjectMapper mapper = new ObjectMapper(); - - String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget(); - //TODO do be replaced by the new implementation of AtomicAction - AtomicAction aa = new AtomicAction("rawSet", new Agent("agentId", "agentName", Agent.AGENT_TYPE.service), relation.getSource(), relation.getRelClass(), relation.getTarget(), new ObjectMapper().writeValueAsString(relation).getBytes()); - - return new Tuple2<>( - new Text(id), - new Text(mapper.writeValueAsString(aa)) - ); - } - - public Relation createSimRel(String source, String target, String entity){ - final Relation r = new Relation(); - r.setSource(source); - r.setTarget(target); - - switch(entity){ - case "result": - r.setRelClass("resultResult_dedupSimilarity_isSimilarTo"); - break; - case "organization": - r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo"); - break; - default: - r.setRelClass("isSimilarTo"); - break; - } - return r; - } - - private static SparkSession 
getSparkSession(ArgumentApplicationParser parser) { - SparkConf conf = new SparkConf(); - - return SparkSession - .builder() - .appName(SparkCreateSimRels2.class.getSimpleName()) - .master(parser.get("master")) - .config(conf) - .enableHiveSupport() - .getOrCreate(); - } - -} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java new file mode 100644 index 0000000000..12d9f31b33 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java @@ -0,0 +1,169 @@ +package eu.dnetlib.dedup; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.Optional; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; + +public class SparkPropagateRelation { + + enum FieldType { + SOURCE, + TARGET + } + + final static String SOURCEJSONPATH = "$.source"; + final static String TARGETJSONPATH = "$.target"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json"))); + parser.parseArgument(args); + + new SparkPropagateRelation().run(parser); + } + + public void run(ArgumentApplicationParser parser) { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + final Dataset mergeRels = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", "*")).as(Encoders.bean(Relation.class)); + + final JavaPairRDD mergedIds = mergeRels + .where("relClass == 'merges'") + .select(mergeRels.col("source"), mergeRels.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(1), r.getString(0))); + + JavaRDD relations = sc.textFile(DedupUtility.createEntityPath(graphBasePath, "relation")); + + JavaRDD newRels = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.SOURCE); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if 
(v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.TARGET); + } + return v1._2()._1(); + }).filter(SparkPropagateRelation::containsDedup) + .repartition(500); + + //update deleted by inference + relations = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .repartition(500); + + newRels.union(relations).repartition(1000) + .saveAsTextFile(DedupUtility.createEntityPath(dedupGraphPath, "relation"), GzipCodec.class); + } + } + + private static boolean containsDedup(final String json) { + final String source = MapDocumentUtil.getJPathString(SOURCEJSONPATH, json); + final String target = MapDocumentUtil.getJPathString(TARGETJSONPATH, json); + + return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); + } + + private static String replaceField(final String json, final String id, final FieldType type) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Relation relation = mapper.readValue(json, Relation.class); + if (relation.getDataInfo() == null) + relation.setDataInfo(new DataInfo()); + relation.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + relation.setSource(id); + return mapper.writeValueAsString(relation); + case TARGET: + relation.setTarget(id); + return mapper.writeValueAsString(relation); + default: + throw new IllegalArgumentException(""); + } + } catch (IOException e) { + throw new RuntimeException("unable to deserialize json relation: " + json, e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkPropagateRelation.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java new file mode 100644 index 0000000000..3fde1bdae2 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java @@ -0,0 +1,121 @@ +package eu.dnetlib.dedup; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.io.IOException; + +public class SparkUpdateEntity { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json"))); + parser.parseArgument(args); + + new SparkUpdateEntity().run(parser); + } + + public void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf : DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + + String subEntity = dedupConf.getWf().getSubEntityValue(); + + final Dataset df = spark.read().load(DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = df + .where("relClass == 'merges'") + .select(df.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + + final JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)); + + final JavaRDD dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity)); + + JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s), s)); + + Class mainClass; + switch (subEntity) { + case "publication": + mainClass = Publication.class; + break; + case "dataset": + mainClass = eu.dnetlib.dhp.schema.oaf.Dataset.class; + break; + case "datasource": + mainClass = Datasource.class; + break; + case "software": + mainClass = Software.class; + break; + case "organization": + mainClass = Organization.class; + break; + case "otherresearchproduct": + mainClass = OtherResearchProduct.class; + break; + default: + throw new IllegalArgumentException("Illegal type " + subEntity); + } + + JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? 
updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1()); + map.union(dedupEntity).saveAsTextFile(dedupGraphPath + "/" + subEntity, GzipCodec.class); + } + } + + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkUpdateEntity.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json new file mode 100644 index 0000000000..f7bf5e518a --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url of the lookup service", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "the id of the actionset (orchestrator)", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml new file mode 100644 index 0000000000..477e987916 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml @@ -0,0 +1,129 @@ + + + + graphBasePath + the raw graph base path + + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + + + workingPath + path of the working directory + + + dedupGraphPath + path of the dedup graph + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Merge Relations + eu.dnetlib.dedup.SparkCreateConnectedComponent + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + 
spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Dedup Record + eu.dnetlib.dedup.SparkCreateDedupRecord + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Dedup Record + eu.dnetlib.dedup.SparkUpdateEntity + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + --o${dedupGraphPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml index 5ab6c9e47f..a685db1e88 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml @@ -55,7 +55,7 @@ yarn-cluster cluster Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels2 + eu.dnetlib.dedup.SparkCreateSimRels dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf @@ -64,10 +64,11 @@ spark.sql.warehouse.dir="/user/hive/warehouse" -mtyarn-cluster - --i${rawGraphBasePath} + --i${graphBasePath} --o${rawSet} --la${isLookUpUrl} --asi${actionSetId} + --w${workingPath} diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml new file mode 100644 index 0000000000..c4b17860ea --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml @@ -0,0 +1,68 @@ + + + + graphBasePath + the raw graph base path + + + workingPath + path for the working directory + + + dedupGraphPath + path of the dedup graph + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Update Relations + eu.dnetlib.dedup.SparkPropagateRelation + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + 
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --o${dedupGraphPath} + --w${workingPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json new file mode 100644 index 0000000000..721a783e12 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json new file mode 100644 index 0000000000..76aea05374 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json @@ -0,0 +1,38 @@ +[ +{ + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true +}, +{ + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true +}, +{ + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true +}, +{ + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescriptions": "the url of the lookup service", + "paramRequired": true +}, +{ + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescriptions": "the id of the actionset (orchestrator)", + "paramRequired": true +}, + { + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index abb00d27cb..09f8a0fd6a 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -1,21 +1,14 @@ package eu.dnetlib.dedup; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Publication; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; -import java.io.File; import java.io.IOException; -import java.util.HashSet; -import java.util.List; -import java.util.Set; public class 
SparkCreateDedupTest { @@ -27,22 +20,10 @@ public class SparkCreateDedupTest { configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); } - @Test - @Ignore - public void createSimRelsTest() throws Exception { - SparkCreateSimRels.main(new String[] { - "-mt", "local[*]", - "-s", "/Users/miconis/dumps", - "-e", entity, - "-c", ArgumentApplicationParser.compressArgument(configuration), - "-t", "/tmp/dedup", - }); - } - @Test @Ignore public void createSimRelsTest2() throws Exception { - SparkCreateSimRels2.main(new String[] { + SparkCreateSimRels.main(new String[] { "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -98,4 +79,14 @@ public class SparkCreateDedupTest { System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); } + @Test + public void testJoinEntities() throws Exception{ + SparkJoinEntities.main(new String[] { + "-mt", "local[*]", + "-i", "/tmp/dedup", + "-w", "/tmp/dedup", + "-o", "/tmp/dedup", + }); + } + } From 6cb0a9bff07bc1404c4ff46815dc01ac5a7f66f6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Mar 2020 16:48:14 +0100 Subject: [PATCH 07/17] dedup wf directory structure aligned with project commons --- dhp-workflows/dhp-dedup/pom.xml | 4 - .../dnetlib/{ => dhp}/dedup/DatePicker.java | 2 +- .../{ => dhp}/dedup/DedupRecordFactory.java | 5 +- .../dnetlib/{ => dhp}/dedup/DedupUtility.java | 13 +- .../eu/dnetlib/{ => dhp}/dedup/Deduper.java | 3 +- .../{ => dhp}/dedup/OafEntityType.java | 2 +- .../dedup/SparkCreateConnectedComponent.java | 8 +- .../dedup/SparkCreateDedupRecord.java | 2 +- .../{ => dhp}/dedup/SparkCreateSimRels.java | 8 +- .../dedup/SparkPropagateRelation.java | 2 +- .../{ => dhp}/dedup/SparkReporter.java | 2 +- .../{ => dhp}/dedup/SparkUpdateEntity.java | 2 +- .../dedup/graph/ConnectedComponent.java | 4 +- .../dedup/graph/GraphProcessor.scala | 2 +- .../dnetlib/dhp/dedup/oozie_app/workflow.xml | 126 ---------- .../oozie_app/config-default.xml | 8 - .../oozie_app/workflow.xml} | 2 +- .../dedup/roots/oozie_app/config-default.xml | 18 ++ .../oozie_app/workflow.xml} | 6 +- .../dedup/scan/oozie_app/config-default.xml | 18 ++ .../oozie_app/workflow.xml} | 34 ++- .../{ => dhp}/dedup/MergeAuthorTest.java | 2 +- .../{ => dhp}/dedup/SparkCreateDedupTest.java | 16 +- .../{ => dhp}/dedup/jpath/JsonPathTest.java | 2 +- .../{ => dhp}/dedup/conf/org.curr.conf.json | 0 .../{ => dhp}/dedup/conf/pub.curr.conf.json | 0 .../dnetlib/{ => dhp}/dedup/conf/sample.json | 0 .../{ => dhp}/dedup/json/authors_merge.json | 0 .../job-override.properties | 10 +- pom.xml | 225 +++++++++--------- 30 files changed, 204 insertions(+), 322 deletions(-) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DatePicker.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DedupRecordFactory.java (98%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DedupUtility.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/Deduper.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/OafEntityType.java (83%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateConnectedComponent.java (96%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateDedupRecord.java (98%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateSimRels.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => 
dhp}/dedup/SparkPropagateRelation.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkReporter.java (97%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkUpdateEntity.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/graph/ConnectedComponent.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/graph/GraphProcessor.scala (96%) delete mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{ => relations}/oozie_app/config-default.xml (62%) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/UpdateRelationsWf.xml => relations/oozie_app/workflow.xml} (96%) create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/BuildRootRecordsWf.xml => roots/oozie_app/workflow.xml} (95%) create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/DuplicateScanWf.xml => scan/oozie_app/workflow.xml} (68%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/MergeAuthorTest.java (97%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateDedupTest.java (84%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/jpath/JsonPathTest.java (95%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/org.curr.conf.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/pub.curr.conf.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/sample.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/json/authors_merge.json (100%) diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index cc27952fa4..f39bf62f0d 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -82,10 +82,6 @@ com.fasterxml.jackson.core jackson-core - - eu.dnetlib - dnet-actionmanager-common - diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java index 73f178edce..bd5c1118e9 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Field; import org.apache.commons.lang.StringUtils; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java similarity index 98% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java index 5f81669e95..2fcac45fad 100644 --- 
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java @@ -1,11 +1,9 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.lang.NotImplementedException; -import org.apache.commons.lang.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper; import scala.Tuple2; import java.util.Collection; -import java.util.Random; import static java.util.stream.Collectors.toMap; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java index ca390743e1..3d505888a1 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; @@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -29,15 +22,11 @@ import org.dom4j.Element; import org.dom4j.io.SAXReader; import scala.Tuple2; -import java.io.IOException; import java.io.StringReader; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; public class DedupUtility { private static final Double THRESHOLD = 0.95; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java index 7206f892f7..dda71fbcf7 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; diff --git 
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java similarity index 83% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java index fb347ed516..66f0b3ce67 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; public enum OafEntityType { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java index 411913cdf0..75b1dd01c2 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java @@ -1,8 +1,8 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.dedup.graph.GraphProcessor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent { } public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); + return Hashing.murmur3_128().hashString(id).asLong(); } private static SparkSession getSparkSession(ArgumentApplicationParser parser) { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java similarity index 98% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java index 77c8e04e99..0ce12d10a7 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java index 4f25d620bf..8298f98679 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; 
+package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; @@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable { JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); //save the simrel in the workingdir - spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); //create atomic actions JavaRDD> newSimRels = relationsRDD @@ -128,7 +131,6 @@ public class SparkCreateSimRels implements Serializable { .appName(SparkCreateSimRels.class.getSimpleName()) .master(parser.get("master")) .config(conf) - .enableHiveSupport() .getOrCreate(); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java index 12d9f31b33..5c7be2817e 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java index 165a10b25a..c83a66e700 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.util.Reporter; import org.apache.commons.logging.Log; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java index 3fde1bdae2..0c9890b03e 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java similarity index 95% rename from 
dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java index 27a61c02d7..dd1a370c5c 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java @@ -1,7 +1,7 @@ -package eu.dnetlib.dedup.graph; +package eu.dnetlib.dhp.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.dhp.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala index 38c6951528..80b0b9ef4c 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.graph +package eu.dnetlib.dhp.dedup.graph import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml deleted file mode 100644 index 5a00a59676..0000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - sourcePath - the source path - - - entity - the entity that should be processed - - - dedupConf - the dedup Configuration - - - targetPath - the target path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Connected Components - eu.dnetlib.dedup.SparkCreateConnectedComponent - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - 
--sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --dedupPath${dedupPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml similarity index 62% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml index fcab9dd00a..2e0ed9aeea 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml @@ -15,12 +15,4 @@ oozie.action.sharelib.for.spark spark2 - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hive_db_name - openaire - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml index c4b17860ea..749af6ecbe 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml @@ -47,7 +47,7 @@ yarn-cluster cluster Update Relations - eu.dnetlib.dedup.SparkPropagateRelation + eu.dnetlib.dhp.dedup.SparkPropagateRelation dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml similarity index 95% rename from 
dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml index 477e987916..457e62818f 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml @@ -56,7 +56,7 @@ yarn-cluster cluster Create Merge Relations - eu.dnetlib.dedup.SparkCreateConnectedComponent + eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf @@ -81,7 +81,7 @@ yarn-cluster cluster Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord + eu.dnetlib.dhp.dedup.SparkCreateDedupRecord dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf @@ -106,7 +106,7 @@ yarn-cluster cluster Create Dedup Record - eu.dnetlib.dedup.SparkUpdateEntity + eu.dnetlib.dhp.dedup.SparkUpdateEntity dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml similarity index 68% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml index a685db1e88..01498ce040 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -34,6 +34,21 @@ + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + @@ -50,20 +65,19 @@ - ${jobTracker} - ${nameNode} - yarn-cluster + yarn cluster Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels + eu.dnetlib.dhp.dedup.SparkCreateSimRels dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + 
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - -mtyarn-cluster + -mtyarn --i${graphBasePath} --o${rawSet} --la${isLookUpUrl} diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java index 817f2075c3..e8bfd08fd5 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.IOUtils; diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java similarity index 84% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index 09f8a0fd6a..47e446e7a8 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; @@ -74,19 +74,9 @@ public class SparkCreateDedupTest { final HashFunction hashFunction = Hashing.murmur3_128(); System.out.println( s1.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); + System.out.println(hashFunction.hashString(s1).asLong()); System.out.println( s2.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); - } - - @Test - public void testJoinEntities() throws Exception{ - SparkJoinEntities.main(new String[] { - "-mt", "local[*]", - "-i", "/tmp/dedup", - "-w", "/tmp/dedup", - "-o", "/tmp/dedup", - }); + System.out.println(hashFunction.hashString(s2).asLong()); } } diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java index 7a63cfe24e..8a88896fc1 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.jpath; +package eu.dnetlib.dhp.dedup.jpath; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json diff --git 
a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 68816c2240..8230dfc18f 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,12 +1,14 @@ -sparkDriverMemory=10G -sparkExecutorMemory=15G +sparkExecutorCoresForJoining=1 +sparkDriverMemoryForJoining=10G +sparkExecutorMemoryForJoining=15G +sparkExecutorCoresForIndexing=64 +sparkDriverMemoryForIndexing=3G +sparkExecutorMemoryForIndexing=2G #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 outputPath=/tmp/openaire_provision format=TMF batchSize=2000 -sparkExecutorCoresForJoining=128 -sparkExecutorCoresForIndexing=64 reuseRecords=false otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource \ No newline at end of file diff --git a/pom.xml b/pom.xml index fe158d9fcc..1ae078128e 100644 --- a/pom.xml +++ b/pom.xml @@ -1,6 +1,6 @@ + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 eu.dnetlib.dhp @@ -101,12 +101,12 @@ org.apache.hadoop hadoop-common - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client ${dhp.hadoop.version} provided @@ -148,6 +148,13 @@ ${dhp.commons.lang.version} + + com.google.guava + guava + ${dhp.guava.version} + + + commons-codec commons-codec @@ -167,11 +174,11 @@ provided - - net.sf.saxon - Saxon-HE - 9.9.1-6 - + + net.sf.saxon + Saxon-HE + 9.9.1-6 + dom4j @@ -192,56 +199,56 @@ - com.mycila.xmltool - xmltool - 3.3 - + com.mycila.xmltool + xmltool + 3.3 + - - org.apache.solr - solr-solrj - 7.5.0 - - - * - * - - - - - com.lucidworks.spark - spark-solr - 3.6.0 - - - * - * - - - + + org.apache.solr + solr-solrj + 7.5.0 + + + * + * + + + + + com.lucidworks.spark + spark-solr + 3.6.0 + + + * + * + + + - - org.apache.httpcomponents - httpclient - 4.5.3 - - - org.apache.httpcomponents - httpmime - 4.5.3 - - - org.noggit - noggit - 0.8 - - - org.apache.zookeeper - zookeeper 
- 3.4.11 - + + org.apache.httpcomponents + httpclient + 4.5.3 + + + org.apache.httpcomponents + httpmime + 4.5.3 + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + - + net.schmizz sshj 0.10.0 @@ -283,17 +290,17 @@ dnet-pace-core 4.0.0 - - eu.dnetlib - cnr-rmi-api - [2.0.0,3.0.0) - + + eu.dnetlib + cnr-rmi-api + [2.0.0,3.0.0) + - - org.apache.cxf - cxf-rt-transports-http - 3.1.5 - + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + javax.persistence javax.persistence-api @@ -301,36 +308,36 @@ provided - - com.rabbitmq - amqp-client - 5.6.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - - - org.mongodb - mongo-java-driver - ${mongodb.driver.version} - - - org.antlr - stringtemplate - 4.0 - + + com.rabbitmq + amqp-client + 5.6.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + org.antlr + stringtemplate + 4.0 + org.apache.oozie @@ -345,22 +352,6 @@ - - - eu.dnetlib - dnet-actionmanager-common - [6.0.0,7.0.0) - - - commons-httpclient - commons-httpclient - - - eu.dnetlib - dnet-openaireplus-mapping-utils - - - @@ -512,9 +503,9 @@ 2.4.0.cloudera2 2.9.6 3.5 + 11.0.2 2.11.12 4.12 3.4.2 - From a4c52661a01f8534da611804ec652ac7cf9dc4e5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Mar 2020 19:17:24 +0100 Subject: [PATCH 08/17] WIP: fixing dedup workflows --- dhp-workflows/dhp-dedup/pom.xml | 9 +++++++++ .../java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java | 2 -- .../eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java | 5 ++++- .../java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java | 8 +++++++- .../eu/dnetlib/dhp/dedup/createSimRels_parameters.json | 2 +- .../eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml | 4 ++++ 6 files changed, 25 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index f39bf62f0d..691fbe6d56 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -65,6 +65,15 @@ com.arakelian java-jq + + dom4j + dom4j + + + jaxen + jaxen + + eu.dnetlib diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java index 2fcac45fad..583e90ab94 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java @@ -15,8 +15,6 @@ import scala.Tuple2; import java.util.Collection; -import static java.util.stream.Collectors.toMap; - public class DedupRecordFactory { public static JavaRDD createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java index 0ce12d10a7..51d0760e04 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java @@ -34,8 +34,11 @@ public class SparkCreateDedupRecord { for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, 
actionSetId)) { String subEntity = dedupConf.getWf().getSubEntityValue(); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); + final OafEntityType entityType = OafEntityType.valueOf(subEntity); final JavaRDD dedupRecord = - DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity), DedupUtility.createEntityPath(graphBasePath, subEntity), OafEntityType.valueOf(subEntity), dedupConf); + DedupRecordFactory.createDedupRecord(sc, spark, mergeRelPath, entityPath, entityType, dedupConf); dedupRecord.map(r -> { ObjectMapper mapper = new ObjectMapper(); return mapper.writeValueAsString(r); diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java index 8298f98679..18d0d4ee6d 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -47,6 +47,12 @@ public class SparkCreateSimRels implements Serializable { final String actionSetId = parser.get("actionSetId"); final String workingPath = parser.get("workingPath"); + System.out.println(String.format("graphBasePath: '%s'", graphBasePath)); + System.out.println(String.format("rawSet: '%s'", rawSet)); + System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl)); + System.out.println(String.format("actionSetId: '%s'", actionSetId)); + System.out.println(String.format("workingPath: '%s'", workingPath)); + try (SparkSession spark = getSparkSession(parser)) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -58,7 +64,7 @@ public class SparkCreateSimRels implements Serializable { final String entity = dedupConf.getWf().getEntityType(); final String subEntity = dedupConf.getWf().getSubEntityValue(); - JavaPairRDD mapDocument = sc.textFile(graphBasePath + "/" + subEntity) + JavaPairRDD mapDocument = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) .mapToPair(s -> { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); return new Tuple2<>(d.getIdentifier(), d); diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json index 83a0301593..b8c8af699a 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json @@ -19,7 +19,7 @@ }, { "paramName": "i", - "paramLongName": "rawGraphBasePath", + "paramLongName": "graphBasePath", "paramDescription": "the base path of the raw graph", "paramRequired": true }, diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml index 01498ce040..35ed281037 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -65,6 +65,10 @@ + + + + yarn cluster Create Similarity Relations From c20e179f5af27d2a0fccdeffffeaea3e5379f5fd Mon Sep 17 00:00:00 2001 From: miconis 
Date: Mon, 23 Mar 2020 11:43:49 +0100 Subject: [PATCH 09/17] structure of the workflows updated --- .../dedup/relations/oozie_app/workflow.xml | 27 ++++++++--------- .../dhp/dedup/roots/oozie_app/workflow.xml | 29 +++++++++---------- .../dhp/dedup/scan/oozie_app/workflow.xml | 10 +------ 3 files changed, 27 insertions(+), 39 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml index 749af6ecbe..5be4810574 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml @@ -26,22 +26,17 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - + + + + ${jobTracker} ${nameNode} yarn-cluster @@ -49,11 +44,13 @@ Update Relations eu.dnetlib.dhp.dedup.SparkPropagateRelation dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" -mtyarn-cluster --i${graphBasePath} diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml index 457e62818f..3ef79e04ff 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml @@ -34,23 +34,17 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - + + + ${jobTracker} ${nameNode} yarn-cluster @@ -58,11 +52,13 @@ Create Merge Relations eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" -mtyarn-cluster --i${graphBasePath} @@ -76,6 +72,9 @@ + + + ${jobTracker} ${nameNode} yarn-cluster diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml index 
35ed281037..c4198a5c53 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -49,20 +49,12 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - From f7890a90dfd46e13902077fd4c01ebdc445b24d9 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 23 Mar 2020 17:13:30 +0100 Subject: [PATCH 10/17] implementation of the mechanism that checks the existance of a mergerel file --- .../dnetlib/dhp/dedup/SparkUpdateEntity.java | 102 ++++++++++-------- .../config-default.xml | 0 .../workflow.xml | 43 +++++++- .../dhp/dedup/roots/oozie_app/workflow.xml | 38 ++----- .../dhp/dedup/updateEntity_parameters.json | 12 --- .../dhp/dedup/SparkCreateDedupTest.java | 36 +++++-- 6 files changed, 137 insertions(+), 94 deletions(-) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{relations/oozie_app => consistency.oozie_app}/config-default.xml (100%) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{relations/oozie_app => consistency.oozie_app}/workflow.xml (56%) diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java index 0c9890b03e..dd079e4cdc 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -4,10 +4,10 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; @@ -18,13 +18,14 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; import scala.Tuple2; import java.io.IOException; public class SparkUpdateEntity { + final String IDJSONPATH = "$.id"; + public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json"))); parser.parseArgument(args); @@ -32,65 +33,82 @@ public class SparkUpdateEntity { new SparkUpdateEntity().run(parser); } - public void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + public boolean mergeRelExists(String basePath, String entity) throws IOException { + + boolean result = false; + + FileSystem fileSystem = FileSystem.get(new Configuration()); + + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); + + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) + if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + result = true; + } + + return result; + } + + public void run(ArgumentApplicationParser parser) throws IOException { final String 
graphBasePath = parser.get("graphBasePath"); final String workingPath = parser.get("workingPath"); final String dedupGraphPath = parser.get("dedupGraphPath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); try (SparkSession spark = getSparkSession(parser)) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - for (DedupConfig dedupConf : DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + //for each entity + for (OafEntityType entity: OafEntityType.values()) { - String subEntity = dedupConf.getWf().getSubEntityValue(); + JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString())); - final Dataset df = spark.read().load(DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = df - .where("relClass == 'merges'") - .select(df.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + if (mergeRelExists(workingPath, entity.toString())) { - final JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)); + final Dataset rel = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", entity.toString())).as(Encoders.bean(Relation.class)); - final JavaRDD dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity)); + final JavaPairRDD mergedIds = rel + .where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s), s)); + final JavaRDD dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString())); - Class mainClass; - switch (subEntity) { - case "publication": - mainClass = Publication.class; - break; - case "dataset": - mainClass = eu.dnetlib.dhp.schema.oaf.Dataset.class; - break; - case "datasource": - mainClass = Datasource.class; - break; - case "software": - mainClass = Software.class; - break; - case "organization": - mainClass = Organization.class; - break; - case "otherresearchproduct": - mainClass = OtherResearchProduct.class; - break; - default: - throw new IllegalArgumentException("Illegal type " + subEntity); + JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + + JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1()); + sourceEntity = map.union(dedupEntity); } - JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? 
updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1()); - map.union(dedupEntity).saveAsTextFile(dedupGraphPath + "/" + subEntity, GzipCodec.class); + sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class); + } } + } + public Class getOafClass(OafEntityType className) { + switch (className.toString()) { + case "publication": + return Publication.class; + case "dataset": + return eu.dnetlib.dhp.schema.oaf.Dataset.class; + case "datasource": + return Datasource.class; + case "software": + return Software.class; + case "organization": + return Organization.class; + case "otherresearchproduct": + return OtherResearchProduct.class; + case "project": + return Project.class; + default: + throw new IllegalArgumentException("Illegal type " + className); + } } private static String updateDeletedByInference(final String json, final Class clazz) { diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml similarity index 56% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml index 5be4810574..5728e6ad8d 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml @@ -1,12 +1,20 @@ - + graphBasePath the raw graph base path + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + workingPath - path for the working directory + path of the working directory dedupGraphPath @@ -26,12 +34,41 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Dedup Record + eu.dnetlib.dhp.dedup.SparkUpdateEntity + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --o${dedupGraphPath} + + + + + diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml index 3ef79e04ff..984e8ed48d 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml @@ -82,11 +82,13 @@ Create Dedup Record eu.dnetlib.dhp.dedup.SparkCreateDedupRecord dhp-dedup-${projectVersion}.jar - 
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" -mtyarn-cluster --i${graphBasePath} @@ -94,32 +96,6 @@ --la${isLookUpUrl} --asi${actionSetId} - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dhp.dedup.SparkUpdateEntity - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --i${graphBasePath} - --w${workingPath} - --la${isLookUpUrl} - --asi${actionSetId} - --o${dedupGraphPath} - diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json index 76aea05374..06b67f7322 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json @@ -18,18 +18,6 @@ "paramRequired": true }, { - "paramName": "la", - "paramLongName": "isLookUpUrl", - "paramDescriptions": "the url of the lookup service", - "paramRequired": true -}, -{ - "paramName": "asi", - "paramLongName": "actionSetId", - "paramDescriptions": "the id of the actionset (orchestrator)", - "paramRequired": true -}, - { "paramName": "o", "paramLongName": "dedupGraphPath", "paramDescription": "the path of the dedup graph", diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index 47e446e7a8..ebc1398671 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -4,6 +4,8 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; @@ -17,13 +19,14 @@ public class SparkCreateDedupTest { @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); +// configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); + configuration = ""; } @Test @Ignore public void createSimRelsTest2() throws Exception { - SparkCreateSimRels.main(new String[] { + 
SparkCreateSimRels.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -40,7 +43,7 @@ public class SparkCreateDedupTest { @Ignore public void createCCTest() throws Exception { - SparkCreateConnectedComponent.main(new String[] { + SparkCreateConnectedComponent.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -52,7 +55,7 @@ public class SparkCreateDedupTest { @Test @Ignore public void dedupRecordTest() throws Exception { - SparkCreateDedupRecord.main(new String[] { + SparkCreateDedupRecord.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -62,21 +65,42 @@ public class SparkCreateDedupTest { } @Test + @Ignore public void printConfiguration() throws Exception { System.out.println(ArgumentApplicationParser.compressArgument(configuration)); } @Test + @Ignore public void testHashCode() { final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f"; final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46"; final HashFunction hashFunction = Hashing.murmur3_128(); - System.out.println( s1.hashCode()); + System.out.println(s1.hashCode()); System.out.println(hashFunction.hashString(s1).asLong()); - System.out.println( s2.hashCode()); + System.out.println(s2.hashCode()); System.out.println(hashFunction.hashString(s2).asLong()); } + @Test + public void fileExistsTest() throws IOException { + + boolean result = false; + + FileSystem fileSystem = FileSystem.get(new Configuration()); + + FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/tmp")); + + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) { + if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio")))) { + System.out.println("fs = " + DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio")); + result = true; + } + } + } + + } } From 93e22912910ad0f5a8dd33508fcb1e0fffb1f5bb Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 23 Mar 2020 17:17:56 +0100 Subject: [PATCH 11/17] minor changes --- .../eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml index 5728e6ad8d..e14fa7c55e 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + graphBasePath From aaedbb1b8b916f7fdbfa3303a44ad465a66832f8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 24 Mar 2020 09:59:28 +0100 Subject: [PATCH 12/17] WIP: dedup workflow, stage 2 --- .../dhp/dedup/roots/oozie_app/workflow.xml | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml index 984e8ed48d..49b3969953 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml @@ -34,6 +34,21 @@ + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + 
oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + @@ -45,8 +60,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster Create Merge Relations @@ -75,8 +88,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster Create Dedup Record From f0d72b76a809a848dee850e61d4bdb2ca2777efb Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 24 Mar 2020 10:51:40 +0100 Subject: [PATCH 13/17] package structure fixed --- .../oozie_app}/config-default.xml | 0 .../oozie_app}/workflow.xml | 0 .../dnetlib/dhp/dedup/jpath/JsonPathTest.java | 288 +++++++++++++++++- 3 files changed, 276 insertions(+), 12 deletions(-) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{consistency.oozie_app => consistency/oozie_app}/config-default.xml (100%) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{consistency.oozie_app => consistency/oozie_app}/workflow.xml (100%) diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/config-default.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml similarity index 100% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency.oozie_app/workflow.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java index 8a88896fc1..b3e8d5656e 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java @@ -3,6 +3,9 @@ package eu.dnetlib.dhp.dedup.jpath; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; import org.apache.commons.io.IOUtils; import org.junit.Test; import java.util.List; @@ -10,22 +13,283 @@ import java.util.Map; public class JsonPathTest { + String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 
2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",
\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = DedupConfig.load("{\n" + + " \"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ 
\"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " 
\"windowSize\": \"4\"\n" + + " }\n" + + " },\n" + + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": 
[\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": 
[\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": 
[\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": 
[\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": 
[\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": 
[\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", 
\"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); + @Test public void testJPath () throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/sample.json")); - List> pid = JsonPath.read(json, "$.pid[*]"); -// System.out.println(json); - - pid.forEach(it -> { - try { - System.out.println(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { - e.printStackTrace(); - } - }); - + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + System.out.println("d = " + d); } } From 8e8b5e8f30a86b16f90209af94e0dff770fe0fac Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 24 Mar 2020 17:40:58 +0100 Subject: [PATCH 14/17] roots wf merged in scan wf --- .../dnetlib/dhp/dedup/SparkCreateSimRels.java | 17 +-- .../dnetlib/dhp/dedup/SparkUpdateEntity.java | 4 +- .../dedup/consistency/oozie_app/workflow.xml | 29 +++-- .../dhp/dedup/createCC_parameters.json | 6 - .../dhp/dedup/createSimRels_parameters.json | 2 +- .../dedup/roots/oozie_app/config-default.xml | 18 --- .../dhp/dedup/roots/oozie_app/workflow.xml | 115 ------------------ .../dhp/dedup/scan/oozie_app/workflow.xml | 63 +++++++++- .../dhp/dedup/SparkCreateDedupTest.java | 11 +- 9 files changed, 93 insertions(+), 172 deletions(-) delete mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java index 18d0d4ee6d..8c0efdcad7 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -42,14 +42,14 @@ public class SparkCreateSimRels implements Serializable { //read oozie parameters final String graphBasePath = parser.get("graphBasePath"); - final String rawSet = parser.get("rawSet"); final String isLookUpUrl = parser.get("isLookUpUrl"); + final String rawSet = parser.get("rawSet"); final String actionSetId = parser.get("actionSetId"); final String workingPath = parser.get("workingPath"); System.out.println(String.format("graphBasePath: '%s'", graphBasePath)); - System.out.println(String.format("rawSet: '%s'", rawSet)); System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl)); + System.out.println(String.format("rawSet: '%s'", rawSet)); System.out.println(String.format("actionSetId: '%s'", actionSetId)); System.out.println(String.format("workingPath: '%s'", workingPath)); @@ -84,14 +84,17 @@ public class SparkCreateSimRels implements Serializable { .mode("overwrite") .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); - //create atomic actions - JavaRDD> newSimRels = relationsRDD - .map(this::createSequenceFileRow); + if (rawSet != null) { + //create atomic actions + JavaRDD> newSimRels = relationsRDD + .map(this::createSequenceFileRow); - simRel = simRel.union(newSimRels); + simRel = simRel.union(newSimRels); + } } - simRel.mapToPair(r -> r) + if (rawSet != null) + simRel.mapToPair(r -> r) .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java 
b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java index dd079e4cdc..b8b41d217b 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -21,8 +21,9 @@ import org.apache.spark.sql.SparkSession; import scala.Tuple2; import java.io.IOException; +import java.io.Serializable; -public class SparkUpdateEntity { +public class SparkUpdateEntity implements Serializable { final String IDJSONPATH = "$.id"; @@ -82,6 +83,7 @@ public class SparkUpdateEntity { JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1()); sourceEntity = map.union(dedupEntity); + } sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class); diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml index e14fa7c55e..4386b2ea1f 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml @@ -4,14 +4,6 @@ graphBasePath the raw graph base path - - isLookUpUrl - the address of the lookUp service - - - actionSetId - id of the actionSet - workingPath path of the working directory @@ -34,6 +26,21 @@ + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + @@ -45,11 +52,9 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster - Create Dedup Record + Update Entity eu.dnetlib.dhp.dedup.SparkUpdateEntity dhp-dedup-${projectVersion}.jar @@ -74,8 +79,6 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster Update Relations diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json index bcd2ff974c..42ef2b78e3 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json @@ -17,12 +17,6 @@ "paramDescription": "the base path of the raw graph", "paramRequired": true }, - { - "paramName": "o", - "paramLongName": "rawSet", - "paramDescription": "the raw set to be saved (full path)", - "paramRequired": true - }, { "paramName": "la", "paramLongName": "isLookUpUrl", diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json index b8c8af699a..9eb08a29b3 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json @@ -27,7 +27,7 @@ "paramName": "o", "paramLongName": "rawSet", "paramDescription": "the raw set to be saved (full path)", - "paramRequired": true + "paramRequired": false }, { "paramName": "w", diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml deleted 
file mode 100644 index 2e0ed9aeea..0000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml deleted file mode 100644 index 49b3969953..0000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml +++ /dev/null @@ -1,115 +0,0 @@ - - - - graphBasePath - the raw graph base path - - - isLookUpUrl - the address of the lookUp service - - - actionSetId - id of the actionSet - - - workingPath - path of the working directory - - - dedupGraphPath - path of the dedup graph - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - yarn-cluster - cluster - Create Merge Relations - eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent - dhp-dedup-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - --conf spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --i${graphBasePath} - --w${workingPath} - --la${isLookUpUrl} - --asi${actionSetId} - - - - - - - - - - - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dhp.dedup.SparkCreateDedupRecord - dhp-dedup-${projectVersion}.jar - - --executor-memory ${sparkExecutorMemory} - --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" - --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - --conf spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --i${graphBasePath} - --w${workingPath} - --la${isLookUpUrl} - --asi${actionSetId} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml index c4198a5c53..dc22632632 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + graphBasePath @@ -49,13 +49,13 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + @@ -75,11 +75,66 @@ -mtyarn --i${graphBasePath} - --o${rawSet} --la${isLookUpUrl} --asi${actionSetId} --w${workingPath} + + + + + + + + + + yarn-cluster + cluster + Create Merge Relations + eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + 
--executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + + + + + yarn-cluster + cluster + Create Dedup Record + eu.dnetlib.dhp.dedup.SparkCreateDedupRecord + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index ebc1398671..b1be5795e1 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -25,17 +25,14 @@ public class SparkCreateDedupTest { @Test @Ignore - public void createSimRelsTest2() throws Exception { + public void createSimRelsTest() throws Exception { SparkCreateSimRels.main(new String[]{ "-mt", "local[*]", - "-s", "/Users/miconis/dumps", - "-e", entity, - "-c", ArgumentApplicationParser.compressArgument(configuration), - "-rs", "/tmp/dedup/rawset_test", - "-ai", "agentId", - "-an", "agentName", + "-i", "/Users/miconis/dumps", + "-o", "/tmp/dedup/rawset_test", "-asi", "dedup-similarity-result-levenstein", "-la", "lookupurl", + "-w", "workingPath" }); } From 02320de37120b3c07965fffae91fa48c5dc2a868 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 24 Mar 2020 17:43:51 +0100 Subject: [PATCH 15/17] minor changes --- .../dhp/dedup/SparkCreateDedupTest.java | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index b1be5795e1..8f1e3b0ae8 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -80,24 +80,4 @@ public class SparkCreateDedupTest { System.out.println(s2.hashCode()); System.out.println(hashFunction.hashString(s2).asLong()); } - - @Test - public void fileExistsTest() throws IOException { - - boolean result = false; - - FileSystem fileSystem = FileSystem.get(new Configuration()); - - FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/tmp")); - - for (FileStatus fs : fileStatuses) { - if (fs.isDirectory()) { - if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio")))) { - System.out.println("fs = " + DedupUtility.createMergeRelPath("/tmp", fs.getPath().getName(), "cicciopasticcio")); - result = true; - } - } - } - - } } From efb0b7d6605a5e62963eecf0159ef374a2792629 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 25 Mar 2020 11:15:35 +0100 Subject: [PATCH 16/17] 
master set to 'yarn' in spark actions

---
 .../eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml | 4 ++--
 .../eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
index 4386b2ea1f..d3121ea77a 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
@@ -52,7 +52,7 @@
-            yarn-cluster
+            yarn
             cluster
             Update Entity
             eu.dnetlib.dhp.dedup.SparkUpdateEntity
             dhp-dedup-${projectVersion}.jar
@@ -79,7 +79,7 @@
-            yarn-cluster
+            yarn
             cluster
             Update Relations
             eu.dnetlib.dhp.dedup.SparkPropagateRelation
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
index dc22632632..abd1528573 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
@@ -88,7 +88,7 @@
-            yarn-cluster
+            yarn
             cluster
             Create Merge Relations
             eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent
@@ -116,7 +116,7 @@
-            yarn-cluster
+            yarn
             cluster
             Create Dedup Record
             eu.dnetlib.dhp.dedup.SparkCreateDedupRecord

From 36f8f2ea66412f909cd8983cc0718b0ab6e10641 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 25 Mar 2020 14:16:06 +0100
Subject: [PATCH 17/17] master set to 'yarn' in spark actions, removed path to rawSet from the dedup scan workflow

---
 .../dnetlib/dhp/dedup/SparkCreateSimRels.java | 24 +++++--------------
 .../dedup/consistency/oozie_app/workflow.xml  |  4 ++--
 .../dhp/dedup/createSimRels_parameters.json   |  6 -----
 .../dhp/dedup/scan/oozie_app/workflow.xml     |  9 ++-----
 4 files changed, 10 insertions(+), 33 deletions(-)

diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
index 8c0efdcad7..0fc72db1e1 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
@@ -43,22 +43,17 @@ public class SparkCreateSimRels implements Serializable {
 
         //read oozie parameters
         final String graphBasePath = parser.get("graphBasePath");
         final String isLookUpUrl = parser.get("isLookUpUrl");
-        final String rawSet = parser.get("rawSet");
         final String actionSetId = parser.get("actionSetId");
         final String workingPath = parser.get("workingPath");
 
         System.out.println(String.format("graphBasePath: '%s'", graphBasePath));
         System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl));
-        System.out.println(String.format("rawSet: '%s'", rawSet));
         System.out.println(String.format("actionSetId: '%s'", actionSetId));
         System.out.println(String.format("workingPath: '%s'", workingPath));
 
         try (SparkSession spark = getSparkSession(parser)) {
             final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 
-            //create empty sequenceFile for the accumulation
-            JavaRDD> simRel = sc.emptyRDD();
-
             //for each dedup configuration
             for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
                 final String entity =
dedupConf.getWf().getEntityType();
@@ -83,23 +78,16 @@ public class SparkCreateSimRels implements Serializable {
                         .write()
                         .mode("overwrite")
                         .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
-
-                if (rawSet != null) {
-                    //create atomic actions
-                    JavaRDD> newSimRels = relationsRDD
-                            .map(this::createSequenceFileRow);
-
-                    simRel = simRel.union(newSimRels);
-                }
             }
-
-            if (rawSet != null)
-                simRel.mapToPair(r -> r)
-                    .saveAsHadoopFile(rawSet, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
         }
-
     }
 
+    /**
+     * Utility method used to create an atomic action from a Relation object
+     * @param relation input relation
+     * @return A tuple2 with [id, json serialization of the atomic action]
+     * @throws JsonProcessingException
+     */
     public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException {
 
         ObjectMapper mapper = new ObjectMapper();
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
index d3121ea77a..d481a6cfb8 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml
@@ -65,7 +65,7 @@
                 --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                 --conf spark.sql.warehouse.dir="/user/hive/warehouse"
 
-            -mtyarn-cluster
+            -mtyarn
             --i${graphBasePath}
             --w${workingPath}
             --o${dedupGraphPath}
@@ -92,7 +92,7 @@
                 --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                 --conf spark.sql.warehouse.dir="/user/hive/warehouse"
 
-            -mtyarn-cluster
+            -mtyarn
             --i${graphBasePath}
             --o${dedupGraphPath}
             --w${workingPath}
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json
index 9eb08a29b3..8cffa86dc8 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json
@@ -23,12 +23,6 @@
     "paramDescription": "the base path of the raw graph",
     "paramRequired": true
   },
-  {
-    "paramName": "o",
-    "paramLongName": "rawSet",
-    "paramDescription": "the raw set to be saved (full path)",
-    "paramRequired": false
-  },
   {
     "paramName": "w",
     "paramLongName": "workingPath",
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
index abd1528573..e2c7f425b5 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
@@ -4,10 +4,6 @@
         graphBasePath
         the raw graph base path
-
-        rawSet
-        the output directory in the targetPath
-
         isLookUpUrl
         the address of the lookUp service
@@ -58,7 +54,6 @@
-
             yarn
@@ -101,7 +96,7 @@
                 --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                 --conf spark.sql.warehouse.dir="/user/hive/warehouse"
 
-            -mtyarn-cluster
+            -mtyarn
             --i${graphBasePath}
             --w${workingPath}
             --la${isLookUpUrl}
@@ -129,7 +124,7 @@
                 --conf
spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
                 --conf spark.sql.warehouse.dir="/user/hive/warehouse"
 
-            -mtyarn-cluster
+            -mtyarn
             --i${graphBasePath}
             --w${workingPath}
             --la${isLookUpUrl}
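
For reference, the similarity-scan step at the end of this series boils down to: read each entity type from the raw graph, wrap the records as MapDocuments, group them into sorted blocks, compare only inside a block, and persist the resulting isSimilarTo relations under the working path (the SequenceFile/rawSet output is gone). The sketch below is illustrative only, not the literal patched class: the Deduper, DedupUtility and MapDocumentUtil calls mirror the hunks above, while the generic types, the getSubEntityValue() accessor and the Relation setters are assumptions filled in for readability.

    package eu.dnetlib.dhp.dedup;

    import java.util.List;

    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;

    import eu.dnetlib.dhp.schema.oaf.Relation;
    import eu.dnetlib.pace.config.DedupConfig;
    import eu.dnetlib.pace.model.MapDocument;
    import eu.dnetlib.pace.util.MapDocumentUtil;
    import scala.Tuple2;

    public class SimRelScanSketch {

        // Sketch only: per dedup configuration, compute and persist the isSimilarTo relations.
        public void createSimRels(SparkSession spark, String graphBasePath, String isLookUpUrl,
                                  String actionSetId, String workingPath) throws Exception {

            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

            for (DedupConfig dedupConf : DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
                // sub-entity name taken from the dedup configuration (accessor name assumed)
                final String subEntity = dedupConf.getWf().getSubEntityValue();

                // wrap every record of this entity type as a MapDocument keyed by its identifier
                JavaPairRDD<String, MapDocument> documents = sc.textFile(graphBasePath + "/" + subEntity)
                        .mapToPair(json -> {
                            MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
                            return new Tuple2<>(d.getIdentifier(), d);
                        });

                // group documents into sorted blocks and compare only inside a block
                JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, documents, dedupConf);
                JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);

                // turn each (source, target) pair into an isSimilarTo Relation
                JavaRDD<Relation> relations = dedupRels.map(t -> {
                    Relation r = new Relation();
                    r.setSource(t._1());
                    r.setTarget(t._2());
                    r.setRelClass("isSimilarTo");
                    return r;
                });

                // persist the simrels for the later "Create Merge Relations" step
                spark.createDataset(relations.rdd(), Encoders.bean(Relation.class))
                        .write()
                        .mode("overwrite")
                        .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
            }
        }
    }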