From f32eae5ce9e130c0d1ddd893696d20a773b4f33f Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 18 Mar 2020 14:27:49 +0100 Subject: [PATCH] implementation of the spark action for the simrel creation --- dhp-workflows/dhp-dedup/pom.xml | 6 +- .../eu/dnetlib/dedup/SparkCreateSimRels2.java | 142 +++++++ .../dnetlib/dhp/dedup/dedup_parameters.json | 27 +- .../dhp/dedup/oozie_app/DuplicateScanWf.xml | 88 ++++ .../dnetlib/dedup/SparkCreateDedupTest.java | 26 +- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 14 +- .../eu/dnetlib/dedup/conf/pub.curr.conf.json | 168 ++++++-- .../dnetlib/dedup/conf/pub_dt.curr.conf.json | 386 ------------------ pom.xml | 16 + 9 files changed, 426 insertions(+), 447 deletions(-) create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml delete mode 100644 dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 0721af25d..cc27952fa 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -82,8 +82,10 @@ com.fasterxml.jackson.core jackson-core - - + + eu.dnetlib + dnet-actionmanager-common + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java new file mode 100644 index 000000000..3fa7be3f7 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java @@ -0,0 +1,142 @@ +package eu.dnetlib.dedup; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; +import eu.dnetlib.actionmanager.actions.AtomicAction; +import eu.dnetlib.actionmanager.common.Agent; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +public class SparkCreateSimRels2 implements Serializable { + + final static String CONF_SEPARATOR = "@@@"; + + private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class); + + public static List decompressConfs(String compressedConfs){ + + return Arrays.stream(compressedConfs.split(CONF_SEPARATOR)) + .map(ArgumentApplicationParser::decompressValue) + .map(DedupConfig::load) + .collect(Collectors.toList()); + } + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); + + parser.parseArgument(args); + + new SparkCreateSimRels2().run(parser, decompressConfs(parser.get("dedupConf"))); + } + + private void run(ArgumentApplicationParser parser, List dedupConfs) { + + //read oozie parameters + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String rawSetName = parser.get("rawSet"); + final String agentId = parser.get("agentId"); + final String agentName = parser.get("agentName"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //create empty sequenceFile for the accumulation + JavaRDD> simRel = sc.emptyRDD(); + + //for each dedup configuration + for (DedupConfig dedupConf: dedupConfs) { + final String entity = dedupConf.getWf().getEntityType(); + + JavaPairRDD mapDocument = sc.textFile(sourcePath + "/" + entity) + .mapToPair(s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + //create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + + //create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2())); + + //create atomic actions + JavaRDD> newSimRels = relationsRDD + .mapToPair(rel -> + new Tuple2<>( + createActionId(rel.getSource(), rel.getTarget(), entity), //TODO update the type, maybe take it from the configuration? + new AtomicAction(rawSetName, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), new ObjectMapper().writeValueAsString(rel).getBytes()))) + .map(aa -> new Tuple2<>(aa._1(), transformAction(aa._2()))); + + simRel = simRel.union(newSimRels); + + } + + String targetDirectory = targetPath + "/" + rawSetName; + +// simRel.map(s -> s._1().toString()).saveAsTextFile(targetDirectory); + + simRel.mapToPair(r -> r) + .saveAsHadoopFile(targetDirectory, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + + } + + } + + public Text createActionId(String source, String target, String type) { + String id = source + "@" + type + "@" + target; + + return new Text(id); + } + + public Text transformAction(AtomicAction aa) throws JsonProcessingException { + + ObjectMapper mapper = new ObjectMapper(); + + return new Text(mapper.writeValueAsString(aa)); + } + + public Relation createSimRel(String source, String target){ + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass("isSimilarTo"); + return r; + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels2.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) +// .enableHiveSupport() + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json index 8ba8515d0..9bdddef8a 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json @@ -8,26 +8,43 @@ { "paramName": "s", "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", + "paramDescription": "the base path of the raw graph", "paramRequired": true }, { "paramName": "e", "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", + "paramDescription": "the type of entity to be deduped (directory in the sourcePath)", "paramRequired": true }, { "paramName": "c", "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, + "paramDescription": "list of dedup configuration to be used", "paramRequired": true }, { "paramName": "t", "paramLongName": "targetPath", - "paramDescription": "target path to save dedup result", + "paramDescription": "target base path to save dedup result (actions)", + "paramRequired": true + }, + { + "paramName": "rs", + "paramLongName": "rawSet", + "paramDescription": "the raw set to be saved (directory in the targetPath)", + "paramRequired": true + }, + { + "paramName": "ai", + "paramLongName": "agentId", + "paramDescription": "the agent identifier", + "paramRequired": true + }, + { + "paramName": "an", + "paramLongName": "agentName", + "paramDescription": "the agent name", "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml new file mode 100644 index 000000000..1dede2c70 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml @@ -0,0 +1,88 @@ + + + + sourcePath + the raw graph base path + + + entity + the entity that should be processed + + + dedupConf + the (list of) dedup Configuration(s) + + + targetPath + the output base path + + + rawSet + the output directory in the targetPath + + + agentId + the agent identifier + + + agentName + the agent name + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + Create Similarity Relations + eu.dnetlib.dedup.SparkCreateSimRels2 + dhp-dedup-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} --conf + spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf + spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf + spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn-cluster + --sourcePath${sourcePath} + --targetPath${targetPath} + --entity${entity} + --dedupConf${dedupConf} + --rawSet${rawSet} + --agentId${agentId} + --agentName${agentName} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index f93703e37..12bba7c1e 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -13,17 +13,20 @@ import org.junit.Test; import java.io.File; import java.io.IOException; +import java.util.HashSet; import java.util.List; +import java.util.Set; public class SparkCreateDedupTest { String configuration; - String entity = "organization"; + String configuration2; + String entity = "publication"; @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); - + configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org1.curr.conf.json")); + configuration2 = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org2.curr.conf.json")); } @Test @@ -38,6 +41,21 @@ public class SparkCreateDedupTest { }); } + @Test + @Ignore + public void createSimRelsTest2() throws Exception { + SparkCreateSimRels2.main(new String[] { + "-mt", "local[*]", + "-s", "/Users/miconis/dumps", + "-e", entity, + "-c", ArgumentApplicationParser.compressArgument(configuration) + "@@@" + ArgumentApplicationParser.compressArgument(configuration2), + "-t", "/tmp/dedup", + "-rs", "rawset_test", + "-ai", "agentId", + "-an", "agentName" + }); + } + @Test @Ignore public void createCCTest() throws Exception { @@ -79,8 +97,6 @@ public class SparkCreateDedupTest { System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); System.out.println( s2.hashCode()); System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); - } - } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 2d0905562..31b200c72 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -87,8 +87,8 @@ } } ], - "threshold": 0.7, - "aggregation": "W_MEAN", + "threshold": 0.1, + "aggregation": "AVG", "positive": "layer4", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -106,7 +106,7 @@ } } ], - "threshold": 0.9, + "threshold": 0.7, "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", @@ -129,7 +129,9 @@ "comparator": "jaroWinklerNormalizedName", "weight": 0.1, "countIfUndefined": "false", - "params": {} + "params": { + "windowSize": 4 + } } ], "threshold": 0.9, @@ -145,14 +147,14 @@ { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json index 3e861fb71..d471ccb89 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json @@ -1,42 +1,134 @@ { - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "subEntityType" : "resulttype", - "subEntityValue" : "publication", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "100", - "maxChildren" : "100", - "idPath": "$.id", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], - "includeChildren" : "true" + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "2000", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "200", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.id" }, - "pace" : { + "pace": { "clustering" : [ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], - "strictConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + } + ], + "threshold": 0.5, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "AND", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "levensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "AVG", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.pid[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.pid", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } ], - "conditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } - ], - "model" : [ - { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" }, - { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 }, - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 }, - { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" } - ], - "synonyms": {}, - "blacklists" : { - "title" : [ + "blacklists": { + "title": [ "^Inside Front Cover$", "(?i)^Poster presentations$", "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", @@ -48,7 +140,6 @@ "^Cartas? ao editor Letters? to the Editor$", "^Note from the Editor$", "^Anesthesia Abstract$", - "^Annual report$", "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", "(?i)^Graph and Table of Infectious Diseases?$", @@ -68,14 +159,12 @@ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", "^Gushi hakubutsugaku$", - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", "^Intestinal spirocha?etosis$", "^Treatment of Rodent Ulcer$", "(?i)^\\W*Cloud Computing\\W*$", "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", "^Free Communications, Poster Presentations: Session [A-F]$", - "^“The Historical Aspects? of Quackery\\.?”$", "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", @@ -96,10 +185,8 @@ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", "^Aus der AGMB$", - "^Znanstveno-stručni prilozi$", "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", @@ -136,7 +223,6 @@ "(?i)^RUBRIKA UREDNIKA$", "^A Matching Model of the Academic Publication Market$", "^Yōgaku kōyō$", - "^Internetový marketing$", "^Internet marketing$", "^Chūtō kokugo dokuhon$", @@ -169,21 +255,17 @@ "^Information System Assessment and Proposal for ICT Modification$", "^Stresové zatížení pracovníků ve vybrané profesi$", "^Stress load in a specific job$", - "^Sunday: Poster Sessions, Pt.*$", "^Monday: Poster Sessions, Pt.*$", "^Wednesday: Poster Sessions, Pt.*", "^Tuesday: Poster Sessions, Pt.*$", - "^Analýza reklamy$", "^Analysis of advertising$", - "^Shōgaku shūshinsho$", "^Shōgaku sansū$", "^Shintei joshi kokubun$", "^Taishō joshi kokubun dokuhon$", "^Joshi kokubun$", - "^Účetní uzávěrka a účetní závěrka v ČR$", "(?i)^The \"?Causes\"? of Cancer$", "^Normas para la publicación de artículos$", @@ -202,7 +284,6 @@ "^Abdominal [Aa]ortic [Aa]neurysms.*$", "^Pseudomyxoma peritonei$", "^Kazalo autora$", - "(?i)^uvodna riječ$", "^Motivace jako způsob vedení lidí$", "^Motivation as a leadership$", @@ -275,6 +356,7 @@ "(?i)^.*authors['’′]? reply\\.?$", "(?i)^.*authors['’′]? response\\.?$" ] - } + }, + "synonyms": {} } } \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json deleted file mode 100644 index 6ca0ecd53..000000000 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json +++ /dev/null @@ -1,386 +0,0 @@ -{ - "wf": { - "threshold": "0.99", - "dedupRun": "001", - "entityType": "result", - "subEntityType": "resulttype", - "subEntityValue": "publication", - "orderField": "title", - "queueMaxSize": "2000", - "groupMaxSize": "100", - "maxChildren": "100", - "slidingWindowSize": "200", - "rootBuilder": [ - "result", - "resultProject_outcome_isProducedBy", - "resultResult_publicationDataset_isRelatedTo", - "resultResult_similarity_isAmongTopNSimilarDocuments", - "resultResult_similarity_hasAmongTopNSimilarDocuments", - "resultOrganization_affiliation_isAffiliatedWith", - "resultResult_part_hasPart", - "resultResult_part_isPartOf", - "resultResult_supplement_isSupplementTo", - "resultResult_supplement_isSupplementedBy", - "resultResult_version_isVersionOf" - ], - "includeChildren": "true", - "maxIterations": 20, - "idPath": "$.id" - }, - "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - }, - { - "name": "lowercase", - "fields": [ - "doi" - ], - "params": {} - } - ], - "decisionTree": { - "start": { - "fields": [ - { - "field": "pid", - "comparator": "jsonListMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": { - "jpath_value": "$.value", - "jpath_classid": "$.qualifier.classid" - } - } - ], - "threshold": 0.5, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "layer2", - "undefined": "layer2", - "ignoreUndefined": "true" - }, - "layer2": { - "fields": [ - { - "field": "title", - "comparator": "titleVersionMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - }, - { - "field": "authors", - "comparator": "sizeMatch", - "weight": 1.0, - "countIfUndefined": "false", - "params": {} - } - ], - "threshold": 1.0, - "aggregation": "AND", - "positive": "layer3", - "negative": "NO_MATCH", - "undefined": "layer3", - "ignoreUndefined": "false" - }, - "layer3": { - "fields": [ - { - "field": "title", - "comparator": "levensteinTitle", - "weight": 1.0, - "countIfUndefined": "true", - "params": {} - } - ], - "threshold": 0.99, - "aggregation": "AVG", - "positive": "MATCH", - "negative": "NO_MATCH", - "undefined": "NO_MATCH", - "ignoreUndefined": "true" - } - }, - "model": [ - { - "name": "doi", - "type": "String", - "path": "$.pid[?(@.qualifier.classid == 'doi')].value" - }, - { - "name": "pid", - "type": "JSON", - "path": "$.pid", - "overrideMatch": "true" - }, - { - "name": "title", - "type": "String", - "path": "$.title[?(@.qualifier.classid == 'main title')].value", - "length": 250, - "size": 5 - }, - { - "name": "authors", - "type": "List", - "path": "$.author[*].fullname", - "size": 200 - }, - { - "name": "resulttype", - "type": "String", - "path": "$.resulttype.classid" - } - ], - "blacklists": { - "title": [ - "^Inside Front Cover$", - "(?i)^Poster presentations$", - "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", - "^Problems with perinatal pathology\\.?$", - "(?i)^Cases? of Puerperal Convulsions$", - "(?i)^Operative Gyna?ecology$", - "(?i)^Mind the gap\\!?\\:?$", - "^Chronic fatigue syndrome\\.?$", - "^Cartas? ao editor Letters? to the Editor$", - "^Note from the Editor$", - "^Anesthesia Abstract$", - "^Annual report$", - "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", - "(?i)^Graph and Table of Infectious Diseases?$", - "^Presentation$", - "(?i)^Reviews and Information on Publications$", - "(?i)^PUBLIC HEALTH SERVICES?$", - "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", - "(?i)^Adrese autora$", - "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", - "(?i)^Acknowledgement to Referees$", - "(?i)^Behçet's disease\\.?$", - "(?i)^Isolation and identification of restriction endonuclease.*$", - "(?i)^CEREBROVASCULAR DISEASES?.?$", - "(?i)^Screening for abdominal aortic aneurysms?\\.?$", - "^Event management$", - "(?i)^Breakfast and Crohn's disease.*\\.?$", - "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", - "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", - "^Gushi hakubutsugaku$", - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", - "^Intestinal spirocha?etosis$", - "^Treatment of Rodent Ulcer$", - "(?i)^\\W*Cloud Computing\\W*$", - "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", - "^Free Communications, Poster Presentations: Session [A-F]$", - "^“The Historical Aspects? of Quackery\\.?”$", - "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", - "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", - "(?i)^Case Report$", - "^Boletín Informativo$", - "(?i)^Glioblastoma Multiforme$", - "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", - "^Zaměstnanecké výhody$", - "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", - "(?i)^Carotid body tumours?\\.?$", - "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", - "^Avant-propos$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", - "(?i)^PUBLIC HEALTH VERSUS THE STATE$", - "^Viñetas de Cortázar$", - "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", - "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", - "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", - "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", - "^Aus der AGMB$", - "^Znanstveno-stručni prilozi$", - "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", - "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", - "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", - "^Finanční analýza podniku$", - "^Financial analysis( of business)?$", - "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", - "^Jikken nihon shūshinsho$", - "(?i)^CORONER('|s)(s|') INQUESTS$", - "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", - "(?i)^Consultants' contract(s)?$", - "(?i)^Upute autorima$", - "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", - "^Joshi shin kokubun$", - "^Kōtō shōgaku dokuhon nōson'yō$", - "^Jinjō shōgaku shōka$", - "^Shōgaku shūjichō$", - "^Nihon joshi dokuhon$", - "^Joshi shin dokuhon$", - "^Chūtō kanbun dokuhon$", - "^Wabun dokuhon$", - "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", - "(?i)^cardiac rehabilitation$", - "(?i)^Analytical summary$", - "^Thesaurus resolutionum Sacrae Congregationis Concilii$", - "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", - "^Prikazi i osvrti$", - "^Rodinný dům s provozovnou$", - "^Family house with an establishment$", - "^Shinsei chūtō shin kokugun$", - "^Pulmonary alveolar proteinosis(\\.?)$", - "^Shinshū kanbun$", - "^Viñeta(s?) de Rodríguez$", - "(?i)^RUBRIKA UREDNIKA$", - "^A Matching Model of the Academic Publication Market$", - "^Yōgaku kōyō$", - "^Internetový marketing$", - "^Internet marketing$", - "^Chūtō kokugo dokuhon$", - "^Kokugo dokuhon$", - "^Antibiotic Cover for Dental Extraction(s?)$", - "^Strategie podniku$", - "^Strategy of an Enterprise$", - "(?i)^respiratory disease(s?)(\\.?)$", - "^Award(s?) for Gallantry in Civil Defence$", - "^Podniková kultura$", - "^Corporate Culture$", - "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", - "^Pracovní motivace$", - "^Work Motivation$", - "^Kaitei kōtō jogaku dokuhon$", - "^Konsolidovaná účetní závěrka$", - "^Consolidated Financial Statements$", - "(?i)^intracranial tumour(s?)$", - "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", - "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", - "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", - "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", - "^Úroveň motivačního procesu jako způsobu vedení lidí$", - "^The level of motivation process as a leadership$", - "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", - "(?i)^news and events$", - "(?i)^NOVOSTI I DOGAĐAJI$", - "^Sansū no gakushū$", - "^Posouzení informačního systému firmy a návrh změn$", - "^Information System Assessment and Proposal for ICT Modification$", - "^Stresové zatížení pracovníků ve vybrané profesi$", - "^Stress load in a specific job$", - "^Sunday: Poster Sessions, Pt.*$", - "^Monday: Poster Sessions, Pt.*$", - "^Wednesday: Poster Sessions, Pt.*", - "^Tuesday: Poster Sessions, Pt.*$", - "^Analýza reklamy$", - "^Analysis of advertising$", - "^Shōgaku shūshinsho$", - "^Shōgaku sansū$", - "^Shintei joshi kokubun$", - "^Taishō joshi kokubun dokuhon$", - "^Joshi kokubun$", - "^Účetní uzávěrka a účetní závěrka v ČR$", - "(?i)^The \"?Causes\"? of Cancer$", - "^Normas para la publicación de artículos$", - "^Editor('|s)(s|') [Rr]eply$", - "^Editor(’|s)(s|’) letter$", - "^Redaktoriaus žodis$", - "^DISCUSSION ON THE PRECEDING PAPER$", - "^Kōtō shōgaku shūshinsho jidōyō$", - "^Shōgaku nihon rekishi$", - "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", - "^Préface$", - "^Occupational [Hh]ealth [Ss]ervices.$", - "^In Memoriam Professor Toshiyuki TAKESHIMA$", - "^Účetní závěrka ve vybraném podniku.*$", - "^Financial statements in selected company$", - "^Abdominal [Aa]ortic [Aa]neurysms.*$", - "^Pseudomyxoma peritonei$", - "^Kazalo autora$", - "(?i)^uvodna riječ$", - "^Motivace jako způsob vedení lidí$", - "^Motivation as a leadership$", - "^Polyfunkční dům$", - "^Multi\\-funkcional building$", - "^Podnikatelský plán$", - "(?i)^Podnikatelský záměr$", - "(?i)^Business Plan$", - "^Oceňování nemovitostí$", - "^Marketingová komunikace$", - "^Marketing communication$", - "^Sumario Analítico$", - "^Riječ uredništva$", - "^Savjetovanja i priredbe$", - "^Índice$", - "^(Starobosanski nadpisi).*$", - "^Vzdělávání pracovníků v organizaci$", - "^Staff training in organization$", - "^(Life Histories of North American Geometridae).*$", - "^Strategická analýza podniku$", - "^Strategic Analysis of an Enterprise$", - "^Sadržaj$", - "^Upute suradnicima$", - "^Rodinný dům$", - "(?i)^Fami(l)?ly house$", - "^Upute autorima$", - "^Strategic Analysis$", - "^Finanční analýza vybraného podniku$", - "^Finanční analýza$", - "^Riječ urednika$", - "(?i)^Content(s?)$", - "(?i)^Inhalt$", - "^Jinjō shōgaku shūshinsho jidōyō$", - "(?i)^Index$", - "^Chūgaku kokubun kyōkasho$", - "^Retrato de una mujer$", - "^Retrato de un hombre$", - "^Kōtō shōgaku dokuhon$", - "^Shotōka kokugo$", - "^Shōgaku dokuhon$", - "^Jinjō shōgaku kokugo dokuhon$", - "^Shinsei kokugo dokuhon$", - "^Teikoku dokuhon$", - "^Instructions to Authors$", - "^KİTAP TAHLİLİ$", - "^PRZEGLĄD PIŚMIENNICTWA$", - "(?i)^Presentación$", - "^İçindekiler$", - "(?i)^Tabl?e of contents$", - "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", - "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", - "^Editorial( Board)?$", - "(?i)^Editorial \\(English\\)$", - "^Editörden$", - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*", - "(?i)^.*authors['’′]? reply\\.?$", - "(?i)^.*authors['’′]? response\\.?$" - ] - }, - "synonyms": {} - } -} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 0310a3f44..fe158d9fc 100644 --- a/pom.xml +++ b/pom.xml @@ -345,6 +345,22 @@ + + + eu.dnetlib + dnet-actionmanager-common + [6.0.0,7.0.0) + + + commons-httpclient + commons-httpclient + + + eu.dnetlib + dnet-openaireplus-mapping-utils + + +