diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml
index 0721af25d..cc27952fa 100644
--- a/dhp-workflows/dhp-dedup/pom.xml
+++ b/dhp-workflows/dhp-dedup/pom.xml
@@ -82,8 +82,10 @@
com.fasterxml.jackson.core
jackson-core
-
-
+
+ eu.dnetlib
+ dnet-actionmanager-common
+
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java
new file mode 100644
index 000000000..3fa7be3f7
--- /dev/null
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels2.java
@@ -0,0 +1,142 @@
+package eu.dnetlib.dedup;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+import eu.dnetlib.actionmanager.actions.AtomicAction;
+import eu.dnetlib.actionmanager.common.Agent;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Spark job that scans the raw graph for duplicates: for each dedup configuration it
+ * compares the entities of the configured type block by block and stores the resulting
+ * "isSimilarTo" relations as atomic actions in a gzip-compressed sequence file.
+ */
+public class SparkCreateSimRels2 implements Serializable {
+
+    final static String CONF_SEPARATOR = "@@@";
+
+    private static final Log log = LogFactory.getLog(SparkCreateSimRels2.class);
+
+    // Shared mapper: static, so it is built once per JVM instead of once per record.
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /**
+     * Splits the argument on {@link #CONF_SEPARATOR}, decompresses each chunk and loads it.
+     *
+     * @param compressedConfs compressed dedup configurations joined by the separator
+     * @return one {@link DedupConfig} per chunk, in the order they were given
+     */
+    public static List<DedupConfig> decompressConfs(String compressedConfs) {
+        return Arrays.stream(compressedConfs.split(CONF_SEPARATOR))
+                .map(ArgumentApplicationParser::decompressValue)
+                .map(DedupConfig::load)
+                .collect(Collectors.toList());
+    }
+
+    /** Parses the command line described by dedup_parameters.json and runs the job. */
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(
+                SparkCreateSimRels2.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"), "UTF-8"));
+        parser.parseArgument(args);
+        new SparkCreateSimRels2().run(parser, decompressConfs(parser.get("dedupConf")));
+    }
+
+    /**
+     * Computes the similarity relations of every configuration and saves them, all together,
+     * under {@code targetPath/rawSet} as (action id, serialized atomic action) pairs.
+     */
+    private void run(ArgumentApplicationParser parser, List<DedupConfig> dedupConfs) {
+        //read oozie parameters
+        final String sourcePath = parser.get("sourcePath");
+        final String targetPath = parser.get("targetPath");
+        final String rawSetName = parser.get("rawSet");
+        final String agentId = parser.get("agentId");
+        final String agentName = parser.get("agentName");
+
+        try (SparkSession spark = getSparkSession(parser)) {
+            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+            //accumulator for the actions produced by each configuration
+            JavaRDD<Tuple2<Text, Text>> simRel = sc.emptyRDD();
+
+            for (DedupConfig dedupConf : dedupConfs) {
+                final String entity = dedupConf.getWf().getEntityType();
+
+                JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(sourcePath + "/" + entity)
+                        .mapToPair(s -> {
+                            MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
+                            return new Tuple2<>(d.getIdentifier(), d);
+                        });
+
+                //create blocks for deduplication and compare only the elements of the same block
+                final JavaPairRDD<String, String> dedupRels =
+                        Deduper.computeRelations2(sc, Deduper.createsortedBlocks(sc, mapDocument, dedupConf), dedupConf);
+
+                JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2()));
+
+                //create atomic actions; writeValueAsBytes emits UTF-8 whatever the platform charset is
+                JavaRDD<Tuple2<Text, Text>> newSimRels = relationsRDD
+                        .mapToPair(rel ->
+                                new Tuple2<>(
+                                        createActionId(rel.getSource(), rel.getTarget(), entity), //TODO update the type, maybe take it from the configuration?
+                                        new AtomicAction(rawSetName, new Agent(agentId, agentName, Agent.AGENT_TYPE.service), rel.getSource(), "isSimilarTo", rel.getTarget(), OBJECT_MAPPER.writeValueAsBytes(rel))))
+                        .map(aa -> new Tuple2<>(aa._1(), transformAction(aa._2())));
+
+                simRel = simRel.union(newSimRels);
+            }
+
+            simRel.mapToPair(r -> r)
+                    .saveAsHadoopFile(targetPath + "/" + rawSetName, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+        }
+    }
+
+    /** Builds the action key as {@code source@type@target}. */
+    public Text createActionId(String source, String target, String type) {
+        return new Text(source + "@" + type + "@" + target);
+    }
+
+    /** Serializes the atomic action to its JSON representation. */
+    public Text transformAction(AtomicAction aa) throws JsonProcessingException {
+        return new Text(OBJECT_MAPPER.writeValueAsString(aa));
+    }
+
+    /** Creates an "isSimilarTo" relation going from {@code source} to {@code target}. */
+    public Relation createSimRel(String source, String target) {
+        final Relation r = new Relation();
+        r.setSource(source);
+        r.setTarget(target);
+        r.setRelClass("isSimilarTo");
+        return r;
+    }
+
+    /** Builds (or reuses) the session, taking the Spark master from the "master" argument. */
+    private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
+        return SparkSession.builder()
+                .appName(SparkCreateSimRels2.class.getSimpleName())
+                .master(parser.get("master"))
+                .config(new SparkConf())
+                .getOrCreate();
+    }
+}
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
index 8ba8515d0..9bdddef8a 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json
@@ -8,26 +8,43 @@
{
"paramName": "s",
"paramLongName": "sourcePath",
- "paramDescription": "the path of the sequential file to read",
+ "paramDescription": "the base path of the raw graph",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
- "paramDescription": "the type of entity to be deduped",
+ "paramDescription": "the type of entity to be deduped (directory in the sourcePath)",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "dedupConf",
- "paramDescription": "dedup configuration to be used",
- "compressed": true,
+ "paramDescription": "list of compressed dedup configurations to be used, separated by '@@@'",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
- "paramDescription": "target path to save dedup result",
+ "paramDescription": "target base path to save dedup result (actions)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "rs",
+ "paramLongName": "rawSet",
+ "paramDescription": "the raw set to be saved (directory in the targetPath)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "ai",
+ "paramLongName": "agentId",
+ "paramDescription": "the agent identifier",
+ "paramRequired": true
+ },
+ {
+ "paramName": "an",
+ "paramLongName": "agentName",
+ "paramDescription": "the agent name",
"paramRequired": true
}
]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml
new file mode 100644
index 000000000..1dede2c70
--- /dev/null
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml
@@ -0,0 +1,88 @@
+
+
+
+ sourcePath
+ the raw graph base path
+
+
+ entity
+ the entity that should be processed
+
+
+ dedupConf
+ the (list of) dedup Configuration(s)
+
+
+ targetPath
+ the output base path
+
+
+ rawSet
+ the output directory in the targetPath
+
+
+ agentId
+ the agent identifier
+
+
+ agentName
+ the agent name
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ Create Similarity Relations
+ eu.dnetlib.dedup.SparkCreateSimRels2
+ dhp-dedup-${projectVersion}.jar
+ --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory} --conf
+ spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
+ spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
+ spark.sql.warehouse.dir="/user/hive/warehouse"
+
+ -mtyarn-cluster
+ --sourcePath${sourcePath}
+ --targetPath${targetPath}
+ --entity${entity}
+ --dedupConf${dedupConf}
+ --rawSet${rawSet}
+ --agentId${agentId}
+ --agentName${agentName}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
index f93703e37..12bba7c1e 100644
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
@@ -13,17 +13,20 @@ import org.junit.Test;
import java.io.File;
import java.io.IOException;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
public class SparkCreateDedupTest {
String configuration;
- String entity = "organization";
+ String configuration2;
+ String entity = "publication";
@Before
public void setUp() throws IOException {
- configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
-
+ configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org1.curr.conf.json"));
+ configuration2 = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org2.curr.conf.json"));
}
@Test
@@ -38,6 +41,21 @@ public class SparkCreateDedupTest {
});
}
+    /**
+     * Launches SparkCreateSimRels2 on a local master with two dedup configurations packed
+     * into a single "-c" argument. Kept ignored; the source path is a machine-local dump directory.
+     */
+    @Test
+    @Ignore
+    public void createSimRelsTest2() throws Exception {
+        final String confs = ArgumentApplicationParser.compressArgument(configuration) + "@@@" + ArgumentApplicationParser.compressArgument(configuration2);
+        final String[] cliArgs = {
+                "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, "-c", confs,
+                "-t", "/tmp/dedup", "-rs", "rawset_test", "-ai", "agentId", "-an", "agentName"
+        };
+        SparkCreateSimRels2.main(cliArgs);
+    }
+
@Test
@Ignore
public void createCCTest() throws Exception {
@@ -79,8 +97,6 @@ public class SparkCreateDedupTest {
System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
System.out.println( s2.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
-
}
-
}
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
index 2d0905562..31b200c72 100644
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
@@ -87,8 +87,8 @@
}
}
],
- "threshold": 0.7,
- "aggregation": "W_MEAN",
+ "threshold": 0.1,
+ "aggregation": "AVG",
"positive": "layer4",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@@ -106,7 +106,7 @@
}
}
],
- "threshold": 0.9,
+ "threshold": 0.7,
"aggregation": "AVG",
"positive": "layer5",
"negative": "NO_MATCH",
@@ -129,7 +129,9 @@
"comparator": "jaroWinklerNormalizedName",
"weight": 0.1,
"countIfUndefined": "false",
- "params": {}
+ "params": {
+ "windowSize": 4
+ }
}
],
"threshold": 0.9,
@@ -145,14 +147,14 @@
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
- { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
+ { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
- "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
+ "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
index 3e861fb71..d471ccb89 100644
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
@@ -1,42 +1,134 @@
{
- "wf" : {
- "threshold" : "0.99",
- "dedupRun" : "001",
- "entityType" : "result",
- "subEntityType" : "resulttype",
- "subEntityValue" : "publication",
- "orderField" : "title",
- "queueMaxSize" : "2000",
- "groupMaxSize" : "100",
- "maxChildren" : "100",
- "idPath": "$.id",
- "slidingWindowSize" : "200",
- "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
- "includeChildren" : "true"
+ "wf": {
+ "threshold": "0.99",
+ "dedupRun": "001",
+ "entityType": "result",
+ "subEntityType": "resulttype",
+ "subEntityValue": "publication",
+ "orderField": "title",
+ "queueMaxSize": "2000",
+ "groupMaxSize": "100",
+ "maxChildren": "100",
+ "slidingWindowSize": "200",
+ "rootBuilder": [
+ "result",
+ "resultProject_outcome_isProducedBy",
+ "resultResult_publicationDataset_isRelatedTo",
+ "resultResult_similarity_isAmongTopNSimilarDocuments",
+ "resultResult_similarity_hasAmongTopNSimilarDocuments",
+ "resultOrganization_affiliation_isAffiliatedWith",
+ "resultResult_part_hasPart",
+ "resultResult_part_isPartOf",
+ "resultResult_supplement_isSupplementTo",
+ "resultResult_supplement_isSupplementedBy",
+ "resultResult_version_isVersionOf"
+ ],
+ "includeChildren": "true",
+ "maxIterations": 20,
+ "idPath": "$.id"
},
- "pace" : {
+ "pace": {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
- "strictConditions" : [
- { "name" : "pidMatch", "fields" : [ "pid" ] }
+ "decisionTree": {
+ "start": {
+ "fields": [
+ {
+ "field": "pid",
+ "comparator": "jsonListMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "jpath_value": "$.value",
+ "jpath_classid": "$.qualifier.classid"
+ }
+ }
+ ],
+ "threshold": 0.5,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "layer2",
+ "undefined": "layer2",
+ "ignoreUndefined": "true"
+ },
+ "layer2": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "titleVersionMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ },
+ {
+ "field": "authors",
+ "comparator": "sizeMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ }
+ ],
+ "threshold": 1.0,
+ "aggregation": "AND",
+ "positive": "layer3",
+ "negative": "NO_MATCH",
+ "undefined": "layer3",
+ "ignoreUndefined": "false"
+ },
+ "layer3": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "levensteinTitle",
+ "weight": 1.0,
+ "countIfUndefined": "true",
+ "params": {}
+ }
+ ],
+ "threshold": 0.99,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "NO_MATCH",
+ "ignoreUndefined": "true"
+ }
+ },
+ "model": [
+ {
+ "name": "doi",
+ "type": "String",
+ "path": "$.pid[?(@.qualifier.classid == 'doi')].value"
+ },
+ {
+ "name": "pid",
+ "type": "JSON",
+ "path": "$.pid",
+ "overrideMatch": "true"
+ },
+ {
+ "name": "title",
+ "type": "String",
+ "path": "$.title[?(@.qualifier.classid == 'main title')].value",
+ "length": 250,
+ "size": 5
+ },
+ {
+ "name": "authors",
+ "type": "List",
+ "path": "$.author[*].fullname",
+ "size": 200
+ },
+ {
+ "name": "resulttype",
+ "type": "String",
+ "path": "$.resulttype.classid"
+ }
],
- "conditions" : [
- { "name" : "titleVersionMatch", "fields" : [ "title" ] },
- { "name" : "sizeMatch", "fields" : [ "authors" ] }
- ],
- "model" : [
- { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" },
- { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" },
- { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 },
- { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 },
- { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" }
- ],
- "synonyms": {},
- "blacklists" : {
- "title" : [
+ "blacklists": {
+ "title": [
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
@@ -48,7 +140,6 @@
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
-
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
@@ -68,14 +159,12 @@
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
-
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
-
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
@@ -96,10 +185,8 @@
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
-
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
-
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
@@ -136,7 +223,6 @@
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
-
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
@@ -169,21 +255,17 @@
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
-
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
-
"^Analýza reklamy$",
"^Analysis of advertising$",
-
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
-
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
@@ -202,7 +284,6 @@
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
-
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
@@ -275,6 +356,7 @@
"(?i)^.*authors['’′]? reply\\.?$",
"(?i)^.*authors['’′]? response\\.?$"
]
- }
+ },
+ "synonyms": {}
}
}
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json
deleted file mode 100644
index 6ca0ecd53..000000000
--- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json
+++ /dev/null
@@ -1,386 +0,0 @@
-{
- "wf": {
- "threshold": "0.99",
- "dedupRun": "001",
- "entityType": "result",
- "subEntityType": "resulttype",
- "subEntityValue": "publication",
- "orderField": "title",
- "queueMaxSize": "2000",
- "groupMaxSize": "100",
- "maxChildren": "100",
- "slidingWindowSize": "200",
- "rootBuilder": [
- "result",
- "resultProject_outcome_isProducedBy",
- "resultResult_publicationDataset_isRelatedTo",
- "resultResult_similarity_isAmongTopNSimilarDocuments",
- "resultResult_similarity_hasAmongTopNSimilarDocuments",
- "resultOrganization_affiliation_isAffiliatedWith",
- "resultResult_part_hasPart",
- "resultResult_part_isPartOf",
- "resultResult_supplement_isSupplementTo",
- "resultResult_supplement_isSupplementedBy",
- "resultResult_version_isVersionOf"
- ],
- "includeChildren": "true",
- "maxIterations": 20,
- "idPath": "$.id"
- },
- "pace": {
- "clustering": [
- {
- "name": "ngrampairs",
- "fields": [
- "title"
- ],
- "params": {
- "max": "1",
- "ngramLen": "3"
- }
- },
- {
- "name": "suffixprefix",
- "fields": [
- "title"
- ],
- "params": {
- "max": "1",
- "len": "3"
- }
- },
- {
- "name": "lowercase",
- "fields": [
- "doi"
- ],
- "params": {}
- }
- ],
- "decisionTree": {
- "start": {
- "fields": [
- {
- "field": "pid",
- "comparator": "jsonListMatch",
- "weight": 1.0,
- "countIfUndefined": "false",
- "params": {
- "jpath_value": "$.value",
- "jpath_classid": "$.qualifier.classid"
- }
- }
- ],
- "threshold": 0.5,
- "aggregation": "AVG",
- "positive": "MATCH",
- "negative": "layer2",
- "undefined": "layer2",
- "ignoreUndefined": "true"
- },
- "layer2": {
- "fields": [
- {
- "field": "title",
- "comparator": "titleVersionMatch",
- "weight": 1.0,
- "countIfUndefined": "false",
- "params": {}
- },
- {
- "field": "authors",
- "comparator": "sizeMatch",
- "weight": 1.0,
- "countIfUndefined": "false",
- "params": {}
- }
- ],
- "threshold": 1.0,
- "aggregation": "AND",
- "positive": "layer3",
- "negative": "NO_MATCH",
- "undefined": "layer3",
- "ignoreUndefined": "false"
- },
- "layer3": {
- "fields": [
- {
- "field": "title",
- "comparator": "levensteinTitle",
- "weight": 1.0,
- "countIfUndefined": "true",
- "params": {}
- }
- ],
- "threshold": 0.99,
- "aggregation": "AVG",
- "positive": "MATCH",
- "negative": "NO_MATCH",
- "undefined": "NO_MATCH",
- "ignoreUndefined": "true"
- }
- },
- "model": [
- {
- "name": "doi",
- "type": "String",
- "path": "$.pid[?(@.qualifier.classid == 'doi')].value"
- },
- {
- "name": "pid",
- "type": "JSON",
- "path": "$.pid",
- "overrideMatch": "true"
- },
- {
- "name": "title",
- "type": "String",
- "path": "$.title[?(@.qualifier.classid == 'main title')].value",
- "length": 250,
- "size": 5
- },
- {
- "name": "authors",
- "type": "List",
- "path": "$.author[*].fullname",
- "size": 200
- },
- {
- "name": "resulttype",
- "type": "String",
- "path": "$.resulttype.classid"
- }
- ],
- "blacklists": {
- "title": [
- "^Inside Front Cover$",
- "(?i)^Poster presentations$",
- "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
- "^Problems with perinatal pathology\\.?$",
- "(?i)^Cases? of Puerperal Convulsions$",
- "(?i)^Operative Gyna?ecology$",
- "(?i)^Mind the gap\\!?\\:?$",
- "^Chronic fatigue syndrome\\.?$",
- "^Cartas? ao editor Letters? to the Editor$",
- "^Note from the Editor$",
- "^Anesthesia Abstract$",
- "^Annual report$",
- "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
- "(?i)^Graph and Table of Infectious Diseases?$",
- "^Presentation$",
- "(?i)^Reviews and Information on Publications$",
- "(?i)^PUBLIC HEALTH SERVICES?$",
- "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
- "(?i)^Adrese autora$",
- "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
- "(?i)^Acknowledgement to Referees$",
- "(?i)^Behçet's disease\\.?$",
- "(?i)^Isolation and identification of restriction endonuclease.*$",
- "(?i)^CEREBROVASCULAR DISEASES?.?$",
- "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
- "^Event management$",
- "(?i)^Breakfast and Crohn's disease.*\\.?$",
- "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
- "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
- "^Gushi hakubutsugaku$",
- "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
- "^Intestinal spirocha?etosis$",
- "^Treatment of Rodent Ulcer$",
- "(?i)^\\W*Cloud Computing\\W*$",
- "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
- "^Free Communications, Poster Presentations: Session [A-F]$",
- "^“The Historical Aspects? of Quackery\\.?”$",
- "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
- "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
- "(?i)^Case Report$",
- "^Boletín Informativo$",
- "(?i)^Glioblastoma Multiforme$",
- "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
- "^Zaměstnanecké výhody$",
- "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
- "(?i)^Carotid body tumours?\\.?$",
- "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
- "^Avant-propos$",
- "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
- "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
- "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
- "^Viñetas de Cortázar$",
- "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
- "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
- "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
- "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
- "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
- "^Aus der AGMB$",
- "^Znanstveno-stručni prilozi$",
- "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
- "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
- "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
- "^Finanční analýza podniku$",
- "^Financial analysis( of business)?$",
- "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
- "^Jikken nihon shūshinsho$",
- "(?i)^CORONER('|s)(s|') INQUESTS$",
- "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
- "(?i)^Consultants' contract(s)?$",
- "(?i)^Upute autorima$",
- "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
- "^Joshi shin kokubun$",
- "^Kōtō shōgaku dokuhon nōson'yō$",
- "^Jinjō shōgaku shōka$",
- "^Shōgaku shūjichō$",
- "^Nihon joshi dokuhon$",
- "^Joshi shin dokuhon$",
- "^Chūtō kanbun dokuhon$",
- "^Wabun dokuhon$",
- "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
- "(?i)^cardiac rehabilitation$",
- "(?i)^Analytical summary$",
- "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
- "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
- "^Prikazi i osvrti$",
- "^Rodinný dům s provozovnou$",
- "^Family house with an establishment$",
- "^Shinsei chūtō shin kokugun$",
- "^Pulmonary alveolar proteinosis(\\.?)$",
- "^Shinshū kanbun$",
- "^Viñeta(s?) de Rodríguez$",
- "(?i)^RUBRIKA UREDNIKA$",
- "^A Matching Model of the Academic Publication Market$",
- "^Yōgaku kōyō$",
- "^Internetový marketing$",
- "^Internet marketing$",
- "^Chūtō kokugo dokuhon$",
- "^Kokugo dokuhon$",
- "^Antibiotic Cover for Dental Extraction(s?)$",
- "^Strategie podniku$",
- "^Strategy of an Enterprise$",
- "(?i)^respiratory disease(s?)(\\.?)$",
- "^Award(s?) for Gallantry in Civil Defence$",
- "^Podniková kultura$",
- "^Corporate Culture$",
- "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
- "^Pracovní motivace$",
- "^Work Motivation$",
- "^Kaitei kōtō jogaku dokuhon$",
- "^Konsolidovaná účetní závěrka$",
- "^Consolidated Financial Statements$",
- "(?i)^intracranial tumour(s?)$",
- "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
- "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
- "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
- "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
- "^Úroveň motivačního procesu jako způsobu vedení lidí$",
- "^The level of motivation process as a leadership$",
- "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
- "(?i)^news and events$",
- "(?i)^NOVOSTI I DOGAĐAJI$",
- "^Sansū no gakushū$",
- "^Posouzení informačního systému firmy a návrh změn$",
- "^Information System Assessment and Proposal for ICT Modification$",
- "^Stresové zatížení pracovníků ve vybrané profesi$",
- "^Stress load in a specific job$",
- "^Sunday: Poster Sessions, Pt.*$",
- "^Monday: Poster Sessions, Pt.*$",
- "^Wednesday: Poster Sessions, Pt.*",
- "^Tuesday: Poster Sessions, Pt.*$",
- "^Analýza reklamy$",
- "^Analysis of advertising$",
- "^Shōgaku shūshinsho$",
- "^Shōgaku sansū$",
- "^Shintei joshi kokubun$",
- "^Taishō joshi kokubun dokuhon$",
- "^Joshi kokubun$",
- "^Účetní uzávěrka a účetní závěrka v ČR$",
- "(?i)^The \"?Causes\"? of Cancer$",
- "^Normas para la publicación de artículos$",
- "^Editor('|s)(s|') [Rr]eply$",
- "^Editor(’|s)(s|’) letter$",
- "^Redaktoriaus žodis$",
- "^DISCUSSION ON THE PRECEDING PAPER$",
- "^Kōtō shōgaku shūshinsho jidōyō$",
- "^Shōgaku nihon rekishi$",
- "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
- "^Préface$",
- "^Occupational [Hh]ealth [Ss]ervices.$",
- "^In Memoriam Professor Toshiyuki TAKESHIMA$",
- "^Účetní závěrka ve vybraném podniku.*$",
- "^Financial statements in selected company$",
- "^Abdominal [Aa]ortic [Aa]neurysms.*$",
- "^Pseudomyxoma peritonei$",
- "^Kazalo autora$",
- "(?i)^uvodna riječ$",
- "^Motivace jako způsob vedení lidí$",
- "^Motivation as a leadership$",
- "^Polyfunkční dům$",
- "^Multi\\-funkcional building$",
- "^Podnikatelský plán$",
- "(?i)^Podnikatelský záměr$",
- "(?i)^Business Plan$",
- "^Oceňování nemovitostí$",
- "^Marketingová komunikace$",
- "^Marketing communication$",
- "^Sumario Analítico$",
- "^Riječ uredništva$",
- "^Savjetovanja i priredbe$",
- "^Índice$",
- "^(Starobosanski nadpisi).*$",
- "^Vzdělávání pracovníků v organizaci$",
- "^Staff training in organization$",
- "^(Life Histories of North American Geometridae).*$",
- "^Strategická analýza podniku$",
- "^Strategic Analysis of an Enterprise$",
- "^Sadržaj$",
- "^Upute suradnicima$",
- "^Rodinný dům$",
- "(?i)^Fami(l)?ly house$",
- "^Upute autorima$",
- "^Strategic Analysis$",
- "^Finanční analýza vybraného podniku$",
- "^Finanční analýza$",
- "^Riječ urednika$",
- "(?i)^Content(s?)$",
- "(?i)^Inhalt$",
- "^Jinjō shōgaku shūshinsho jidōyō$",
- "(?i)^Index$",
- "^Chūgaku kokubun kyōkasho$",
- "^Retrato de una mujer$",
- "^Retrato de un hombre$",
- "^Kōtō shōgaku dokuhon$",
- "^Shotōka kokugo$",
- "^Shōgaku dokuhon$",
- "^Jinjō shōgaku kokugo dokuhon$",
- "^Shinsei kokugo dokuhon$",
- "^Teikoku dokuhon$",
- "^Instructions to Authors$",
- "^KİTAP TAHLİLİ$",
- "^PRZEGLĄD PIŚMIENNICTWA$",
- "(?i)^Presentación$",
- "^İçindekiler$",
- "(?i)^Tabl?e of contents$",
- "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
- "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
- "^Editorial( Board)?$",
- "(?i)^Editorial \\(English\\)$",
- "^Editörden$",
- "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
- "^(Kiri Karl Morgensternile).*$",
- "^(\\[Eksliibris Aleksandr).*\\]$",
- "^(\\[Eksliibris Aleksandr).*$",
- "^(Eksliibris Aleksandr).*$",
- "^(Kiri A\\. de Vignolles).*$",
- "^(2 kirja Karl Morgensternile).*$",
- "^(Pirita kloostri idaosa arheoloogilised).*$",
- "^(Kiri tundmatule).*$",
- "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
- "^(Eksliibris Nikolai Birukovile).*$",
- "^(Eksliibris Nikolai Issakovile).*$",
- "^(WHP Cruise Summary Information of section).*$",
- "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
- "^(Measurement of the spin\\-dependent structure function).*",
- "(?i)^.*authors['’′]? reply\\.?$",
- "(?i)^.*authors['’′]? response\\.?$"
- ]
- },
- "synonyms": {}
- }
-}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 0310a3f44..fe158d9fc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -345,6 +345,22 @@
+
+
+ eu.dnetlib
+ dnet-actionmanager-common
+ [6.0.0,7.0.0)
+
+
+ commons-httpclient
+ commons-httpclient
+
+
+ eu.dnetlib
+ dnet-openaireplus-mapping-utils
+
+
+