diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml
index cc27952fa..f39bf62f0 100644
--- a/dhp-workflows/dhp-dedup/pom.xml
+++ b/dhp-workflows/dhp-dedup/pom.xml
@@ -82,10 +82,6 @@
com.fasterxml.jackson.core
jackson-core
-
- eu.dnetlib
- dnet-actionmanager-common
-
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java
similarity index 99%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java
index 73f178edc..bd5c1118e 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java
similarity index 98%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java
index 5f81669e9..2fcac45fa 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java
@@ -1,11 +1,9 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
-import org.apache.commons.lang.NotImplementedException;
-import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.util.Collection;
-import java.util.Random;
import static java.util.stream.Collectors.toMap;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java
similarity index 95%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java
index ca390743e..3d505888a 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
@@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@@ -29,15 +22,11 @@ import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import scala.Tuple2;
-import java.io.IOException;
import java.io.StringReader;
-import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
public class DedupUtility {
private static final Double THRESHOLD = 0.95;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java
similarity index 99%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java
index 7206f892f..dda71fbcf 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java
@@ -1,7 +1,6 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java
similarity index 83%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java
index fb347ed51..66f0b3ce6 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
public enum OafEntityType {
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java
similarity index 96%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java
index 411913cdf..75b1dd01c 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java
@@ -1,8 +1,8 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.google.common.hash.Hashing;
-import eu.dnetlib.dedup.graph.ConnectedComponent;
-import eu.dnetlib.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
+import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent {
}
public static long getHashcode(final String id) {
- return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
+ return Hashing.murmur3_128().hashString(id).asLong();
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java
similarity index 98%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java
index 77c8e04e9..0ce12d10a 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
similarity index 95%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
index 4f25d620b..8298f9867 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable {
JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity));
//save the simrel in the workingdir
- spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
+ spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
+ .write()
+ .mode("overwrite")
+ .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
//create atomic actions
JavaRDD> newSimRels = relationsRDD
@@ -128,7 +131,6 @@ public class SparkCreateSimRels implements Serializable {
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
- .enableHiveSupport()
.getOrCreate();
}
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java
similarity index 99%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java
index 12d9f31b3..5c7be2817 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java
similarity index 97%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java
index 165a10b25..c83a66e70 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java
similarity index 99%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java
index 3fde1bdae..0c9890b03 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java
similarity index 95%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java
index 27a61c02d..dd1a370c5 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java
@@ -1,7 +1,7 @@
-package eu.dnetlib.dedup.graph;
+package eu.dnetlib.dhp.dedup.graph;
import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dedup.DedupUtility;
+import eu.dnetlib.dhp.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala
similarity index 96%
rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala
rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala
index 38c695152..80b0b9ef4 100644
--- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala
+++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup.graph
+package eu.dnetlib.dhp.dedup.graph
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
deleted file mode 100644
index 5a00a5967..000000000
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml
+++ /dev/null
@@ -1,126 +0,0 @@
-
-
-
- sourcePath
- the source path
-
-
- entity
- the entity that should be processed
-
-
- dedupConf
- the dedup Configuration
-
-
- targetPath
- the target path
-
-
- sparkDriverMemory
- memory for driver process
-
-
- sparkExecutorMemory
- memory for individual executor
-
-
- sparkExecutorCores
- number of cores used by single executor
-
-
-
-
-
-
-
- Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
-
-
-
-
-
-
-
-
-
-
-
-
- ${jobTracker}
- ${nameNode}
- yarn-cluster
- cluster
- Create Similarity Relations
- eu.dnetlib.dedup.SparkCreateSimRels
- dhp-dedup-${projectVersion}.jar
- --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory} --conf
- spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
- spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
- spark.sql.warehouse.dir="/user/hive/warehouse"
-
- -mtyarn-cluster
- --sourcePath${sourcePath}
- --targetPath${targetPath}
- --entity${entity}
- --dedupConf${dedupConf}
-
-
-
-
-
-
-
-
- ${jobTracker}
- ${nameNode}
- yarn-cluster
- cluster
- Create Connected Components
- eu.dnetlib.dedup.SparkCreateConnectedComponent
- dhp-dedup-${projectVersion}.jar
- --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory} --conf
- spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
- spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
- spark.sql.warehouse.dir="/user/hive/warehouse"
-
- -mtyarn-cluster
- --sourcePath${sourcePath}
- --targetPath${targetPath}
- --entity${entity}
- --dedupConf${dedupConf}
-
-
-
-
-
-
-
- ${jobTracker}
- ${nameNode}
- yarn-cluster
- cluster
- Create Dedup Record
- eu.dnetlib.dedup.SparkCreateDedupRecord
- dhp-dedup-${projectVersion}.jar
- --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory} --conf
- spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
- spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
- spark.sql.warehouse.dir="/user/hive/warehouse"
-
- -mtyarn-cluster
- --sourcePath${sourcePath}
- --dedupPath${dedupPath}
- --entity${entity}
- --dedupConf${dedupConf}
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml
similarity index 62%
rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml
index fcab9dd00..2e0ed9aee 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml
@@ -15,12 +15,4 @@
oozie.action.sharelib.for.spark
spark2
-
- hive_metastore_uris
- thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
-
-
- hive_db_name
- openaire
-
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml
similarity index 96%
rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml
rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml
index c4b17860e..749af6ecb 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml
@@ -47,7 +47,7 @@
yarn-cluster
cluster
Update Relations
- eu.dnetlib.dedup.SparkPropagateRelation
+ eu.dnetlib.dhp.dedup.SparkPropagateRelation
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml
new file mode 100644
index 000000000..2e0ed9aee
--- /dev/null
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml
similarity index 95%
rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml
rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml
index 477e98791..457e62818 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml
@@ -56,7 +56,7 @@
yarn-cluster
cluster
Create Merge Relations
- eu.dnetlib.dedup.SparkCreateConnectedComponent
+ eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
@@ -81,7 +81,7 @@
yarn-cluster
cluster
Create Dedup Record
- eu.dnetlib.dedup.SparkCreateDedupRecord
+ eu.dnetlib.dhp.dedup.SparkCreateDedupRecord
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
@@ -106,7 +106,7 @@
yarn-cluster
cluster
Create Dedup Record
- eu.dnetlib.dedup.SparkUpdateEntity
+ eu.dnetlib.dhp.dedup.SparkUpdateEntity
dhp-dedup-${projectVersion}.jar
--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --conf
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml
new file mode 100644
index 000000000..2e0ed9aee
--- /dev/null
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+
+
+ jobTracker
+ yarnRM
+
+
+ nameNode
+ hdfs://nameservice1
+
+
+ oozie.use.system.libpath
+ true
+
+
+ oozie.action.sharelib.for.spark
+ spark2
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
similarity index 68%
rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml
rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
index a685db1e8..01498ce04 100644
--- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml
+++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml
@@ -34,6 +34,21 @@
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
@@ -50,20 +65,19 @@
- ${jobTracker}
- ${nameNode}
- yarn-cluster
+ yarn
cluster
Create Similarity Relations
- eu.dnetlib.dedup.SparkCreateSimRels
+ eu.dnetlib.dhp.dedup.SparkCreateSimRels
dhp-dedup-${projectVersion}.jar
- --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory} --conf
- spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf
- spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf
- spark.sql.warehouse.dir="/user/hive/warehouse"
+
+ --executor-memory ${sparkExecutorMemory}
+ --executor-cores ${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
- -mtyarn-cluster
+ -mtyarn
--i${graphBasePath}
--o${rawSet}
--la${isLookUpUrl}
diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java
similarity index 97%
rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java
index 817f2075c..e8bfd08fd 100644
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils;
diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java
similarity index 84%
rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java
index 09f8a0fd6..47e446e7a 100644
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup;
+package eu.dnetlib.dhp.dedup;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
@@ -74,19 +74,9 @@ public class SparkCreateDedupTest {
final HashFunction hashFunction = Hashing.murmur3_128();
System.out.println( s1.hashCode());
- System.out.println(hashFunction.hashUnencodedChars(s1).asLong());
+ System.out.println(hashFunction.hashString(s1).asLong());
System.out.println( s2.hashCode());
- System.out.println(hashFunction.hashUnencodedChars(s2).asLong());
- }
-
- @Test
- public void testJoinEntities() throws Exception{
- SparkJoinEntities.main(new String[] {
- "-mt", "local[*]",
- "-i", "/tmp/dedup",
- "-w", "/tmp/dedup",
- "-o", "/tmp/dedup",
- });
+ System.out.println(hashFunction.hashString(s2).asLong());
}
}
diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java
similarity index 95%
rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java
rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java
index 7a63cfe24..8a88896fc 100644
--- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java
+++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java
@@ -1,4 +1,4 @@
-package eu.dnetlib.dedup.jpath;
+package eu.dnetlib.dhp.dedup.jpath;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json
similarity index 100%
rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json
rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
similarity index 100%
rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json
rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json
similarity index 100%
rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json
rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json
diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json
similarity index 100%
rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json
rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json
diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties
index 68816c224..8230dfc18 100644
--- a/dhp-workflows/dhp-graph-provision/job-override.properties
+++ b/dhp-workflows/dhp-graph-provision/job-override.properties
@@ -1,12 +1,14 @@
-sparkDriverMemory=10G
-sparkExecutorMemory=15G
+sparkExecutorCoresForJoining=1
+sparkDriverMemoryForJoining=10G
+sparkExecutorMemoryForJoining=15G
+sparkExecutorCoresForIndexing=64
+sparkDriverMemoryForIndexing=3G
+sparkExecutorMemoryForIndexing=2G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision
format=TMF
batchSize=2000
-sparkExecutorCoresForJoining=128
-sparkExecutorCoresForIndexing=64
reuseRecords=false
otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index fe158d9fc..1ae078128 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,6 @@
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
4.0.0
eu.dnetlib.dhp
@@ -101,12 +101,12 @@
org.apache.hadoop
hadoop-common
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-client
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-client
${dhp.hadoop.version}
provided
@@ -148,6 +148,13 @@
${dhp.commons.lang.version}
+
+ com.google.guava
+ guava
+ ${dhp.guava.version}
+
+
+
commons-codec
commons-codec
@@ -167,11 +174,11 @@
provided
-
- net.sf.saxon
- Saxon-HE
- 9.9.1-6
-
+
+ net.sf.saxon
+ Saxon-HE
+ 9.9.1-6
+
dom4j
@@ -192,56 +199,56 @@
- com.mycila.xmltool
- xmltool
- 3.3
-
+ com.mycila.xmltool
+ xmltool
+ 3.3
+
-
- org.apache.solr
- solr-solrj
- 7.5.0
-
-
- *
- *
-
-
-
-
- com.lucidworks.spark
- spark-solr
- 3.6.0
-
-
- *
- *
-
-
-
+
+ org.apache.solr
+ solr-solrj
+ 7.5.0
+
+
+ *
+ *
+
+
+
+
+ com.lucidworks.spark
+ spark-solr
+ 3.6.0
+
+
+ *
+ *
+
+
+
-
- org.apache.httpcomponents
- httpclient
- 4.5.3
-
-
- org.apache.httpcomponents
- httpmime
- 4.5.3
-
-
- org.noggit
- noggit
- 0.8
-
-
- org.apache.zookeeper
- zookeeper
- 3.4.11
-
+
+ org.apache.httpcomponents
+ httpclient
+ 4.5.3
+
+
+ org.apache.httpcomponents
+ httpmime
+ 4.5.3
+
+
+ org.noggit
+ noggit
+ 0.8
+
+
+ org.apache.zookeeper
+ zookeeper
+ 3.4.11
+
-
+
net.schmizz
sshj
0.10.0
@@ -283,17 +290,17 @@
dnet-pace-core
4.0.0
-
- eu.dnetlib
- cnr-rmi-api
- [2.0.0,3.0.0)
-
+
+ eu.dnetlib
+ cnr-rmi-api
+ [2.0.0,3.0.0)
+
-
- org.apache.cxf
- cxf-rt-transports-http
- 3.1.5
-
+
+ org.apache.cxf
+ cxf-rt-transports-http
+ 3.1.5
+
javax.persistence
javax.persistence-api
@@ -301,36 +308,36 @@
provided
-
- com.rabbitmq
- amqp-client
- 5.6.0
-
-
- com.jayway.jsonpath
- json-path
- 2.4.0
-
-
- com.arakelian
- java-jq
- 0.10.1
-
-
- edu.cmu
- secondstring
- 1.0.0
-
-
- org.mongodb
- mongo-java-driver
- ${mongodb.driver.version}
-
-
- org.antlr
- stringtemplate
- 4.0
-
+
+ com.rabbitmq
+ amqp-client
+ 5.6.0
+
+
+ com.jayway.jsonpath
+ json-path
+ 2.4.0
+
+
+ com.arakelian
+ java-jq
+ 0.10.1
+
+
+ edu.cmu
+ secondstring
+ 1.0.0
+
+
+ org.mongodb
+ mongo-java-driver
+ ${mongodb.driver.version}
+
+
+ org.antlr
+ stringtemplate
+ 4.0
+
org.apache.oozie
@@ -345,22 +352,6 @@
-
-
- eu.dnetlib
- dnet-actionmanager-common
- [6.0.0,7.0.0)
-
-
- commons-httpclient
- commons-httpclient
-
-
- eu.dnetlib
- dnet-openaireplus-mapping-utils
-
-
-
@@ -512,9 +503,9 @@
2.4.0.cloudera2
2.9.6
3.5
+ 11.0.2
2.11.12
4.12
3.4.2
-