From 6cb0a9bff07bc1404c4ff46815dc01ac5a7f66f6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Mar 2020 16:48:14 +0100 Subject: [PATCH] dedup wf directory structure aligned with project commons --- dhp-workflows/dhp-dedup/pom.xml | 4 - .../dnetlib/{ => dhp}/dedup/DatePicker.java | 2 +- .../{ => dhp}/dedup/DedupRecordFactory.java | 5 +- .../dnetlib/{ => dhp}/dedup/DedupUtility.java | 13 +- .../eu/dnetlib/{ => dhp}/dedup/Deduper.java | 3 +- .../{ => dhp}/dedup/OafEntityType.java | 2 +- .../dedup/SparkCreateConnectedComponent.java | 8 +- .../dedup/SparkCreateDedupRecord.java | 2 +- .../{ => dhp}/dedup/SparkCreateSimRels.java | 8 +- .../dedup/SparkPropagateRelation.java | 2 +- .../{ => dhp}/dedup/SparkReporter.java | 2 +- .../{ => dhp}/dedup/SparkUpdateEntity.java | 2 +- .../dedup/graph/ConnectedComponent.java | 4 +- .../dedup/graph/GraphProcessor.scala | 2 +- .../dnetlib/dhp/dedup/oozie_app/workflow.xml | 126 ---------- .../oozie_app/config-default.xml | 8 - .../oozie_app/workflow.xml} | 2 +- .../dedup/roots/oozie_app/config-default.xml | 18 ++ .../oozie_app/workflow.xml} | 6 +- .../dedup/scan/oozie_app/config-default.xml | 18 ++ .../oozie_app/workflow.xml} | 34 ++- .../{ => dhp}/dedup/MergeAuthorTest.java | 2 +- .../{ => dhp}/dedup/SparkCreateDedupTest.java | 16 +- .../{ => dhp}/dedup/jpath/JsonPathTest.java | 2 +- .../{ => dhp}/dedup/conf/org.curr.conf.json | 0 .../{ => dhp}/dedup/conf/pub.curr.conf.json | 0 .../dnetlib/{ => dhp}/dedup/conf/sample.json | 0 .../{ => dhp}/dedup/json/authors_merge.json | 0 .../job-override.properties | 10 +- pom.xml | 225 +++++++++--------- 30 files changed, 204 insertions(+), 322 deletions(-) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DatePicker.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DedupRecordFactory.java (98%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/DedupUtility.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/Deduper.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/OafEntityType.java (83%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateConnectedComponent.java (96%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateDedupRecord.java (98%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateSimRels.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkPropagateRelation.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkReporter.java (97%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/SparkUpdateEntity.java (99%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/graph/ConnectedComponent.java (95%) rename dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/{ => dhp}/dedup/graph/GraphProcessor.scala (96%) delete mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{ => relations}/oozie_app/config-default.xml (62%) rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/UpdateRelationsWf.xml => relations/oozie_app/workflow.xml} (96%) create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/BuildRootRecordsWf.xml => roots/oozie_app/workflow.xml} (95%) create mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml rename dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/{oozie_app/DuplicateScanWf.xml => scan/oozie_app/workflow.xml} (68%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/MergeAuthorTest.java (97%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/SparkCreateDedupTest.java (84%) rename dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/{ => dhp}/dedup/jpath/JsonPathTest.java (95%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/org.curr.conf.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/pub.curr.conf.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/conf/sample.json (100%) rename dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/{ => dhp}/dedup/json/authors_merge.json (100%) diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index cc27952fa..f39bf62f0 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -82,10 +82,6 @@ com.fasterxml.jackson.core jackson-core - - eu.dnetlib - dnet-actionmanager-common - diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java index 73f178edc..bd5c1118e 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Field; import org.apache.commons.lang.StringUtils; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java similarity index 98% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java index 5f81669e9..2fcac45fa 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java @@ -1,11 +1,9 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.lang.NotImplementedException; -import org.apache.commons.lang.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -16,7 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper; import scala.Tuple2; import java.util.Collection; -import java.util.Random; import static java.util.stream.Collectors.toMap; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java index ca390743e..3d505888a 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; @@ -13,15 +13,8 @@ import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -29,15 +22,11 @@ import org.dom4j.Element; import org.dom4j.io.SAXReader; import scala.Tuple2; -import java.io.IOException; import java.io.StringReader; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; public class DedupUtility { private static final Double THRESHOLD = 0.95; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java index 7206f892f..dda71fbcf 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java similarity index 83% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java index fb347ed51..66f0b3ce6 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; public enum OafEntityType { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java index 411913cdf..75b1dd01c 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java @@ -1,8 +1,8 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.dedup.graph.GraphProcessor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -83,7 +83,7 @@ public class SparkCreateConnectedComponent { } public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); + return Hashing.murmur3_128().hashString(id).asLong(); } private static SparkSession getSparkSession(ArgumentApplicationParser parser) { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java similarity index 98% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java index 77c8e04e9..0ce12d10a 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java index 4f25d620b..8298f9867 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; @@ -73,7 +73,10 @@ public class SparkCreateSimRels implements Serializable { JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); //save the simrel in the workingdir - spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); //create atomic actions JavaRDD> newSimRels = relationsRDD @@ -128,7 +131,6 @@ public class SparkCreateSimRels implements Serializable { .appName(SparkCreateSimRels.class.getSimpleName()) .master(parser.get("master")) .config(conf) - .enableHiveSupport() .getOrCreate(); } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java index 12d9f31b3..5c7be2817 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java index 165a10b25..c83a66e70 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.util.Reporter; import org.apache.commons.logging.Log; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java index 3fde1bdae..0c9890b03 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java index 27a61c02d..dd1a370c5 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java @@ -1,7 +1,7 @@ -package eu.dnetlib.dedup.graph; +package eu.dnetlib.dhp.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.dhp.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala index 38c695152..80b0b9ef4 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.graph +package eu.dnetlib.dhp.dedup.graph import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml deleted file mode 100644 index 5a00a5967..000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - sourcePath - the source path - - - entity - the entity that should be processed - - - dedupConf - the dedup Configuration - - - targetPath - the target path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Connected Components - eu.dnetlib.dedup.SparkCreateConnectedComponent - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --dedupPath${dedupPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml similarity index 62% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml index fcab9dd00..2e0ed9aee 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/config-default.xml @@ -15,12 +15,4 @@ oozie.action.sharelib.for.spark spark2 - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hive_db_name - openaire - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml index c4b17860e..749af6ecb 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/UpdateRelationsWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/relations/oozie_app/workflow.xml @@ -47,7 +47,7 @@ yarn-cluster cluster Update Relations - eu.dnetlib.dedup.SparkPropagateRelation + eu.dnetlib.dhp.dedup.SparkPropagateRelation dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml index 477e98791..457e62818 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/BuildRootRecordsWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/roots/oozie_app/workflow.xml @@ -56,7 +56,7 @@ yarn-cluster cluster Create Merge Relations - eu.dnetlib.dedup.SparkCreateConnectedComponent + eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf @@ -81,7 +81,7 @@ yarn-cluster cluster Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord + eu.dnetlib.dhp.dedup.SparkCreateDedupRecord dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf @@ -106,7 +106,7 @@ yarn-cluster cluster Create Dedup Record - eu.dnetlib.dedup.SparkUpdateEntity + eu.dnetlib.dhp.dedup.SparkUpdateEntity dhp-dedup-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml similarity index 68% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml index a685db1e8..01498ce04 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/DuplicateScanWf.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -34,6 +34,21 @@ + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + @@ -50,20 +65,19 @@ - ${jobTracker} - ${nameNode} - yarn-cluster + yarn cluster Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels + eu.dnetlib.dhp.dedup.SparkCreateSimRels dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" - -mtyarn-cluster + -mtyarn --i${graphBasePath} --o${rawSet} --la${isLookUpUrl} diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java index 817f2075c..e8bfd08fd 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.IOUtils; diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java similarity index 84% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index 09f8a0fd6..47e446e7a 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; @@ -74,19 +74,9 @@ public class SparkCreateDedupTest { final HashFunction hashFunction = Hashing.murmur3_128(); System.out.println( s1.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); + System.out.println(hashFunction.hashString(s1).asLong()); System.out.println( s2.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); - } - - @Test - public void testJoinEntities() throws Exception{ - SparkJoinEntities.main(new String[] { - "-mt", "local[*]", - "-i", "/tmp/dedup", - "-w", "/tmp/dedup", - "-o", "/tmp/dedup", - }); + System.out.println(hashFunction.hashString(s2).asLong()); } } diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java index 7a63cfe24..8a88896fc 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.jpath; +package eu.dnetlib.dhp.dedup.jpath; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 68816c224..8230dfc18 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,12 +1,14 @@ -sparkDriverMemory=10G -sparkExecutorMemory=15G +sparkExecutorCoresForJoining=1 +sparkDriverMemoryForJoining=10G +sparkExecutorMemoryForJoining=15G +sparkExecutorCoresForIndexing=64 +sparkDriverMemoryForIndexing=3G +sparkExecutorMemoryForIndexing=2G #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 outputPath=/tmp/openaire_provision format=TMF batchSize=2000 -sparkExecutorCoresForJoining=128 -sparkExecutorCoresForIndexing=64 reuseRecords=false otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource \ No newline at end of file diff --git a/pom.xml b/pom.xml index fe158d9fc..1ae078128 100644 --- a/pom.xml +++ b/pom.xml @@ -1,6 +1,6 @@ + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 eu.dnetlib.dhp @@ -101,12 +101,12 @@ org.apache.hadoop hadoop-common - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client ${dhp.hadoop.version} provided @@ -148,6 +148,13 @@ ${dhp.commons.lang.version} + + com.google.guava + guava + ${dhp.guava.version} + + + commons-codec commons-codec @@ -167,11 +174,11 @@ provided - - net.sf.saxon - Saxon-HE - 9.9.1-6 - + + net.sf.saxon + Saxon-HE + 9.9.1-6 + dom4j @@ -192,56 +199,56 @@ - com.mycila.xmltool - xmltool - 3.3 - + com.mycila.xmltool + xmltool + 3.3 + - - org.apache.solr - solr-solrj - 7.5.0 - - - * - * - - - - - com.lucidworks.spark - spark-solr - 3.6.0 - - - * - * - - - + + org.apache.solr + solr-solrj + 7.5.0 + + + * + * + + + + + com.lucidworks.spark + spark-solr + 3.6.0 + + + * + * + + + - - org.apache.httpcomponents - httpclient - 4.5.3 - - - org.apache.httpcomponents - httpmime - 4.5.3 - - - org.noggit - noggit - 0.8 - - - org.apache.zookeeper - zookeeper - 3.4.11 - + + org.apache.httpcomponents + httpclient + 4.5.3 + + + org.apache.httpcomponents + httpmime + 4.5.3 + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + - + net.schmizz sshj 0.10.0 @@ -283,17 +290,17 @@ dnet-pace-core 4.0.0 - - eu.dnetlib - cnr-rmi-api - [2.0.0,3.0.0) - + + eu.dnetlib + cnr-rmi-api + [2.0.0,3.0.0) + - - org.apache.cxf - cxf-rt-transports-http - 3.1.5 - + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + javax.persistence javax.persistence-api @@ -301,36 +308,36 @@ provided - - com.rabbitmq - amqp-client - 5.6.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - - - org.mongodb - mongo-java-driver - ${mongodb.driver.version} - - - org.antlr - stringtemplate - 4.0 - + + com.rabbitmq + amqp-client + 5.6.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + org.antlr + stringtemplate + 4.0 + org.apache.oozie @@ -345,22 +352,6 @@ - - - eu.dnetlib - dnet-actionmanager-common - [6.0.0,7.0.0) - - - commons-httpclient - commons-httpclient - - - eu.dnetlib - dnet-openaireplus-mapping-utils - - - @@ -512,9 +503,9 @@ 2.4.0.cloudera2 2.9.6 3.5 + 11.0.2 2.11.12 4.12 3.4.2 -