diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 6e656034bf..ec7c14e906 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -52,6 +52,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -76,11 +78,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} @@ -159,7 +161,7 @@ eu.dnetlib.dhp - dhp-schemas + dhp-schemas_${scala.binary.version} diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index fac087ac70..a2525128bb 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -20,7 +20,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -39,8 +39,9 @@ + true + ${scala.binary.version} ${scala.version} - -target:jvm-1.8 @@ -98,11 +99,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index 87126920ba..13ffe8b180 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -11,12 +11,12 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 53d349d2a1..acd04901d7 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -38,6 +38,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -54,11 +56,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 479a9e8c63..7ecc8b35d2 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -16,11 +16,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 01f1ea3211..322fc7e93d 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -18,11 +18,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.elasticsearch diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index 750cc70281..1cadae4608 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -13,7 +13,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -32,8 +32,9 @@ + true + ${scala.binary.version} ${scala.version} - -target:jvm-1.8 @@ -58,38 +59,67 @@ eu.dnetlib.dhp dhp-common ${project.version} + + + log4j + log4j + + + annotations + org.jetbrains + + + slf4j-api + org.slf4j + + eu.dnetlib.dhp dhp-pace-core ${project.version} + + + jsr305 + com.google.code.findbugs + + + javassist + org.javassist + + + + + + org.apache.commons + commons-lang3 org.scala-lang.modules - scala-java8-compat_2.11 + scala-java8-compat_${scala.binary.version} 1.0.2 org.scala-lang.modules - scala-collection-compat_2.11 - 2.8.0 + scala-collection-compat_${scala.binary.version} + 2.11.0 org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.apache.spark - spark-graphx_2.11 + spark-graphx_${scala.binary.version} @@ -107,12 +137,6 @@ jaxen - - com.influxdb - influxdb-client-java - 3.1.0 - - com.fasterxml.jackson.core jackson-databind diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkSimRelsAnalytics.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkSimRelsAnalytics.java new file mode 100644 index 0000000000..18f8dd92d7 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkSimRelsAnalytics.java @@ -0,0 +1,118 @@ + +package eu.dnetlib.dhp.oa.dedup; + +import java.io.IOException; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.application.dedup.log.DedupLogModel; +import eu.dnetlib.dhp.application.dedup.log.DedupLogWriter; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.SparkDedupConfig; + +public class SparkSimRelsAnalytics extends AbstractSparkAction { + + private static final Logger log = LoggerFactory.getLogger(SparkSimRelsAnalytics.class); + + public SparkSimRelsAnalytics(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + spark.sparkContext().setLogLevel("WARN"); + } + + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkSimRelsAnalytics.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + new SparkSimRelsAnalytics(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } + + @Override + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException, SAXException { + + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("numPartitions: '{}'", numPartitions); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); + + final String dfLogPath = parser.get("dataframeLog"); + final String runTag = Optional.ofNullable(parser.get("runTAG")).orElse("UNKNOWN"); + + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + final long start = System.currentTimeMillis(); + + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating simrels for: '{}'", subEntity); + + final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + SparkDedupConfig sparkConfig = new SparkDedupConfig(dedupConf, numPartitions); + + spark.udf().register("collect_sort_slice", sparkConfig.collectSortSliceUDAF()); + + Dataset simRels = spark + .read() + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .transform(sparkConfig.modelExtractor()) // Extract fields from input json column according to model + // definition + .transform(sparkConfig.generateClustersWithWindows()) // generate pairs according to + // filters, clusters, and model + // definition + .transform(sparkConfig.processClusters()) // process blocks and emits pairs of found + // similarities + .map( + (MapFunction) t -> DedupUtility + .createSimRel(t.getStruct(0).getString(0), t.getStruct(0).getString(1), entity), + Encoders.bean(Relation.class)); + + saveParquet(simRels, outputPath, SaveMode.Overwrite); + final long end = System.currentTimeMillis(); + if (StringUtils.isNotBlank(dfLogPath)) { + final DedupLogModel model = new DedupLogModel(runTag, dedupConf.toString(), subEntity, start, end, + end - start); + new DedupLogWriter(dfLogPath).appendLog(model, spark); + + } + + } + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml index 2e0ed9aeea..862f568c96 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml @@ -13,6 +13,6 @@ oozie.action.sharelib.for.spark - spark2 + spark342 \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml index ba2270c8a8..4fd1990b4d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml @@ -126,15 +126,25 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=15000 + --conf spark.sql.shuffle.partitions=5000 + --conf spark.driver.extraJavaOptions="-Xss256k" + --conf spark.executor.extraJavaOptions="-Dlog4j.configuration=spark-log4j.properties -Xss256k" + --conf spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.minExecutors=100 --conf spark.dynamicAllocation.shuffleTracking.enabled=true + --conf spark.network.io.preferDirectBufs=true --conf spark.memory.fraction=0.4 --conf spark.sql.adaptive.coalescePartitions.minPartitionNum=5000 + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.shuffle.service.enabled=true --conf spark.eventLog.enabled=true + --conf spark.executor.heartbeatInterval=60s + --conf spark.network.timeout=640s + --conf spark.sql.legacy.allowUntypedScalaUDF=true --graphBasePath${graphBasePath} --isLookUpUrl${isLookUpUrl} --actionSetId${actionSetId} --workingPath${workingPath} - --numPartitions15000 + --numPartitions5000 - + diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index ef7cc656cf..00c9d5f27b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -95,6 +95,7 @@ public class SparkDedupTest implements Serializable { final SparkConf conf = new SparkConf(); conf.set("spark.sql.shuffle.partitions", "200"); + conf.set("spark.sql.legacy.allowUntypedScalaUDF", "true"); spark = SparkSession .builder() .appName(SparkDedupTest.class.getSimpleName()) diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 37accbc4f5..6e8911fbab 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -33,6 +33,8 @@ + true + ${scala.binary.version} ${scala.version} @@ -70,12 +72,12 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index 591cad252c..9698dee03c 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -12,11 +12,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} @@ -27,7 +27,7 @@ org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} test diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index f579a7d2bf..ef35951c00 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -14,7 +14,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -37,6 +37,8 @@ -Xmax-classfile-name 200 + true + ${scala.binary.version} ${scala.version} @@ -64,15 +66,15 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} test @@ -125,7 +127,7 @@ org.json4s - json4s-jackson_2.11 + json4s-jackson_${scala.binary.version} diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 413cc8cdd6..e62fcdf198 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -14,7 +14,7 @@ net.alchim31.maven scala-maven-plugin - 4.0.1 + ${net.alchim31.maven.version} scala-compile-first @@ -37,6 +37,8 @@ -Xmax-classfile-name 200 + true + ${scala.binary.version} ${scala.version} @@ -48,11 +50,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} com.jayway.jsonpath diff --git a/dhp-workflows/dhp-stats-promote/pom.xml b/dhp-workflows/dhp-stats-promote/pom.xml index ce3e739a5c..9e17a78dcb 100644 --- a/dhp-workflows/dhp-stats-promote/pom.xml +++ b/dhp-workflows/dhp-stats-promote/pom.xml @@ -10,11 +10,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 2bc610c420..f491b58689 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -10,11 +10,11 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 954c8bd39b..a9dbb09ae1 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -46,13 +46,11 @@ org.apache.spark - spark-core_2.11 - 2.2.0 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 - 2.4.5 + spark-sql_${scala.binary.version} com.googlecode.json-simple diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 54e18580b3..56aec73b78 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -46,13 +46,11 @@ org.apache.spark - spark-core_2.11 - 2.2.0 + spark-core_${scala.binary.version} org.apache.spark - spark-sql_2.11 - 2.4.5 + spark-sql_${scala.binary.version} com.googlecode.json-simple diff --git a/pom.xml b/pom.xml index 4707f7d017..9763554531 100644 --- a/pom.xml +++ b/pom.xml @@ -142,7 +142,7 @@ eu.dnetlib.dhp - dhp-schemas + dhp-schemas_${scala.binary.version} ${dhp-schemas.version} @@ -171,25 +171,25 @@ org.apache.spark - spark-core_2.11 + spark-core_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-sql_2.11 + spark-sql_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-graphx_2.11 + spark-graphx_${scala.binary.version} ${dhp.spark.version} provided org.apache.spark - spark-hive_2.11 + spark-hive_${scala.binary.version} ${dhp.spark.version} test @@ -295,7 +295,7 @@ com.lucidworks.spark spark-solr - 3.6.0 + 4.0.2 * @@ -523,7 +523,7 @@ org.json4s - json4s-jackson_2.11 + json4s-jackson_${scala.binary.version} ${json4s.version} @@ -699,7 +699,7 @@ org.antipathy - mvn-scalafmt_2.11 + mvn-scalafmt_${scala.binary.version} 1.0.1640073709.733712b @@ -756,7 +756,7 @@ org.antipathy - mvn-scalafmt_2.11 + mvn-scalafmt_${scala.binary.version} https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf false @@ -865,17 +865,18 @@ cdh5.9.2 2.6.0-${dhp.cdh.version} 4.1.0-${dhp.cdh.version} - 2.4.0.cloudera2 - 2.9.6 + 3.4.1 + 2.14.2 3.5 true 11.0.2 - 2.11.12 + 2.12.18 + 2.12 5.6.1 3.3.3 3.4.2 [2.12,3.0) - [3.17.1] + 3.17.2-SNAPSHOT [4.0.3] [6.0.5] [3.1.6] @@ -883,13 +884,13 @@ 7.5.0 4.7.2 1.20 - 3.5.3 + 3.7.0-M11 4.13.0 1.8 4.1.2 1.8 4.5.3 - 4.0.1 + 4.8.1 2.2.2 1.1.3 3.2.1