diff --git a/.gitignore b/.gitignore
index 14cd4d345..6fafc7055 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath
/**/.scalafmt.conf
/.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
index 84b962b4b..eddcd8867 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder;
// execute
- Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+ try {
+ mojo.execute();
+ Assertions.fail("mojo.execute() was expected to throw"); // not reached
+ } catch (Exception e) {
+ // either exception type is accepted as the failure signal
+ Assertions
+ .assertTrue(
+ e instanceof MojoExecutionException || e instanceof IllegalArgumentException);
+ }
}
@Test
diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c2f76cff7..2c7a0ef8c 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -70,10 +70,7 @@
com.ibm.icu
icu4j
-
- org.apache.hadoop
- hadoop-common
-
+
com.github.sisyphsu
dateparser
@@ -163,7 +160,7 @@
eu.dnetlib.dhp
- ${dhp-schemas.artifact}
+ dhp-schemas
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
index fac9a7565..fbf586f8c 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
@@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
- } catch (IOException e) {
+ } catch (Exception e) {
throw new RuntimeException(e);
}
}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
index 544da78f5..fe4f58f06 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
@@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
- * @throws IOException
- * @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
index 342d73cdc..78bb99e79 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
@@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
index 98e8daa18..4f0cee53d 100644
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@@ -154,5 +154,13 @@
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
+ },
+ "isamongtopnsimilardocuments": {
+ "original": "IsAmongTopNSimilarDocuments",
+ "inverse": "HasAmongTopNSimilarDocuments"
+ },
+ "hasamongtopnsimilardocuments": {
+ "original": "HasAmongTopNSimilarDocuments",
+ "inverse": "IsAmongTopNSimilarDocuments"
}
}
\ No newline at end of file
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
index a14c25837..526bbd295 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
- SparkSession
+ val b = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(master)
- .getOrCreate()
+ if (master != null)
+ b.master(master)
+ b.getOrCreate()
}
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
index a995016a8..72a17777e 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
- generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+ val sum = ScholixUtils.resultToSummary(r) // summarise once; the same value is null-checked and reused below
+ if (sum != null)
+ generateScholixResourceFromSummary(sum)
+ else
+ null
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
}
+ def invRel(rel: String): String = {
+ // Case-insensitive lookup of the relation; yields null when the relation is not registered.
+ relations
+ .get(rel.toLowerCase)
+ .map(_.inverse)
+ .orNull
+ }
+
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
- if (r.isInstanceOf[Publication])
- s.setTypology(Typology.publication)
- else
- s.setTypology(Typology.dataset)
+// s.setTypology(r.getResulttype.getClassid)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml
index 6c706b692..52ec2a253 100644
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@@ -24,7 +24,7 @@
scala-compile-first
- initialize
+ process-resources
add-source
compile
@@ -59,14 +59,6 @@
edu.cmu
secondstring
-
- com.google.guava
- guava
-
-
- com.google.code.gson
- gson
-
org.apache.commons
commons-lang3
@@ -91,10 +83,6 @@
com.fasterxml.jackson.core
jackson-databind
-
- org.apache.commons
- commons-math3
-
com.jayway.jsonpath
json-path
@@ -113,4 +101,90 @@
+
+
+ spark-24
+
+ true
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-34
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-35
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 6bfb8b3f4..b055077d8 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -1,12 +1,6 @@
package eu.dnetlib.pace.common;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
@@ -15,6 +9,13 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
/**
* Set of common functions for the framework
*
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
index aa04188da..e6a1c4ccc 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
- df.map(r => rowFromJson(r))(RowEncoder(schema))
+ df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
}
def rowFromJson(json: String): Row = {
diff --git a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
new file mode 100644
index 000000000..a426703d6
--- /dev/null
+++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+ /** Returns the [[Row]] encoder for `schema`, obtained by delegating to `RowEncoder(schema)`. */
+ def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+ RowEncoder(schema)
+ }
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
new file mode 100644
index 000000000..cbc454ae2
--- /dev/null
+++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+ /** Returns the [[Row]] encoder for `schema`, obtained by delegating to `ExpressionEncoder(schema)`. */
+ def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+ ExpressionEncoder(schema)
+ }
+}
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
index 93db552c1..be5c1ebb9 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest {
diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml
new file mode 100644
index 000000000..d8e17ed46
--- /dev/null
+++ b/dhp-shade-package/pom.xml
@@ -0,0 +1,169 @@
+
+
+ 4.0.0
+
+ eu.dnetlib.dhp
+ dhp
+ 1.2.5-SNAPSHOT
+ ../pom.xml
+
+
+
+ dhp-shade-package
+ jar
+
+
+
+ DHPSite
+ ${dhp.site.stage.path}/dhp-common
+
+
+
+ This module creates a jar containing all module dependencies
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-actionmanager
+ ${project.version}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-graph-mapper
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-graph-provision
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-impact-indicators
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-actionsets
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-hist-snaps
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-monitor-irish
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-promote
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-update
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-swh
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-usage-raw-data-update
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-usage-stats-build
+ ${project.version}
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+ package
+
+ shade
+
+
+
+
+ eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
+
+
+
+
+ META-INF/cxf/bus-extensions.txt
+
+
+
+
+ *:*
+
+ META-INF/maven/**
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+
+ com
+ repackaged.com.google.common
+
+ com.google.common.**
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
index c28d81992..c10eb5c8c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
.union(openAPCRelations)
.union(dataciteRelations)
.saveAsHadoopFile(
- outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+ outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
index 040c89782..c1e0c4d68 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD
.union(projectsRDD)
.saveAsHadoopFile(
- outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+ outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
index 44c82e256..c4aa64fd4 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2)
+ case _ =>
}
})
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
index c415dd9a4..b065db334 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
@@ -94,7 +94,8 @@ object MagUtility extends Serializable {
)
di
}
-val datatypedict = Map(
+
+ val datatypedict = Map(
"bool" -> BooleanType,
"int" -> IntegerType,
"uint" -> IntegerType,
@@ -505,8 +506,6 @@ val datatypedict = Map(
)
)
-
-
result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@@ -601,17 +600,17 @@ val datatypedict = Map(
if (paper.doi.orNull != null) {
pidList = pidList ::: List(
- structuredProperty(
- paper.doi.get,
- qualifier(
- PidType.doi.toString,
- PidType.doi.toString,
- ModelConstants.DNET_PID_TYPES,
- ModelConstants.DNET_PID_TYPES
- ),
- null
- )
+ structuredProperty(
+ paper.doi.get,
+ qualifier(
+ PidType.doi.toString,
+ PidType.doi.toString,
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES
+ ),
+ null
)
+ )
}
instance.setPid(pidList.asJava)
result.setPid(pidList.asJava)
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
index 123d8e0f8..208a1dc66 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
@@ -35,8 +35,6 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
import spark.implicits._
-
-
spark.read
.load(s"$magBasePath/mag_denormalized")
.as[MAGPaper]
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
index 639918151..11d087583 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
@@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory}
-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame {
@@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
} else
- return IOUtils.toString(response.getEntity.getContent)
+ return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
@@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
- )
+ ),
+ Charset.defaultCharset()
)
)
parser.parseArgument(args)
@@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath)
- val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
- log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
-
- val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
- val outputBasePath = cleanedMdStoreVersion.getHdfsPath
- log.info("outputBasePath: {}", outputBasePath)
+ val targetPath = parser.get("targetPath")
+ log.info("targetPath: {}", targetPath)
val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", hdfsServerUri)
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
@@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
- val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+ val inputFactory = XMLInputFactory.newInstance // created inside the closure: XMLInputFactory is not Serializable, so it must not be captured from the driver
+ val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml)
})
)
@@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf]
.filter(p => p != null),
- s"$outputBasePath/$MDSTORE_DATA_PATH"
+ targetPath
)
- val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
- val mdStoreSize = df.count
- writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
index 9102c12c4..fb941a461 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml
*/
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
index ce116688a..0a4dfc00b 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
index 3b416caf2..ebde0ed0c 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@@ -119,7 +119,9 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
- "-inputFile", "input1;input2;input3;input4;input5"
+ "-inputFile", "input1;input2;input3;input4;input5",
+ "-format",
+ "COCI"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
index 2a9e391df..2f1af2a6e 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
@@ -789,10 +789,6 @@
"value": "2227-9717",
"type": "electronic"
},
- {
- "value": "VALUE",
- "type": "PIPPO"
- },
{
"value": "1063-4584",
"type": "pu"
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
index ed43bb1a1..c3ea884eb 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary()
}
+ @Test
+ def mappingRecord(): Unit = {
+ val input =
+ IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
+ // Smoke test: the converted record is only printed; no assertions are made on it.
+ println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
+
+ }
+
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
index 3ae25decb..77812affb 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
@@ -7,13 +7,10 @@ import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
-
-
class MAGMappingTest {
val mapper = new ObjectMapper()
-
def mappingTest(): Unit = {
val spark = SparkSession
@@ -26,8 +23,6 @@ class MAGMappingTest {
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
}
-
-
@Test
def mappingMagType(): Unit = {
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index d1611300d..c4af14c40 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source
@@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testEBIData() = {
- val inputXML = Source
- .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- .mkString
- val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+ val inputFactory = XMLInputFactory.newInstance
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
@@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testParsingPubmedXML(): Unit = {
- val xml = new XMLEventReader(
- Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- )
+ val inputFactory = XMLInputFactory.newInstance
+
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
val parser = new PMParser(xml)
parser.foreach(checkPMArticle)
}
@@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testPubmedMapping(): Unit = {
- val xml = new XMLEventReader(
- Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- )
+ val inputFactory = XMLInputFactory.newInstance
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
val parser = new PMParser(xml)
val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml
index 8665ebd05..897fa1a76 100644
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@@ -53,24 +53,10 @@
dhp-pace-core
${project.version}
-
org.apache.commons
commons-lang3
-
-
- org.scala-lang.modules
- scala-java8-compat_${scala.binary.version}
- 1.0.2
-
-
-
- org.scala-lang.modules
- scala-collection-compat_${scala.binary.version}
- 2.11.0
-
-
org.apache.spark
spark-core_${scala.binary.version}
@@ -79,16 +65,10 @@
org.apache.spark
spark-sql_${scala.binary.version}
-
org.apache.spark
spark-graphx_${scala.binary.version}
-
-
- com.arakelian
- java-jq
-
dom4j
dom4j
@@ -101,10 +81,6 @@
com.fasterxml.jackson.core
jackson-databind
-
- com.fasterxml.jackson.core
- jackson-core
-
org.apache.httpcomponents
httpclient
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
index fc0e3bdb9..f73ff92ec 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3;
import scala.collection.JavaConversions;
@@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset pivotHistory = spark
.createDataset(
Collections.emptyList(),
- RowEncoder
- .apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+ SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
index e4bcf1e82..c7efce4d7 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2;
import scala.Tuple3;
@@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
StructType idsSchema = StructType
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
- Dataset allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+ Dataset allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
String entityPath = graphBasePath + '/' + entityType.name();
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
index 732471f99..61506bc60 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@@ -50,7 +50,7 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer {
if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
@@ -61,13 +61,14 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer 12)
+ s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
+ else
+ null
+ }
+
+ /** Pairs every PID with the first URL that textually contains the PID value.
+   *
+   * The match is a case-insensitive substring test. A PID whose value is `null`, or for
+   * which no URL matches, is paired with `null`; `null` entries in `urls` are ignored.
+   *
+   * @param pidValue the persistent identifiers to resolve
+   * @param urls     the candidate URLs, scanned in order
+   * @return one `(pid, url-or-null)` pair per input PID, in input order
+   */
+ def findURLForPID(
+   pidValue: List[StructuredProperty],
+   urls: List[String]
+ ): List[(StructuredProperty, String)] = {
+   // Lower-case the candidates once instead of once per (pid, url) comparison.
+   val candidates: List[(String, String)] = urls.filter(_ != null).map(u => (u, u.toLowerCase))
+
+   pidValue.map { p =>
+     val matched: Option[String] = Option(p.getValue)
+       .map(_.toLowerCase)
+       .flatMap { needle =>
+         candidates.collectFirst { case (url, lowered) if lowered.contains(needle) => url }
+       }
+     (p, matched.orNull)
+   }
+ }
+
+ /** Collects the typed persistent identifiers declared by the instances of a result.
+   *
+   * Only instances carrying both a PID list and a non-empty URL list are considered. Each
+   * PID is paired with a URL through `findURLForPID`; PIDs without a qualifier are skipped
+   * because they have no schema to expose.
+   *
+   * @param r the result whose instances are inspected
+   * @return the distinct identifiers found, or an empty list when the result has no instance
+   */
+ def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
+   Option(r.getInstance())
+     .map(_.asScala.toList)
+     .getOrElse(Nil)
+     .filter(i => i != null && i.getPid != null && i.getUrl != null && !i.getUrl.isEmpty)
+     .flatMap(i => findURLForPID(i.getPid.asScala.filter(_ != null).toList, i.getUrl.asScala.toList))
+     .collect {
+       case (pid, url) if pid.getQualifier != null =>
+         new ScholixIdentifier(pid.getValue, pid.getQualifier.getClassid, url)
+     }
+     .distinct
+ }
+
+ /** Builds the Scholix resource describing a graph result.
+   *
+   * Missing optional parts (result type, instance type, title, authors, acceptance date,
+   * hosting datasource, collectedfrom) are tolerated and simply leave the corresponding
+   * field unset or empty instead of raising an exception.
+   *
+   * @param result the result to convert
+   * @return the populated resource, or `null` when the result has no instance, no PID, or
+   *         when `extractTypedIdentifierFromInstance` yields no identifier for it
+   */
+ def generateScholixResourceFromResult(result: Result): ScholixResource = {
+
+   // Wraps a datasource (display name + OpenAIRE id) into a Scholix entity with its URL.
+   def datasourceEntity(name: String, key: String): ScholixEntityId = {
+     val id = new ScholixIdentifier()
+     id.setIdentifier(key)
+     id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+     id.setUrl(generateDatasourceOpenAIREURLS(key))
+     val entity = new ScholixEntityId()
+     entity.setName(name)
+     entity.setIdentifiers(List(id).asJava)
+     entity
+   }
+
+   val hasInstances = result.getInstance() != null && !result.getInstance().isEmpty
+   val hasPids = result.getPid != null && !result.getPid.isEmpty
+   val persistentIdentifiers: List[ScholixIdentifier] =
+     if (hasInstances && hasPids) extractTypedIdentifierFromInstance(result) else List()
+
+   if (persistentIdentifiers.isEmpty) null
+   else {
+     val instances = result.getInstance().asScala.filter(_ != null).toList
+
+     val r = new ScholixResource
+     r.setDnetIdentifier(result.getId)
+     r.setIdentifier(persistentIdentifiers.asJava)
+     r.setObjectType(Option(result.getResulttype).map(_.getClassid).orNull)
+
+     // The sub type is the class name of the first instance that declares an instance type;
+     // it stays null when no instance declares one.
+     r.setObjectSubType(
+       instances.collectFirst {
+         case i if i.getInstancetype != null => i.getInstancetype.getClassname
+       }.orNull
+     )
+
+     // Use the first title that actually carries a value.
+     Option(result.getTitle)
+       .flatMap(_.asScala.find(t => t != null && t.getValue != null))
+       .foreach(t => r.setTitle(t.getValue))
+
+     val authors: List[ScholixEntityId] = Option(result.getAuthor)
+       .map(_.asScala.toList)
+       .getOrElse(Nil)
+       .filter(_ != null)
+       .map { a =>
+         val entity = new ScholixEntityId()
+         entity.setName(a.getFullname)
+         // At most three identifiers per author; PIDs without a qualifier have no schema.
+         val pids = Option(a.getPid)
+           .map(_.asScala.toList)
+           .getOrElse(Nil)
+           .filter(sp => sp != null && sp.getQualifier != null)
+           .take(3)
+           .map { sp =>
+             val id = new ScholixIdentifier()
+             id.setIdentifier(sp.getValue)
+             id.setSchema(sp.getQualifier.getClassid)
+             id
+           }
+         if (pids.nonEmpty)
+           entity.setIdentifiers(pids.asJava)
+         entity
+       }
+     if (authors.nonEmpty)
+       r.setCreator(authors.asJava)
+
+     // The publication date is the acceptance date of the first instance that has one.
+     instances
+       .collectFirst { case i if i.getDateofacceptance != null => i.getDateofacceptance.getValue }
+       .foreach(d => r.setPublicationDate(d))
+
+     // Publishers are the hosting datasources, excluding the "unknown" placeholder.
+     r.setPublisher(
+       instances
+         .map(_.getHostedby)
+         .filter(h => h != null && !"unknown".equalsIgnoreCase(h.getValue))
+         .map(h => datasourceEntity(h.getValue, h.getKey))
+         .distinct
+         .asJava
+     )
+
+     r.setCollectedFrom(
+       Option(result.getCollectedfrom)
+         .map(_.asScala.toList)
+         .getOrElse(Nil)
+         .filter(_ != null)
+         .map { cf =>
+           val scf = new ScholixCollectedFrom()
+           scf.setProvisionMode("collected")
+           scf.setCompletionStatus("complete")
+           scf.setProvider(datasourceEntity(cf.getValue, cf.getKey))
+           scf
+         }
+         .asJava
+     )
+
+     r
+   }
+ }
+
+ /** Creates a Scholix link for a relation whose source resource is already resolved.
+   *
+   * The target is only a placeholder carrying the target identifier; the full target
+   * resource is expected to be attached later (see `updateTarget`).
+   *
+   * @param relation the relation to convert
+   * @param source   the Scholix resource of the relation source
+   * @return the link, or `null` when the relation class is missing or is not a key of
+   *         `relations`
+   */
+ def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
+
+   // Wraps a datasource (display name + OpenAIRE id) into a Scholix link provider.
+   def linkProvider(name: String, key: String): ScholixEntityId = {
+     val id = new ScholixIdentifier()
+     id.setIdentifier(key)
+     id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+     id.setUrl(generateDatasourceOpenAIREURLS(key))
+     val entity = new ScholixEntityId()
+     entity.setName(name)
+     entity.setIdentifiers(List(id).asJava)
+     entity
+   }
+
+   // Resolve the relation semantics first: nothing else is worth building without it.
+   val semanticRelation =
+     if (relation.relclass == null) null
+     else relations.getOrElse(relation.relclass.toLowerCase, null)
+
+   if (semanticRelation == null) null
+   else {
+     val providers: List[ScholixEntityId] =
+       if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
+         relation.collectedfrom.map(cf => linkProvider(cf.value, cf.key)).toList
+       else
+         // Relations without provenance are attributed to OpenAIRE itself.
+         List(linkProvider("OpenAIRE", "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"))
+
+     val s: Scholix = new Scholix
+     s.setSource(source)
+     s.setLinkprovider(providers.asJava)
+     s.setIdentifier(relation.id)
+     s.setRelationship(
+       new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+     )
+     s.setPublicationDate(source.getPublicationDate)
+     s.setPublisher(source.getPublisher)
+
+     val mockTarget = new ScholixResource
+     mockTarget.setDnetIdentifier(relation.target)
+     s.setTarget(mockTarget)
+     s
+   }
+ }
+
+ /** Attaches the resolved target to a Scholix link and serializes the link.
+   *
+   * The link publishers become the distinct concatenation of the publishers already on the
+   * link followed by those of the target, capped at 10 entries.
+   *
+   * @param s the link to complete; it is modified in place
+   * @param t the resolved target resource
+   * @return the string produced by `mapper.writeValueAsString` for the completed link
+   */
+ def updateTarget(s: Scholix, t: ScholixResource): String = {
+
+   s.setTarget(t)
+   // Copy into immutable Lists: the converted Java lists are mutable Buffers, which are not
+   // a `Seq` on Scala 2.13, where `union` is deprecated as well.
+   val sourcePublishers: List[ScholixEntityId] =
+     Option(s.getPublisher).map(_.asScala.toList).getOrElse(Nil)
+   val targetPublishers: List[ScholixEntityId] =
+     Option(t.getPublisher).map(_.asScala.toList).getOrElse(Nil)
+   val mergedPublishers = (sourcePublishers ++ targetPublishers).distinct.take(10)
+   s.setPublisher(mergedPublishers.asJava)
+   mapper.writeValueAsString(s)
+ }
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
new file mode 100644
index 000000000..dd420ab95
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
@@ -0,0 +1,141 @@
+package eu.dnetlib.dhp.sx.graph
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.schema.oaf.{
+ KeyValue,
+ OtherResearchProduct,
+ Publication,
+ Relation,
+ Result,
+ Software,
+ Dataset => OafDataset
+}
+import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
+import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
+ extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+ /** Entry point of the job: reads the `sourcePath` and `targetPath` arguments, logs them,
+   * then runs the three phases in order (bidirectional relations, Scholix resources,
+   * Scholix links).
+   */
+ override def run(): Unit = {
+   val graphPath = parser.get("sourcePath")
+   log.info("sourcePath: {}", graphPath)
+
+   val dumpPath = parser.get("targetPath")
+   log.info("targetPath: {}", dumpPath)
+
+   // Phase order matters: the last phase reads what the first two wrote under dumpPath.
+   generateBidirectionalRelations(graphPath, dumpPath, spark)
+   generateScholixResource(graphPath, dumpPath, spark)
+   generateScholix(dumpPath, spark)
+ }
+
+ /** Converts every result of the graph into a Scholix resource.
+   *
+   * For each result type the JSON files under `inputPath/<type>` are read with the schema
+   * of the matching bean, mapped through
+   * `ScholexplorerUtils.generateScholixResourceFromResult`, and the non-null outcomes are
+   * unioned and saved under `outputPath/resource`, overwriting previous content.
+   *
+   * @param inputPath  base path of the graph
+   * @param outputPath base path of the dump
+   * @param spark      the active session
+   */
+ def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
+   implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+   implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
+
+   val schemaByEntity: Map[String, StructType] = Map(
+     "publication"          -> Encoders.bean(classOf[Publication]).schema,
+     "dataset"              -> Encoders.bean(classOf[OafDataset]).schema,
+     "software"             -> Encoders.bean(classOf[Software]).schema,
+     "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
+   )
+
+   // Reads one entity folder and keeps only the results that could be converted.
+   def loadResources(entity: String, schema: StructType): Dataset[ScholixResource] =
+     spark.read
+       .schema(schema)
+       .json(s"$inputPath/$entity")
+       .as[Result]
+       .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
+       .filter(s => s != null)
+
+   val allResources = schemaByEntity.foldLeft(spark.emptyDataset[ScholixResource]) {
+     case (acc, (entity, schema)) =>
+       println(s"adding $entity")
+       acc.union(loadResources(entity, schema))
+   }
+
+   allResources.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
+ }
+
+ /** Materializes every result-to-result relation in both directions.
+   *
+   * Relations are read from `inputPath/relation`, keeping those not deleted by inference,
+   * whose source and target ids start with `50` and whose class is neither `merges` nor
+   * `isMergedIn`. Each one is paired with its inverse (class computed by
+   * `ScholexplorerUtils.invRel`), keyed by the md5 of source + relClass + target, reduced to
+   * one row per key and saved under `otuputPath/relation`, overwriting previous content.
+   *
+   * @param inputPath  base path of the graph
+   * @param otuputPath base path of the dump (the parameter name is kept as is for callers)
+   * @param spark      the active session
+   */
+ def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
+   import org.apache.spark.sql.functions.udf
+
+   val directRelations = spark.read
+     .schema(Encoders.bean(classOf[Relation]).schema)
+     .json(s"$inputPath/relation")
+     .where(
+       "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
+       "and relClass <> 'merges' and relClass <> 'isMergedIn'"
+     )
+     .select("source", "target", "collectedfrom", "relClass")
+
+   val toInverseRelClass = udf[String, String](s => ScholexplorerUtils.invRel(s))
+
+   // Same rows with the endpoints swapped and the relation class inverted.
+   val inverseRelations = directRelations.select(
+     col("target").as("source"),
+     col("source").as("target"),
+     col("collectedfrom"),
+     toInverseRelClass(col("relClass")).as("relClass")
+   )
+
+   val relationKey = md5(concat(col("source"), col("relClass"), col("target")))
+   // Keep only key and value of each collectedfrom entry.
+   val slimCollectedFrom = expr("transform(collectedfrom, x -> struct(x.key, x.value))")
+
+   inverseRelations
+     .union(directRelations)
+     .withColumn("id", relationKey)
+     .withColumn("collectedfrom", slimCollectedFrom)
+     .groupBy(col("id"))
+     .agg(
+       first("source").as("source"),
+       first("target").as("target"),
+       first("relClass").as("relClass"),
+       first("collectedfrom").as("collectedfrom")
+     )
+     .write
+     .mode(SaveMode.Overwrite)
+     .save(s"$otuputPath/relation")
+ }
+
+ /** Joins relations and resources into complete Scholix links.
+   *
+   * Reads `outputPath/relation` and `outputPath/resource`, resolves the source of every
+   * relation, then its target, and writes one line per link (the string returned by
+   * `ScholexplorerUtils.updateTarget`) as gzip-compressed text under `outputPath/scholix`,
+   * overwriting previous content. Relations whose source or target has no resource are
+   * dropped by the inner joins.
+   *
+   * @param outputPath base path of the dump, holding the `relation` and `resource` folders
+   * @param spark      the active session
+   */
+ def generateScholix(outputPath: String, spark: SparkSession): Unit = {
+   implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+   implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
+
+   import spark.implicits._
+   val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
+   val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
+
+   val scholix_one_verse = relations
+     .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
+     .map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
+     // generateScholix returns null for relation classes it cannot map: drop those rows,
+     // otherwise the next step fails with a NullPointerException on s.getIdentifier.
+     .filter(s => s != null)
+     .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
+
+   val resourceTarget = relations
+     .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
+     .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
+
+   scholix_one_verse
+     .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
+     .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
+     .write
+     .mode(SaveMode.Overwrite)
+     .option("compression", "gzip")
+     .text(s"$outputPath/scholix")
+ }
+}
+
+/** Launcher of the Scholexplorer dump job. */
+object SparkCreateScholexplorerDump {
+  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
+
+  /** Builds the job with the bundled parameter definition, initializes it and runs it.
+    *
+    * @param args the command-line arguments handed to the job
+    */
+  def main(args: Array[String]): Unit = {
+    val job = new SparkCreateScholexplorerDump(
+      "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json",
+      args,
+      logger
+    )
+    job.initialize().run()
+  }
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
new file mode 100644
index 000000000..204fe9794
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
@@ -0,0 +1,26 @@
+package eu.dnetlib.dhp.sx.graph.scholix
+
+import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
+import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.junit.jupiter.api.Test
+import org.objenesis.strategy.StdInstantiatorStrategy
+
+class ScholixGenerationTest {
+
+  /** Manual smoke test of the Scholix generation phase.
+    *
+    * It reads a dump from an absolute path on a developer workstation, so it cannot pass
+    * anywhere else and is disabled; point the path at a local sample before enabling it.
+    */
+  @org.junit.jupiter.api.Disabled("needs a locally generated dump under /home/sandro/Downloads/scholix/")
+  @Test
+  def generateScholix(): Unit = {
+
+    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
+    // Only the path-based phase methods are exercised, so no arguments or logger are given.
+    val app = new SparkCreateScholexplorerDump(null, null, null)
+//    app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
+//    app.generateBidirectionalRelations(
+//      "/home/sandro/Downloads/scholix_sample/",
+//      "/home/sandro/Downloads/scholix/",
+//      spark
+//    )
+    app.generateScholix("/home/sandro/Downloads/scholix/", spark)
+
+  }
+}
diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml
index e62fcdf19..4b4e6c1c4 100644
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@@ -18,7 +18,7 @@
scala-compile-first
- initialize
+ process-resources
add-source
compile
@@ -59,12 +59,6 @@
com.jayway.jsonpath
json-path
-
-
- org.slf4j
- slf4j-api
-
-
dom4j
@@ -160,6 +154,26 @@
org.apache.zookeeper
zookeeper
+
+ ant
+ org.apache.ant
+
+
+ antlr4-runtime
+ org.antlr
+
+
+ woodstox-core
+ com.fasterxml.woodstox
+
+
+ log4j
+ *
+
+
+ org.apache.logging.log4j
+ *
+
@@ -206,5 +220,90 @@
+
+
+ spark-24
+
+ true
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-34
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-35
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
index d49a0596b..78154e0ab 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
+import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD()
.map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
- SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+ DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
}
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
index befebe0bb..e1d19b66f 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
@@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
-import javax.swing.text.html.Option;
-
import org.apache.commons.lang3.StringUtils;
import org.stringtemplate.v4.ST;
diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
new file mode 100644
index 000000000..295f0f54d
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
@@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.sparksolr;
+
+import com.lucidworks.spark.util.SolrSupport;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.spark.rdd.RDD;
+
+/**
+ * Version-specific adapter (sparksolr-3 source root) exposing a single indexing entry point,
+ * so that callers do not depend on the exact {@code SolrSupport.indexDocs} signature.
+ */
+public class DHPSolrSupport {
+
+	/**
+	 * Indexes the given documents by delegating to {@code SolrSupport.indexDocs}.
+	 *
+	 * @param zkhost     the ZooKeeper host passed through to SolrSupport
+	 * @param collection the name of the target collection
+	 * @param batchSize  the batch size passed through to SolrSupport
+	 * @param docs       the documents to index
+	 */
+	public static void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
+		SolrSupport.indexDocs(zkhost, collection, batchSize, docs);
+	}
+}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
new file mode 100644
index 000000000..6b85176a3
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
@@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.sparksolr;
+
+import com.lucidworks.spark.util.SolrSupport;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.spark.rdd.RDD;
+
+/**
+ * Version-specific adapter (sparksolr-4 source root) exposing a single indexing entry point,
+ * so that callers do not depend on the exact {@code SolrSupport.indexDocs} signature.
+ */
+public class DHPSolrSupport {
+
+	/**
+	 * Indexes the given documents by delegating to {@code SolrSupport.indexDocs}, with the
+	 * batch size expressed as a number of documents ({@code BatchSizeType.NUM_DOCS}).
+	 *
+	 * @param zkhost     the ZooKeeper host passed through to SolrSupport
+	 * @param collection the name of the target collection
+	 * @param batchSize  the number of documents per batch
+	 * @param docs       the documents to index
+	 */
+	public static void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
+		SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs);
+	}
+}
diff --git a/dhp-workflows/dhp-stats-actionsets/pom.xml b/dhp-workflows/dhp-stats-actionsets/pom.xml
index 3daa8f995..499c598f0 100644
--- a/dhp-workflows/dhp-stats-actionsets/pom.xml
+++ b/dhp-workflows/dhp-stats-actionsets/pom.xml
@@ -16,11 +16,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-hist-snaps/pom.xml b/dhp-workflows/dhp-stats-hist-snaps/pom.xml
index b31d909f9..8961f919a 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/pom.xml
+++ b/dhp-workflows/dhp-stats-hist-snaps/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
index 059fb9089..26760d650 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,8 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+# Prints the time elapsed since a start timestamp, formatted as HH:MM:SS.
+# $1: the start time in epoch seconds (as given by "date +%s"), or the name of a variable
+#     holding it (the value is resolved by the arithmetic expansion below).
+# All working variables are local, so the caller's globals are not overwritten.
+function print_elapsed_time()
+{
+  local start_time=$1
+  local end_time elapsed_time hours minutes seconds
+  end_time=$(date +%s)
+  elapsed_time=$(($end_time-$start_time))
+  hours=$((elapsed_time / 3600))
+  minutes=$(((elapsed_time % 3600) / 60))
+  seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -67,7 +70,9 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -91,7 +96,9 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -109,17 +116,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -127,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 4
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
fi
fi
fi
@@ -176,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -204,11 +214,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-monitor-irish/pom.xml b/dhp-workflows/dhp-stats-monitor-irish/pom.xml
index 6ab19dced..600632364 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-irish/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
index 1130a684d..26760d650 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+# Prints the time elapsed since a start timestamp, formatted as HH:MM:SS.
+# $1: the start time in epoch seconds (as given by "date +%s"), or the name of a variable
+#     holding it (the value is resolved by the arithmetic expansion below).
+# All working variables are local, so the caller's globals are not overwritten.
+function print_elapsed_time()
+{
+  local start_time=$1
+  local end_time elapsed_time hours minutes seconds
+  end_time=$(date +%s)
+  elapsed_time=$(($end_time-$start_time))
+  hours=$((elapsed_time / 3600))
+  minutes=$(((elapsed_time % 3600) / 60))
+  seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -66,7 +70,9 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -90,7 +96,9 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -108,17 +116,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 4
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
fi
fi
fi
@@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -203,11 +214,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-monitor-update/pom.xml b/dhp-workflows/dhp-stats-monitor-update/pom.xml
index f2bc35f8d..86d5135fa 100644
--- a/dhp-workflows/dhp-stats-monitor-update/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-update/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
index de275145b..1ab3e417a 100644
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+function print_elapsed_time() # Print the wall-clock time elapsed since a start timestamp, formatted as HH:MM:SS.
+{ # $1: the start time in epoch-seconds, given as a number or as the NAME of the variable holding it (copydb passes the name "start_db_time").
+ local start_time=$1 # All working variables are "local", so a call neither creates nor overwrites same-named global variables of the script.
+ local end_time=$(date +%s)
+ local elapsed_time=$(($end_time-$start_time)) # If "start_time" holds a variable-name, the arithmetic evaluation resolves it to that variable's value.
+ local hours=$((elapsed_time / 3600))
+ local minutes=$(((elapsed_time % 3600) / 60))
+ local seconds=$((elapsed_time % 60))
+ printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -66,7 +70,9 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -90,7 +96,9 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -108,17 +116,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 4
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
fi
fi
fi
@@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -203,11 +214,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index 6fc0aa745..7957a659c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -6,6 +6,8 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
-
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
+function print_elapsed_time() # Print the wall-clock time elapsed since a start timestamp, formatted as HH:MM:SS.
+{ # $1: the start time in epoch-seconds, given as a number or as the NAME of the variable holding it (copydb passes the name "start_db_time").
+ local start_time=$1 # All working variables are "local", so a call neither creates nor overwrites same-named global variables of the script.
+ local end_time=$(date +%s)
+ local elapsed_time=$(($end_time-$start_time)) # If "start_time" holds a variable-name, the arithmetic evaluation resolves it to that variable's value.
+ local hours=$((elapsed_time / 3600))
+ local minutes=$(((elapsed_time % 3600) / 60))
+ local seconds=$((elapsed_time % 60))
+ printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
+
+
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -68,7 +72,9 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -92,7 +98,9 @@ function copydb() {
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@@ -110,17 +118,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -128,12 +132,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 4
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
fi
fi
fi
@@ -177,7 +186,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -205,11 +216,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
STATS_DB=$1
@@ -233,6 +247,6 @@ copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
- tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
+ tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp}
done
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
index eb16a161e..c0993ef0b 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
@@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
- lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
-select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
+ lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
+ lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
+select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
- join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
+ join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
+ join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
+
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
index 2691d4b7e..230a077f7 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
@@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
@@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
.map(
(MapFunction) t -> OBJECT_MAPPER.readValue(t, Software.class),
Encoders.bean(Software.class))
- .filter(t -> t.getCodeRepositoryUrl() != null)
+ .filter((FilterFunction) t -> t.getCodeRepositoryUrl() != null)
.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
index a9dbb09ae..8ce9826e2 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml
+++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
@@ -39,8 +39,8 @@
UTF-8
UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
+ 1.1.0-cdh5.16.2
+ 2.6.0-cdh5.16.2
@@ -72,7 +72,13 @@
org.apache.hadoop
hadoop-common
${cdh.hadoop.version}
-
+
+
+ jdk.tools
+ jdk.tools
+
+
+
eu.dnetlib.dhp
dhp-common
diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml
index 56aec73b7..4dd987f51 100644
--- a/dhp-workflows/dhp-usage-stats-build/pom.xml
+++ b/dhp-workflows/dhp-usage-stats-build/pom.xml
@@ -39,8 +39,8 @@
UTF-8
UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
+ 1.1.0-cdh5.16.2
+ 2.6.0-cdh5.16.2
@@ -67,11 +67,23 @@
org.apache.hive
hive-jdbc
${cdh.hive.version}
-
+
+
+ jdk.tools
+ jdk.tools
+
+
+
org.apache.hadoop
hadoop-common
${cdh.hadoop.version}
+
+
+ jdk.tools
+ jdk.tools
+
+
eu.dnetlib.dhp
diff --git a/pom.xml b/pom.xml
index bd19bda49..cc8d509f7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,955 +1,1107 @@
- 4.0.0
- eu.dnetlib.dhp
- dhp
- 1.2.5-SNAPSHOT
- pom
-
-
-
- GNU Affero General Public License v3.0 or later
- https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText
- repo
- This program is free software: you can redistribute it and/or modify it under the terms of the
- GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
-
-
-
-
- dhp-build
- dhp-pace-core
- dhp-common
- dhp-workflows
-
-
-
- Redmine
- https://support.openaire.eu/projects/openaire
-
-
-
- jenkins
- https://jenkins-dnet.d4science.org/
-
-
-
- scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
- scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
- https://code-repo.d4science.org/D-Net/dnet-hadoop/
- HEAD
-
-
- This module is the root descriptor for the dnet-hadoop project
-
-
-
-
-
-
- dnet45-releases
- D-Net 45 releases
- https://maven.d4science.org/nexus/content/repositories/dnet45-releases
- default
-
- false
-
-
- true
-
-
-
- dnet45-snapshots
- D-Net 45 snapshots
- https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
- default
-
- true
-
-
- false
-
-
-
- dnet45-bootstrap-snapshot
- D-Net 45 Bootstrap Snapshot
- https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/
-
- false
-
-
- true
-
- default
-
-
- dnet45-bootstrap-release
- D-Net 45 Bootstrap Release
- https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/
-
- true
-
-
- false
-
- default
-
-
- cloudera
- Cloudera Repository
- https://repository.cloudera.com/artifactory/cloudera-repos
-
- true
-
-
- false
-
-
-
- dnet-deps
- dnet-dependencies
- https://maven.d4science.org/nexus/content/repositories/dnet-deps
- default
-
-
- maven-restlet
- Restlet repository
- https://maven.restlet.talend.com
-
-
- conjars
- conjars
- https://conjars.wensel.net/repo/
-
-
-
-
-
- org.junit.jupiter
- junit-jupiter
- ${junit-jupiter.version}
- test
-
-
-
- org.mockito
- mockito-core
- ${mockito-core.version}
- test
-
-
-
- org.mockito
- mockito-junit-jupiter
- ${mockito-core.version}
- test
-
-
-
-
-
-
-
- eu.dnetlib.dhp
- ${dhp-schemas.artifact}
- ${dhp-schemas.version}
-
-
- org.apache.hadoop
- hadoop-hdfs
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-common
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-client
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-distcp
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.spark
- spark-core_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-sql_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-graphx_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-hive_${scala.binary.version}
- ${dhp.spark.version}
- test
-
-
-
- org.slf4j
- jcl-over-slf4j
- 1.7.25
- provided
-
-
-
- org.apache.commons
- commons-lang3
- ${dhp.commons.lang.version}
-
-
-
- commons-validator
- commons-validator
- 1.7
-
-
-
- com.github.sisyphsu
- dateparser
- 1.0.7
-
-
-
- me.xuender
- unidecode
- 0.0.7
-
-
-
- com.google.guava
- guava
- ${dhp.guava.version}
-
-
-
-
- commons-codec
- commons-codec
- 1.9
-
-
-
- commons-io
- commons-io
- 2.4
-
-
-
- commons-cli
- commons-cli
- 1.2
- provided
-
-
-
- net.sf.saxon
- Saxon-HE
- 9.9.1-6
-
-
-
- dom4j
- dom4j
- 1.6.1
-
-
-
- xml-apis
- xml-apis
- 1.4.01
-
-
-
- jaxen
- jaxen
- 1.1.6
-
-
-
- com.mycila.xmltool
- xmltool
- 3.3
-
-
-
- org.apache.solr
- solr-solrj
- ${solr.version}
-
-
- *
- *
-
-
-
-
- com.lucidworks.spark
- spark-solr
- ${sparksolr.version}
-
-
- *
- *
-
-
-
-
- org.apache.solr
- solr-test-framework
- ${solr.version}
- test
-
-
- io.dropwizard.metrics
- metrics-core
- 3.2.6
- test
-
-
-
-
- org.apache.httpcomponents
- httpclient
- ${org.apache.httpcomponents.version}
-
-
- org.apache.httpcomponents
- httpmime
- ${org.apache.httpcomponents.version}
-
-
- org.noggit
- noggit
- 0.8
-
-
- org.apache.zookeeper
- zookeeper
- 3.4.11
-
-
-
- net.schmizz
- sshj
- 0.10.0
- test
-
-
-
- com.fasterxml.jackson.core
- jackson-core
- ${dhp.jackson.version}
- provided
-
-
-
- com.fasterxml.jackson.core
- jackson-annotations
- ${dhp.jackson.version}
- provided
-
-
- com.fasterxml.jackson.core
- jackson-databind
- ${dhp.jackson.version}
- provided
-
-
-
- eu.dnetlib
- dnet-actionmanager-common
- ${dnet-actionmanager-common.version}
-
-
- org.apache.hadoop
- hadoop-common
-
-
-
-
- eu.dnetlib
- dnet-actionmanager-api
- ${dnet-actionmanager-api.version}
-
-
- eu.dnetlib
- cnr-misc-utils
-
-
-
-
-
- eu.dnetlib
- cnr-rmi-api
- ${cnr-rmi-api.version}
-
-
-
- eu.dnetlib.dhp
- dnet-openaire-broker-common
- ${dnet-openaire-broker-common.version}
-
-
-
- org.apache.cxf
- cxf-rt-transports-http
- 3.1.5
-
-
- javax.persistence
- javax.persistence-api
- 2.2
- provided
-
-
-
- com.jayway.jsonpath
- json-path
- 2.4.0
-
-
- com.arakelian
- java-jq
- 0.10.1
-
-
- edu.cmu
- secondstring
- 1.0.0
-
-
- org.mongodb
- mongo-java-driver
- ${mongodb.driver.version}
-
-
- io.fares.junit.mongodb
- mongodb-junit-test
- 1.1.0
-
-
- org.postgresql
- postgresql
- 42.2.10
-
-
-
- org.antlr
- stringtemplate
- 3.2.1
-
-
-
- org.antlr
- ST4
- 4.3.4
-
-
-
- com.ximpleware
- vtd-xml
- ${vtd.version}
-
-
-
- org.elasticsearch
- elasticsearch-hadoop
- 7.6.0
-
-
-
-
- org.apache.oozie
- oozie-client
- ${dhp.oozie.version}
- provided
-
-
-
- slf4j-simple
- org.slf4j
-
-
-
-
-
-
- com.squareup.okhttp3
- okhttp
- ${okhttp.version}
-
-
-
- org.apache.commons
- commons-compress
- ${common.compress.version}
-
-
-
-
- org.apache.commons
- commons-csv
- ${common.csv.version}
-
-
-
-
-
- org.apache.poi
- poi-ooxml
- ${apache.poi.version}
-
-
-
- org.json
- json
- 20180813
-
-
-
- org.json4s
- json4s-jackson_${scala.binary.version}
- ${json4s.version}
-
-
-
- com.github.victools
- jsonschema-generator
- ${jsonschemagenerator.version}
-
-
-
- org.apache.commons
- commons-text
- ${common.text.version}
-
-
-
- com.opencsv
- opencsv
- 5.5
-
-
- io.github.classgraph
- classgraph
- 4.8.71
-
-
-
- com.fasterxml.jackson.dataformat
- jackson-dataformat-xml
- ${jackson.version}
- provided
-
-
- com.fasterxml.jackson.module
- jackson-module-jsonSchema
- ${jackson.version}
- provided
-
-
-
-
- org.apache.commons
- commons-math3
- 3.6.1
-
-
-
-
- com.google.code.gson
- gson
- ${google.gson.version}
-
-
-
- commons-collections
- commons-collections
- ${commons.collections.version}
-
-
- commons-logging
- commons-logging
- ${commons.logging.version}
-
-
-
- org.reflections
- reflections
- 0.9.10
-
-
-
- org.scala-lang
- scala-library
- ${scala.version}
-
-
-
- com.ibm.icu
- icu4j
- 70.1
-
-
-
-
-
- target
- target/classes
- ${project.artifactId}-${project.version}
- target/test-classes
-
-
-
- org.apache.maven.plugins
- maven-plugin-plugin
- 3.3
-
-
- org.apache.maven.plugins
- maven-project-info-reports-plugin
- 3.0.0
-
-
- org.apache.maven.plugins
- maven-site-plugin
- 3.9.1
-
- ${dhp.site.skip}
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- ${maven.compiler.plugin.version}
-
-
- 1.8
- ${project.build.sourceEncoding}
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 3.0.2
-
-
-
- org.apache.maven.plugins
- maven-source-plugin
- 3.0.1
-
-
- attach-sources
- verify
-
- jar-no-fork
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M4
-
- true
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
-
- true
- none
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
- 3.6.0
-
-
-
- net.revelc.code.formatter
- formatter-maven-plugin
- 2.11.0
-
-
- eu.dnetlib.dhp
- dhp-code-style
- ${project.version}
-
-
-
-
- org.antipathy
- mvn-scalafmt_${scala.binary.version}
- 1.0.1640073709.733712b
-
-
- eu.dnetlib.dhp
- dhp-code-style
- ${project.version}
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-site-plugin
-
-
- org.apache.maven.plugins
- maven-project-info-reports-plugin
-
-
- net.revelc.code.formatter
- formatter-maven-plugin
-
-
-
- format
-
-
- eclipse/formatter_dnet.xml
-
-
-
-
-
- net.revelc.code
- impsort-maven-plugin
- 1.4.1
-
- java.,javax.,org.,com.
- java,*
-
- **/thrift/*.java
-
-
-
-
- sort-imports
-
- sort
-
-
-
-
-
- org.antipathy
- mvn-scalafmt_${scala.binary.version}
-
- https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
- false
- false
-
- ${project.basedir}/src/main/scala
-
-
- ${project.basedir}/src/test/scala
-
- false
- false
- : git rev-parse --abbrev-ref HEAD
- false
-
-
-
- validate
-
- format
-
-
-
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 2.5.3
-
-
- org.jacoco
- jacoco-maven-plugin
- 0.7.9
-
-
- **/schemas/*
- **/com/cloudera/**/*
- **/org/apache/avro/io/**/*
-
-
-
-
- default-prepare-agent
-
- prepare-agent
-
-
-
- default-report
- prepare-package
-
- report
-
-
-
-
-
-
-
-
-
- org.apache.maven.wagon
- wagon-ssh
- 2.10
-
-
-
-
-
- dnet45-snapshots
- DNet45 Snapshots
- https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
- default
-
-
- dnet45-releases
- https://maven.d4science.org/nexus/content/repositories/dnet45-releases
-
-
- DHPSite
- ${dhp.site.stage.path}/
-
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
-
- true
- none
-
-
-
-
-
-
- sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop
- UTF-8
- UTF-8
- 3.6.0
- 1.8
- 1.8
- 2.22.2
- 2.0.1
- cdh5.9.2
- 2.6.0-${dhp.cdh.version}
- 4.1.0-${dhp.cdh.version}
- dhp-schemas
- 3.6.0
- 2.4.0.cloudera2
- 2.9.6
- 3.5
- true
- 11.0.2
- 2.11.12
- 2.11
- 1.3.0
- 5.6.1
- 3.3.3
- 3.4.2
- [2.12,3.0)
- [6.1.2]
- [4.0.3]
- [6.0.5]
- [3.1.6]
- [2.6.1]
- 7.5.0
- 4.7.2
- 1.20
- 3.5.3
- 4.13.0
- 1.8
- 4.1.2
- 1.8
- 4.5.3
- 4.0.1
- 2.2.2
- 1.1.3
- 3.2.1
-
-
-
-
-
- scala-2.12
-
- 2.12
- 2.12.18
-
-
-
- 4.0.2
- 3.4.1
- 2.14.2
- 3.12.0
- 3.7.0-M11
- 4.8.1
-
-
-
-
-
-
-
- arm-silicon-mac
-
-
- aarch64
- mac
-
-
-
-
-
- org.xerial.snappy
- snappy-java
- 1.1.8.4
-
-
-
-
+ 4.0.0
+ eu.dnetlib.dhp
+ dhp
+ 1.2.5-SNAPSHOT
+ pom
+
+
+
+ GNU Affero General Public License v3.0 or later
+ https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText
+ repo
+ This program is free software: you can redistribute it and/or modify it under the terms of the
+ GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+
+
+
+
+ dhp-build
+ dhp-pace-core
+ dhp-common
+ dhp-workflows
+ dhp-shade-package
+
+
+
+ Redmine
+ https://support.openaire.eu/projects/openaire
+
+
+
+ jenkins
+ https://jenkins-dnet.d4science.org/
+
+
+
+ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
+ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
+ https://code-repo.d4science.org/D-Net/dnet-hadoop/
+ HEAD
+
+
+ This module is the root descriptor for the dnet-hadoop project
+
+
+
+
+
+
+
+ Openaire-third-parties-snaphot
+ Openaire third parties Snapshot
+ https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/
+
+ false
+
+
+ true
+
+
+
+
+ dnet45-releases
+ D-Net 45 releases
+ https://maven.d4science.org/nexus/content/repositories/dnet45-releases
+ default
+
+ false
+
+
+ true
+
+
+
+ dnet45-snapshots
+ D-Net 45 snapshots
+ https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
+ default
+
+ true
+
+
+ false
+
+
+
+ dnet45-bootstrap-snapshot
+ D-Net 45 Bootstrap Snapshot
+ https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/
+
+ false
+
+
+ true
+
+ default
+
+
+ dnet45-bootstrap-release
+ D-Net 45 Bootstrap Release
+ https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/
+
+ true
+
+
+ false
+
+ default
+
+
+ cloudera
+ Cloudera Repository
+ https://repository.cloudera.com/artifactory/cloudera-repos
+
+ true
+
+
+ false
+
+
+
+ dnet-deps
+ dnet-dependencies
+ https://maven.d4science.org/nexus/content/repositories/dnet-deps
+ default
+
+
+ maven-restlet
+ Restlet repository
+ https://maven.restlet.talend.com
+
+
+ conjars
+ conjars
+ https://conjars.wensel.net/repo/
+
+
+
+
+
+
+ org.projectlombok
+ lombok
+ 1.18.28
+ provided
+
+
+ org.junit.jupiter
+ junit-jupiter
+ ${junit-jupiter.version}
+ test
+
+
+
+ org.mockito
+ mockito-core
+ ${mockito-core.version}
+ test
+
+
+
+ org.mockito
+ mockito-junit-jupiter
+ ${mockito-core.version}
+ test
+
+
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${dhp-schemas.version}
+
+
+ org.apache.hadoop
+ hadoop-hdfs
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-client
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-distcp
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-graphx_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-hive_${scala.binary.version}
+ ${dhp.spark.version}
+ test
+
+
+
+ org.slf4j
+ slf4j-api
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.slf4j
+ slf4j-log4j12
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.slf4j
+ jcl-over-slf4j
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.apache.logging.log4j
+ log4j-slf4j2-impl
+ ${log4j.version}
+
+
+ org.apache.logging.log4j
+ log4j-api
+ ${log4j.version}
+
+
+ org.apache.logging.log4j
+ log4j-core
+ ${log4j.version}
+
+
+
+ org.apache.logging.log4j
+ log4j-1.2-api
+ ${log4j.version}
+
+
+
+ org.apache.commons
+ commons-lang3
+ ${dhp.commons.lang.version}
+
+
+
+ org.apache.commons
+ commons-beanutils
+ ${commons-beanutils.version}
+
+
+
+
+ commons-validator
+ commons-validator
+ ${commons-validator.version}
+
+
+
+ com.github.sisyphsu
+ dateparser
+ ${dateparser.version}
+
+
+
+ me.xuender
+ unidecode
+ ${unidecode.version}
+
+
+
+ com.google.guava
+ guava
+ ${dhp.guava.version}
+
+
+
+
+ commons-codec
+ commons-codec
+ ${commons-codec.version}
+
+
+
+ commons-io
+ commons-io
+ ${commons-io.version}
+
+
+
+ commons-cli
+ commons-cli
+ 1.2
+ provided
+
+
+
+ net.sf.saxon
+ Saxon-HE
+ 9.9.1-6
+
+
+
+ dom4j
+ dom4j
+ 1.6.1
+
+
+
+ xml-apis
+ xml-apis
+ 1.4.01
+
+
+
+ jaxen
+ jaxen
+ 1.1.6
+
+
+
+ com.mycila.xmltool
+ xmltool
+ 3.3
+
+
+
+ org.apache.solr
+ solr-solrj
+ ${solr.version}
+
+
+ *
+ *
+
+
+
+
+ com.lucidworks.spark
+ spark-solr
+ ${sparksolr.version}
+
+
+ *
+ *
+
+
+
+
+ org.apache.solr
+ solr-test-framework
+ ${solr.version}
+ test
+
+
+ io.dropwizard.metrics
+ metrics-core
+ 3.2.6
+ test
+
+
+
+
+ org.apache.httpcomponents
+ httpclient
+ ${org.apache.httpcomponents.version}
+
+
+ org.apache.httpcomponents
+ httpmime
+ ${org.apache.httpcomponents.version}
+
+
+ org.noggit
+ noggit
+ 0.8
+
+
+ org.apache.zookeeper
+ zookeeper
+ ${zookeeper.version}
+
+
+
+ net.schmizz
+ sshj
+ 0.10.0
+ test
+
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ ${dhp.jackson.version}
+ provided
+
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ ${dhp.jackson.version}
+ provided
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${dhp.jackson.version}
+ provided
+
+
+
+ eu.dnetlib
+ dnet-actionmanager-common
+ ${dnet-actionmanager-common.version}
+
+
+ org.apache.hadoop
+ hadoop-common
+
+
+
+
+ eu.dnetlib
+ dnet-actionmanager-api
+ ${dnet-actionmanager-api.version}
+
+
+ eu.dnetlib
+ cnr-misc-utils
+
+
+
+
+
+ eu.dnetlib
+ cnr-rmi-api
+ ${cnr-rmi-api.version}
+
+
+
+ eu.dnetlib.dhp
+ dnet-openaire-broker-common
+ ${dnet-openaire-broker-common.version}
+
+
+
+ org.apache.cxf
+ cxf-rt-transports-http
+ 3.1.5
+
+
+
+ javax.persistence
+ javax.persistence-api
+ 2.2
+ provided
+
+
+
+ com.jayway.jsonpath
+ json-path
+ 2.4.0
+
+
+ com.arakelian
+ java-jq
+ 0.10.1
+
+
+ edu.cmu
+ secondstring
+ 1.0.0
+
+
+ org.mongodb
+ mongo-java-driver
+ ${mongodb.driver.version}
+
+
+ io.fares.junit.mongodb
+ mongodb-junit-test
+ 1.1.0
+
+
+ org.postgresql
+ postgresql
+ 42.2.10
+
+
+
+ org.antlr
+ stringtemplate
+ 3.2.1
+
+
+
+ org.antlr
+ ST4
+ 4.3.4
+
+
+
+ com.ximpleware
+ vtd-xml
+ ${vtd.version}
+
+
+
+ org.elasticsearch
+ elasticsearch-hadoop
+ 7.6.0
+
+
+
+
+ org.apache.oozie
+ oozie-client
+ ${dhp.oozie.version}
+ provided
+
+
+
+ slf4j-simple
+ org.slf4j
+
+
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ ${okhttp.version}
+
+
+
+ org.apache.commons
+ commons-compress
+ ${common.compress.version}
+
+
+ org.apache.commons
+ commons-csv
+ ${common.csv.version}
+
+
+ org.apache.poi
+ poi-ooxml
+ ${apache.poi.version}
+
+
+
+ org.json
+ json
+ 20180813
+
+
+
+ org.json4s
+ json4s-jackson_${scala.binary.version}
+ ${json4s.version}
+
+
+
+ com.github.victools
+ jsonschema-generator
+ ${jsonschemagenerator.version}
+
+
+
+ org.apache.commons
+ commons-text
+ ${common.text.version}
+
+
+
+ com.opencsv
+ opencsv
+ 5.5
+
+
+ io.github.classgraph
+ classgraph
+ 4.8.71
+
+
+
+ com.fasterxml.jackson.dataformat
+ jackson-dataformat-xml
+ ${jackson.version}
+ provided
+
+
+ com.fasterxml.jackson.module
+ jackson-module-jsonSchema
+ ${jackson.version}
+ provided
+
+
+
+ org.apache.commons
+ commons-math3
+ 3.6.1
+
+
+
+ com.google.code.gson
+ gson
+ ${google.gson.version}
+
+
+
+ commons-collections
+ commons-collections
+ ${commons.collections.version}
+
+
+ commons-logging
+ commons-logging
+ ${commons.logging.version}
+
+
+
+ org.reflections
+ reflections
+ ${reflections.version}
+
+
+
+ org.scala-lang
+ scala-library
+ ${scala.version}
+
+
+
+ com.ibm.icu
+ icu4j
+ 70.1
+
+
+
+ org.javassist
+ javassist
+ ${javassist.version}
+
+
+
+
+
+ target
+ target/classes
+ ${project.artifactId}-${project.version}
+ target/test-classes
+
+
+
+ org.apache.maven.plugins
+ maven-plugin-plugin
+ 3.3
+
+
+ org.apache.maven.plugins
+ maven-project-info-reports-plugin
+ 3.0.0
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+ 3.9.1
+
+ ${dhp.site.skip}
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ ${maven.compiler.plugin.version}
+
+
+ 1.8
+ ${project.build.sourceEncoding}
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.0.2
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.0.1
+
+
+ attach-sources
+ verify
+
+ jar-no-fork
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M4
+
+ true
+ false
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.2.0
+
+ true
+ none
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.6.0
+
+
+
+ net.revelc.code.formatter
+ formatter-maven-plugin
+ 2.11.0
+
+
+ eu.dnetlib.dhp
+ dhp-code-style
+ ${project.version}
+
+
+
+
+ org.antipathy
+ mvn-scalafmt_${scala.binary.version}
+ 1.0.1640073709.733712b
+
+
+ eu.dnetlib.dhp
+ dhp-code-style
+ ${project.version}
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+
+
+ org.apache.maven.plugins
+ maven-project-info-reports-plugin
+
+
+ net.revelc.code.formatter
+ formatter-maven-plugin
+
+
+
+ format
+
+
+ eclipse/formatter_dnet.xml
+
+
+
+
+
+ net.revelc.code
+ impsort-maven-plugin
+ 1.6.2
+
+ java.,javax.,org.,com.
+ java,*
+
+ **/thrift/*.java
+
+
+
+
+ sort-imports
+
+ sort
+
+
+
+
+
+ org.antipathy
+ mvn-scalafmt_${scala.binary.version}
+
+
+ https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
+
+ false
+ false
+
+ ${project.basedir}/src/main/scala
+
+
+ ${project.basedir}/src/test/scala
+
+ false
+ false
+ : git rev-parse --abbrev-ref HEAD
+ false
+
+
+
+ validate
+
+ format
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-release-plugin
+ 2.5.3
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.10
+
+
+ **/schemas/*
+ **/com/cloudera/**/*
+ **/org/apache/avro/io/**/*
+
+
+
+
+ default-prepare-agent
+
+ prepare-agent
+
+
+
+ default-report
+ prepare-package
+
+ report
+
+
+
+
+
+
+
+
+
+ org.apache.maven.wagon
+ wagon-ssh
+ 2.10
+
+
+
+
+
+ dnet45-snapshots
+ DNet45 Snapshots
+ https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
+ default
+
+
+ dnet45-releases
+ https://maven.d4science.org/nexus/content/repositories/dnet45-releases
+
+
+ DHPSite
+ ${dhp.site.stage.path}/
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+ true
+ none
+
+
+
+
+
+
+ sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop
+ UTF-8
+ UTF-8
+ 1.8
+ 1.8
+
+
+ 2.11.12
+ 2.11
+
+
+ 3.6.0
+ 2.22.2
+ 2.0.1
+ 4.0.1
+
+
+ 4.1.2
+ [2.6.1]
+ 1.20
+ 1.8
+ 1.8
+ 1.9.4
+ 1.9
+ 3.2.1
+ 2.4
+ 1.1.3
+ 1.7
+ 1.0.7
+ [6.1.2]
+ cdh5.9.2
+ 3.5
+ 11.0.2
+ 2.6.0-${dhp.cdh.version}
+ 2.9.6
+ 4.1.0-${dhp.cdh.version}
+ true
+ 2.4.0.cloudera2
+ [4.0.3]
+ [6.0.5]
+ [3.1.6]
+ 2.2.2
+ 1.2.17
+ 3.19.0-GA
+ 3.5.3
+ 4.13.0
+ 5.6.1
+ 3.3.3
+ 3.4.2
+ 4.7.2
+ 4.5.3
+ 1.7.25
+ 0.9.10
+ 1.3.0
+ 7.5.0
+ 3.6.0
+ 0.0.7
+ [2.12,3.0)
+ 3.4.6
+
+
+
+
+
+ spark-34
+
+ 2.12
+ 2.12.18
+ 1.3.0
+
+
+ 4.8.1
+
+
+ 1.22
+ 1.8
+ 1.10.0
+ 1.9.4
+ 1.15
+ 3.2.2
+ 2.11.0
+ 1.1.3
+ 1.7
+
+ 14.0.1
+ 8.11.0
+ 4.0.4
+ 3.4.2.openaire
+ 2.14.2
+ 3.12.0
+ 2.19.0
+ 3.7.0-M11
+ 3.25.0-GA
+ 4.10.0
+ 2.0.6
+ 0.10.2
+ 3.6.3
+
+
+
+
+ spark-35
+
+ 2.12
+ 2.12.18
+ 1.3.0
+
+
+ 4.8.1
+
+
+ 1.23.0
+ 1.8
+ 1.10.0
+ 1.9.4
+ 1.16.0
+ 3.2.2
+ 2.13.0
+ 1.1.3
+ 1.7
+
+ 14.0.1
+ 8.11.0
+ 4.0.4
+ 3.5.1.openaire-SNAPSHOT
+ 2.15.2
+ 3.12.0
+ 2.20.0
+ 3.7.0-M11
+ 3.25.0-GA
+ 4.10.0
+ 2.0.7
+ 0.10.2
+ 3.6.3
+
+
+
+
+ java11
+
+ [11
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M4
+
+
+ --add-opens=java.base/java.lang=ALL-UNNAMED
+ --add-opens=java.base/java.lang.invoke=ALL-UNNAMED
+ --add-opens=java.base/java.lang.reflect=ALL-UNNAMED
+ --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED
+ --add-opens=java.base/java.nio=ALL-UNNAMED
+ --add-opens=java.base/java.util=ALL-UNNAMED
+ --add-opens=java.base/java.util.concurrent=ALL-UNNAMED
+ --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
+ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+ --add-opens=java.base/sun.nio.cs=ALL-UNNAMED
+ --add-opens=java.base/sun.security.action=ALL-UNNAMED
+ --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+
+ true
+ false
+
+
+
+
+
+
\ No newline at end of file