diff --git a/.gitignore b/.gitignore
index 14cd4d3450..6fafc70555 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath
/**/.scalafmt.conf
/.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
index 84b962b4b8..eddcd88678 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder;
// execute
- Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+ try {
+ mojo.execute();
+ Assertions.assertTrue(false); // not reached
+ } catch (Exception e) {
+ Assertions
+ .assertTrue(
+ MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+ IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+ }
}
@Test
diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index c2f76cff7b..bfec019af6 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -70,10 +70,7 @@
com.ibm.icu
icu4j
-
- org.apache.hadoop
- hadoop-common
-
+
com.github.sisyphsu
dateparser
@@ -163,7 +160,7 @@
eu.dnetlib.dhp
- ${dhp-schemas.artifact}
+ dhp-schemas
@@ -172,4 +169,23 @@
+
+
+
+ spark-34
+
+
+ javax.xml.bind
+ jaxb-api
+ 2.2.11
+
+
+ com.sun.xml.ws
+ jaxws-ri
+ 2.3.3
+ pom
+
+
+
+
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
index fac9a75650..fbf586f8c5 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java
@@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
- } catch (IOException e) {
+ } catch (Exception e) {
throw new RuntimeException(e);
}
}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
index 544da78f53..fe4f58f067 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java
@@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
- * @throws IOException
- * @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1);
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
index 342d73cdc2..78bb99e79a 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java
@@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
index 98e8daa18c..4f0cee53d7 100644
--- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
+++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json
@@ -154,5 +154,13 @@
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
+ },
+ "isamongtopnsimilardocuments": {
+ "original": "IsAmongTopNSimilarDocuments",
+ "inverse": "HasAmongTopNSimilarDocuments"
+ },
+ "hasamongtopnsimilardocuments": {
+ "original": "HasAmongTopNSimilarDocuments",
+ "inverse": "IsAmongTopNSimilarDocuments"
}
}
\ No newline at end of file
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
index a14c258379..526bbd2953 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala
@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
- SparkSession
+ val b = SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
- .master(master)
- .getOrCreate()
+ if (master != null)
+ b.master(master)
+ b.getOrCreate()
}
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
index a995016a8d..72a17777e9 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
- generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+ val sum = ScholixUtils.resultToSummary(r) // may be null; converted once and reused below
+ if (sum != null)
+ generateScholixResourceFromSummary(sum)
+ else
+ null
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
}
+ def invRel(rel: String): String = {
+ // lookup key is the lower-cased relation name; unknown relations yield null
+ val known = Option(relations.getOrElse(rel.toLowerCase, null))
+ known
+ .map(_.inverse)
+ .orNull
+ }
+
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
- if (r.isInstanceOf[Publication])
- s.setTypology(Typology.publication)
- else
- s.setTypology(Typology.dataset)
+// s.setTypology(r.getResulttype.getClassid)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml
index 6c706b6928..52ec2a2536 100644
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@@ -24,7 +24,7 @@
scala-compile-first
- initialize
+ process-resources
add-source
compile
@@ -59,14 +59,6 @@
edu.cmu
secondstring
-
- com.google.guava
- guava
-
-
- com.google.code.gson
- gson
-
org.apache.commons
commons-lang3
@@ -91,10 +83,6 @@
com.fasterxml.jackson.core
jackson-databind
-
- org.apache.commons
- commons-math3
-
com.jayway.jsonpath
json-path
@@ -113,4 +101,90 @@
+
+
+ spark-24
+
+ true
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-34
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-35
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 6bfb8b3f4b..b055077d89 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -1,12 +1,6 @@
package eu.dnetlib.pace.common;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
@@ -15,6 +9,13 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
/**
* Set of common functions for the framework
*
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
index aa04188dae..e6a1c4ccc1 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
- df.map(r => rowFromJson(r))(RowEncoder(schema))
+ df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
}
def rowFromJson(json: String): Row = {
diff --git a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
new file mode 100644
index 0000000000..a426703d67
--- /dev/null
+++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+/** Row-encoder factory compiled from the spark-2 source tree; delegates to RowEncoder. */
+object SparkCompatUtils {
+
+ def encoderFor(schema: StructType): ExpressionEncoder[Row] =
+ RowEncoder(schema)
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
new file mode 100644
index 0000000000..cbc454ae2c
--- /dev/null
+++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+/** Row-encoder factory compiled from the spark-35 source tree; delegates to ExpressionEncoder. */
+object SparkCompatUtils {
+
+ def encoderFor(schema: StructType): ExpressionEncoder[Row] =
+ ExpressionEncoder(schema)
+}
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
index 93db552c17..be5c1ebb90 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest {
diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml
new file mode 100644
index 0000000000..d8e17ed465
--- /dev/null
+++ b/dhp-shade-package/pom.xml
@@ -0,0 +1,169 @@
+
+
+ 4.0.0
+
+ eu.dnetlib.dhp
+ dhp
+ 1.2.5-SNAPSHOT
+ ../pom.xml
+
+
+
+ dhp-shade-package
+ jar
+
+
+
+ DHPSite
+ ${dhp.site.stage.path}/dhp-common
+
+
+
+ This module creates a jar of all module dependencies
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-actionmanager
+ ${project.version}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-graph-mapper
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-graph-provision
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-impact-indicators
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-actionsets
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-hist-snaps
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-monitor-irish
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-promote
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-stats-update
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-swh
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-usage-raw-data-update
+ ${project.version}
+
+
+ eu.dnetlib.dhp
+ dhp-usage-stats-build
+ ${project.version}
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+ package
+
+ shade
+
+
+
+
+ eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
+
+
+
+
+ META-INF/cxf/bus-extensions.txt
+
+
+
+
+ *:*
+
+ META-INF/maven/**
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+
+ com
+ repackaged.com.google.common
+
+ com.google.common.**
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
index c28d81992b..c10eb5c8c5 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
.union(openAPCRelations)
.union(dataciteRelations)
.saveAsHadoopFile(
- outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+ outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
index 040c897829..c1e0c4d68e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD
.union(projectsRDD)
.saveAsHadoopFile(
- outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+ outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
index eb370e981a..27970f2c34 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java
@@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.StructType;
@@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
+ final String blackListInputPath = parser.get("blackListPath");
+ log.info("blackListInputPath: {}", blackListInputPath);
+
SparkConf conf = new SparkConf();
runWithSparkSession(
@@ -77,29 +81,35 @@ public class CreateActionSetFromWebEntries implements Serializable {
isSparkSessionManaged,
spark -> {
- createActionSet(spark, inputPath, outputPath);
+ createActionSet(spark, inputPath, outputPath, blackListInputPath);
});
}
public static void createActionSet(SparkSession spark, String inputPath,
- String outputPath) {
+ String outputPath, String blackListInputPath) {
final Dataset dataset = readWebCrawl(spark, inputPath)
- .filter("publication_year <= 2020 or country_code=='IE'")
+ .filter("country_code=='IE'")
.drop("publication_year");
- dataset.flatMap((FlatMapFunction) row -> {
- List ret = new ArrayList<>();
- final String ror = ROR_PREFIX
- + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
- ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
- ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
- ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
+ final Dataset blackList = readBlackList(spark, blackListInputPath);
- return ret
- .iterator();
- }, Encoders.bean(Relation.class))
+ dataset
+ .join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
+ .filter((FilterFunction) r -> r.getAs("OpenAlexId") == null)
+ .drop("OpenAlexId")
+ .flatMap((FlatMapFunction) row -> {
+ List ret = new ArrayList<>();
+ final String ror = ROR_PREFIX
+ + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
+ ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
+ ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
+ ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
+
+ return ret
+ .iterator();
+ }, Encoders.bean(Relation.class))
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
@@ -136,6 +146,15 @@ public class CreateActionSetFromWebEntries implements Serializable {
}
+ private static Dataset readBlackList(SparkSession spark, String inputPath) {
+
+ return spark
+ .read()
+ .option("header", true)
+ .csv(inputPath)
+ .select("OpenAlexId");
+ }
+
private static List createAffiliationRelationPairPMCID(String pmcid, String ror) {
if (pmcid == null)
return new ArrayList<>();
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
index 997948687b..f4ba09f72b 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java
@@ -1,6 +1,7 @@
package eu.dnetlib.dhp.collection.plugin.rest;
+import java.util.Map;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
@@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
+import com.google.gson.Gson;
+
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
final String entityXpath = api.getParams().get("entityXpath");
final String authMethod = api.getParams().get("authMethod");
final String authToken = api.getParams().get("authToken");
+ final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+ Gson gson = new Gson();
+ Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
final String resultSizeValue = Optional
.ofNullable(api.getParams().get("resultSizeValue"))
.filter(StringUtils::isNotBlank)
@@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
if (StringUtils.isBlank(resultFormatValue)) {
throw new CollectorException("Param 'resultFormatValue' is null or empty");
}
- if (StringUtils.isBlank(queryParams)) {
- throw new CollectorException("Param 'queryParams' is null or empty");
- }
if (StringUtils.isBlank(entityXpath)) {
throw new CollectorException("Param 'entityXpath' is null or empty");
}
@@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
entityXpath,
authMethod,
authToken,
- resultOutputFormat);
+ resultOutputFormat,
+ requestHeaders);
return StreamSupport
.stream(
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
index 76af6cff1a..2518fd92fe 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java
@@ -9,6 +9,7 @@ import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
+import java.util.Map;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
@@ -34,6 +35,8 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
+import com.google.common.collect.Maps;
+
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
@@ -55,7 +58,7 @@ public class RestIterator implements Iterator {
private final HttpClientParams clientParams;
- private final String BASIC = "basic";
+ private final String AUTHBASIC = "basic";
private final String baseUrl;
private final String resumptionType;
@@ -89,6 +92,11 @@ public class RestIterator implements Iterator {
*/
private final String resultOutputFormat;
+ /*
+ * Can be used to set additional request headers, like for content negotiation
+ */
+ private Map requestHeaders;
+
/**
* RestIterator class compatible to version 1.3.33
*/
@@ -107,7 +115,8 @@ public class RestIterator implements Iterator {
final String entityXpath,
final String authMethod,
final String authToken,
- final String resultOutputFormat) {
+ final String resultOutputFormat,
+ final Map requestHeaders) {
this.clientParams = clientParams;
this.baseUrl = baseUrl;
@@ -119,6 +128,7 @@ public class RestIterator implements Iterator {
this.authMethod = authMethod;
this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat;
+ this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
@@ -231,25 +241,20 @@ public class RestIterator implements Iterator {
final URL qUrl = new URL(query);
log.debug("authMethod: {}", this.authMethod);
- if ("bearer".equalsIgnoreCase(this.authMethod)) {
- log.trace("authMethod before inputStream: {}", resultXml);
- final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
- conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
- conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
- conn.setRequestMethod("GET");
- theHttpInputStream = conn.getInputStream();
- } else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
- log.trace("authMethod before inputStream: {}", resultXml);
- final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
- conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
- conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
- conn.setRequestMethod("GET");
- theHttpInputStream = conn.getInputStream();
- } else {
- theHttpInputStream = qUrl.openStream();
+ if ("bearer".equalsIgnoreCase(this.authMethod)) { // value comparison: null-safe, case-insensitive
+ log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml);
+ requestHeaders.put("Authorization", "Bearer " + authToken);
+ // Content-Type: application/json is no longer forced here; pass it via requestHeaders if needed
+ } else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+ log.trace("RestIterator.downloadPage():: authMethod before inputStream: {}", resultXml);
+ requestHeaders.put("Authorization", "Basic " + authToken);
+ // Accept: application/xml is no longer forced here; pass it via requestHeaders if needed
}
+ HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+ conn.setRequestMethod("GET");
+ this.setRequestHeader(conn);
+ resultStream = conn.getInputStream();
- this.resultStream = theHttpInputStream;
if ("json".equals(this.resultOutputFormat)) {
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson);
@@ -380,7 +385,8 @@ public class RestIterator implements Iterator {
try {
if (this.resultTotal == -1) {
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
- if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
+ if ("page".equalsIgnoreCase(this.resumptionType)
+ && !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1;
} // to correct the upper bound
log.info("resultTotal was -1 is now: " + this.resultTotal);
@@ -433,6 +439,22 @@ public class RestIterator implements Iterator {
}
}
+ /**
+ * Copies every configured request header onto the given connection.
+ *
+ * Uses setRequestProperty, so a header already set on the connection is overwritten with the new value.
+ * @param conn the connection the headers are applied to
+ */
+ private void setRequestHeader(HttpURLConnection conn) {
+ if (requestHeaders == null) {
+ // nothing configured: leave the connection untouched
+ return;
+ }
+ // each map entry becomes one request property on the connection
+ requestHeaders.forEach(conn::setRequestProperty);
+ log.debug("Set Request Header with: " + requestHeaders);
+ }
+
public String getResultFormatValue() {
return this.resultFormatValue;
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
index 3f056edf77..b79140b3a7 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json
@@ -16,5 +16,10 @@
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
- }
+ },{
+ "paramName": "bl",
+ "paramLongName": "blackListPath",
+ "paramDescription": "the path of the CSV file listing the OpenAlexId values to exclude",
+ "paramRequired": true
+}
]
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
index f616baea70..d7bd709fca 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/job.properties
@@ -1,2 +1,3 @@
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
index 653a7d3842..b9394c7e69 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/webcrawl/oozie_app/workflow.xml
@@ -45,6 +45,7 @@
--sourcePath${sourcePath}
--outputPath${outputPath}
+ --blackListPath${blackListPath}
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
index f0275e06bc..e4f491e5c6 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
@@ -625,12 +625,6 @@
"name": "Alimentary Health",
"synonym": []
},
- {
- "id": "501100011103",
- "uri": "http://dx.doi.org/10.13039/501100011103",
- "name": "Rann\u00eds",
- "synonym": []
- },
{
"id": "501100012354",
"uri": "http://dx.doi.org/10.13039/501100012354",
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
index 44c82e256b..c4aa64fd49 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2)
+ case _ =>
}
})
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
index df22a6b845..b065db3340 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/MagUtility.scala
@@ -79,23 +79,6 @@ object MagUtility extends Serializable {
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
private val MAGDataInfo: DataInfo = {
- val di = new DataInfo
- di.setDeletedbyinference(false)
- di.setInferred(false)
- di.setInvisible(false)
- di.setTrust("0.9")
- di.setProvenanceaction(
- OafMapperUtils.qualifier(
- ModelConstants.SYSIMPORT_ACTIONSET,
- ModelConstants.SYSIMPORT_ACTIONSET,
- ModelConstants.DNET_PROVENANCE_ACTIONS,
- ModelConstants.DNET_PROVENANCE_ACTIONS
- )
- )
- di
- }
-
- private val MAGDataInfoInvisible: DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
@@ -453,7 +436,6 @@ object MagUtility extends Serializable {
case "repository" =>
result = new Publication()
- result.setDataInfo(MAGDataInfoInvisible)
qualifier(
"0038",
"Other literature type",
@@ -488,8 +470,7 @@ object MagUtility extends Serializable {
}
if (result != null) {
- if (result.getDataInfo == null)
- result.setDataInfo(MAGDataInfo)
+ result.setDataInfo(MAGDataInfo)
val i = new Instance
i.setInstancetype(tp)
i.setInstanceTypeMapping(
@@ -512,7 +493,7 @@ object MagUtility extends Serializable {
return null
result.setCollectedfrom(List(MAGCollectedFrom).asJava)
- val pidList = List(
+ var pidList = List(
structuredProperty(
paper.paperId.get.toString,
qualifier(
@@ -525,8 +506,6 @@ object MagUtility extends Serializable {
)
)
- result.setPid(pidList.asJava)
-
result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@@ -618,22 +597,23 @@ object MagUtility extends Serializable {
}
val instance = result.getInstance().get(0)
- instance.setPid(pidList.asJava)
- if (paper.doi.orNull != null)
- instance.setAlternateIdentifier(
- List(
- structuredProperty(
- paper.doi.get,
- qualifier(
- PidType.doi.toString,
- PidType.doi.toString,
- ModelConstants.DNET_PID_TYPES,
- ModelConstants.DNET_PID_TYPES
- ),
- null
- )
- ).asJava
+
+ if (paper.doi.orNull != null) {
+ pidList = pidList ::: List(
+ structuredProperty(
+ paper.doi.get,
+ qualifier(
+ PidType.doi.toString,
+ PidType.doi.toString,
+ ModelConstants.DNET_PID_TYPES,
+ ModelConstants.DNET_PID_TYPES
+ ),
+ null
+ )
)
+ }
+ instance.setPid(pidList.asJava)
+ result.setPid(pidList.asJava)
instance.setUrl(paper.urls.get.asJava)
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
instance.setCollectedfrom(MAGCollectedFrom)
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
index 5dd38970de..208a1dc660 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/mag/SparkMAGtoOAF.scala
@@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
spark.read
.load(s"$magBasePath/mag_denormalized")
.as[MAGPaper]
+ .filter(col("doi").isNotNull)
.map(s => MagUtility.convertMAGtoOAF(s))
.filter(s => s != null)
.write
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
index 639918151b..11d087583e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
@@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory}
-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame {
@@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
} else
- return IOUtils.toString(response.getEntity.getContent)
+ return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
@@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
- )
+ ),
+ Charset.defaultCharset()
)
)
parser.parseArgument(args)
@@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath)
- val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
- log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
-
- val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
- val outputBasePath = cleanedMdStoreVersion.getHdfsPath
- log.info("outputBasePath: {}", outputBasePath)
+ val targetPath = parser.get("targetPath")
+ log.info("targetPath: {}", targetPath)
val hdfsServerUri = parser.get("hdfsServerUri")
- log.info("hdfsServerUri: {}", hdfsServerUri)
+ log.info("hdfsServerUri: {}", hdfsServerUri)
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
@@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => {
- val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+ val inputFactory = XMLInputFactory.newInstance // created inside the closure: XMLInputFactory is not Serializable
+ val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml)
})
)
@@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf]
.filter(p => p != null),
- s"$outputBasePath/$MDSTORE_DATA_PATH"
+ targetPath
)
- val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
- val mdStoreSize = df.count
- writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
index 9102c12c43..fb941a461c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala
@@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml
*/
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
index ce116688a2..0a4dfc00bd 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java
@@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
index 3b416caf2c..ebde0ed0c3 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java
@@ -119,7 +119,9 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
- "-inputFile", "input1;input2;input3;input4;input5"
+ "-inputFile", "input1;input2;input3;input4;input5",
+ "-format",
+ "COCI"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
index 402f07d4d7..e9291f93c5 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateASTest.java
@@ -75,7 +75,11 @@ public class CreateASTest {
String inputPath = getClass()
.getResource(
- "/eu/dnetlib/dhp/actionmanager/webcrawl/")
+ "/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
+ .getPath();
+ String blackListPath = getClass()
+ .getResource(
+ "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
.getPath();
CreateActionSetFromWebEntries
@@ -86,7 +90,8 @@ public class CreateASTest {
"-sourcePath",
inputPath,
"-outputPath",
- workingDir.toString() + "/actionSet1"
+ workingDir.toString() + "/actionSet1",
+ "-blackListPath", blackListPath
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -96,7 +101,7 @@ public class CreateASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
- Assertions.assertEquals(64, tmp.count());
+ Assertions.assertEquals(58, tmp.count());
}
@@ -109,6 +114,10 @@ public class CreateASTest {
.getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/")
.getPath();
+ String blackListPath = getClass()
+ .getResource(
+ "/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+ .getPath();
CreateActionSetFromWebEntries
.main(
@@ -118,7 +127,8 @@ public class CreateASTest {
"-sourcePath",
inputPath,
"-outputPath",
- workingDir.toString() + "/actionSet1"
+ workingDir.toString() + "/actionSet1",
+ "-blackListPath", blackListPath
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -184,7 +194,7 @@ public class CreateASTest {
Assertions
.assertEquals(
- 5, tmp
+ 2, tmp
.filter(
r -> r
.getSource()
@@ -197,7 +207,7 @@ public class CreateASTest {
Assertions
.assertEquals(
- 5, tmp
+ 2, tmp
.filter(
r -> r
.getTarget()
@@ -210,7 +220,7 @@ public class CreateASTest {
Assertions
.assertEquals(
- 2, tmp
+ 1, tmp
.filter(
r -> r
.getTarget()
@@ -224,7 +234,7 @@ public class CreateASTest {
Assertions
.assertEquals(
- 2, tmp
+ 1, tmp
.filter(
r -> r
.getTarget()
@@ -238,7 +248,7 @@ public class CreateASTest {
Assertions
.assertEquals(
- 1, tmp
+ 0, tmp
.filter(
r -> r
.getTarget()
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
index 90f4c7f25b..0e64f8bab2 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest {
private final String resumptionType = "page";
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
- private final String resultSizeParam = "";
- private final String resultSizeValue = "";
+ private final String resultSizeParam = "page[size]";
+ private final String resultSizeValue = "100";
private final String resultFormatParam = "format";
private final String resultFormatValue = "json";
@@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest {
final AtomicInteger i = new AtomicInteger(0);
final Stream stream = this.rcp.collect(this.api, new AggregatorReport());
- stream.limit(200).forEach(s -> {
+ stream.limit(2000).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
i.incrementAndGet();
log.info(s);
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
index f708c367b3..99b95d9e38 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java
@@ -4,6 +4,11 @@
package eu.dnetlib.dhp.collection.plugin.rest;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
@@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.gson.Gson;
+
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
@@ -25,18 +32,18 @@ class RestCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
- private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
- private final String resumptionType = "count";
- private final String resumptionParam = "from";
- private final String entityXpath = "//hits/hits";
- private final String resumptionXpath = "//hits";
- private final String resultTotalXpath = "//hits/total";
- private final String resultFormatParam = "format";
+ private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+ private final String resumptionType = "discover";
+ private final String resumptionParam = "skip";
+ private final String entityXpath = "//*[local-name()='data']";
+ private final String resumptionXpath = "";
+ private final String resultTotalXpath = "//*[local-name()='count']";
+ private final String resultFormatParam = "";
private final String resultFormatValue = "json";
- private final String resultSizeParam = "size";
+ private final String resultSizeParam = "top";
private final String resultSizeValue = "10";
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
- private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+ private final String query = "";
// private String query = "=(sources:engrXiv AND type:preprint)";
private final String protocolDescriptor = "rest_json2xml";
@@ -56,6 +63,7 @@ class RestCollectorPluginTest {
params.put("resultSizeValue", resultSizeValue);
params.put("queryParams", query);
params.put("entityXpath", entityXpath);
+ params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
api.setBaseUrl(baseUrl);
api.setParams(params);
@@ -78,4 +86,19 @@ class RestCollectorPluginTest {
log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0);
}
+
+ @Disabled // manual check only: performs a live HTTP request against the World Bank API
+ @Test
+ void testUrl() throws IOException {
+ final URL url = new URL("https://ddh-openapi.worldbank.org/search?&top=10");
+ final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+ conn.setRequestMethod("GET");
+ conn.setRequestProperty("User-Agent", "OpenAIRE");
+ // getHeaderFields() returns the headers of the response, not of the request
+ System.out.println("Response header");
+ System.out.println(new Gson().toJson(conn.getHeaderFields()));
+ try (InputStream inputStream = conn.getInputStream()) { // close the body stream when done
+ Assertions.assertNotNull(inputStream);
+ }
+ }
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
index e2d6ad3e7f..ed31c2b7ec 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java
@@ -44,7 +44,7 @@ public class RestIteratorTest {
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
- query, entityXpath, authMethod, authToken, resultOffsetParam);
+ query, entityXpath, authMethod, authToken, resultOffsetParam, null);
int i = 20;
while (iterator.hasNext() && i > 0) {
String result = iterator.next();
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00000 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
similarity index 100%
rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00000
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00000
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00001 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
similarity index 100%
rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00001
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00001
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00002 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
similarity index 100%
rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/part-00002
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/input/part-00002
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
index 2a9e391df8..2f1af2a6e3 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json
@@ -789,10 +789,6 @@
"value": "2227-9717",
"type": "electronic"
},
- {
- "value": "VALUE",
- "type": "PIPPO"
- },
{
"value": "1063-4584",
"type": "pu"
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
index ed43bb1a19..c3ea884eb3 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary()
}
+ @Test
+ def mappingRecord(): Unit = {
+ // Smoke test: converts the issn_pub.json fixture and prints whatever the mapper returns.
+ val fixture = getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json")
+ val json = IOUtils.toString(fixture, "utf-8")
+ val converted = Crossref2Oaf.convert(json, vocabularies, TransformationType.All)
+ println(converted)
+ }
+
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
index 59b91d66b1..77812affb4 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/mag/MAGMappingTest.scala
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
@@ -18,10 +19,8 @@ class MAGMappingTest {
.master("local[*]")
.getOrCreate()
- val s = new SparkMagOrganizationAS(null, null, null)
-
- s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
-
+ val s = new SparkMAGtoOAF(null, null, null)
+ s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
}
@Test
diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index d1611300d2..c4af14c409 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source
@@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testEBIData() = {
- val inputXML = Source
- .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- .mkString
- val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+ val inputFactory = XMLInputFactory.newInstance
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
}
@@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testParsingPubmedXML(): Unit = {
- val xml = new XMLEventReader(
- Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- )
+ val inputFactory = XMLInputFactory.newInstance
+
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
val parser = new PMParser(xml)
parser.foreach(checkPMArticle)
}
@@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test
def testPubmedMapping(): Unit = {
- val xml = new XMLEventReader(
- Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
- )
+ val inputFactory = XMLInputFactory.newInstance
+ val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
+
val parser = new PMParser(xml)
val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml
index 8665ebd056..897fa1a761 100644
--- a/dhp-workflows/dhp-dedup-openaire/pom.xml
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@@ -53,24 +53,10 @@
dhp-pace-core
${project.version}
-
org.apache.commons
commons-lang3
-
-
- org.scala-lang.modules
- scala-java8-compat_${scala.binary.version}
- 1.0.2
-
-
-
- org.scala-lang.modules
- scala-collection-compat_${scala.binary.version}
- 2.11.0
-
-
org.apache.spark
spark-core_${scala.binary.version}
@@ -79,16 +65,10 @@
org.apache.spark
spark-sql_${scala.binary.version}
-
org.apache.spark
spark-graphx_${scala.binary.version}
-
-
- com.arakelian
- java-jq
-
dom4j
dom4j
@@ -101,10 +81,6 @@
com.fasterxml.jackson.core
jackson-databind
-
- com.fasterxml.jackson.core
- jackson-core
-
org.apache.httpcomponents
httpclient
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
index fc0e3bdb9f..f73ff92ec7 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java
@@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3;
import scala.collection.JavaConversions;
@@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset pivotHistory = spark
.createDataset(
Collections.emptyList(),
- RowEncoder
- .apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+ SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
index d12048b028..0507b7b9af 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareNewOrgs.java
@@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
.map(
(MapFunction, Tuple2>, OrgSimRel>) r -> new OrgSimRel(
"",
- r._1()._2().getOriginalId().get(0),
- r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
- r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
- r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
- r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
- r._1()._2().getCollectedfrom().get(0).getValue(),
+ Optional.ofNullable(r._1()._2().getOriginalId()).filter(oid -> !oid.isEmpty()).map(oid -> oid.get(0)).orElse(null),
+ Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
+ Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
+ Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
+ Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
+ Optional.ofNullable(r._1()._2().getCollectedfrom()).filter(cf -> !cf.isEmpty()).map(cf -> cf.get(0).getValue()).orElse(null),
"",
structuredPropertyListToString(r._1()._2().getPid()),
parseECField(r._1()._2().getEclegalbody()),
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
index 61325ab502..83ec7e5222 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPrepareOrgRels.java
@@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
final Organization o = r._2()._2();
return new OrgSimRel(
r._1()._1(),
- o.getOriginalId().get(0),
+ Optional.ofNullable(o.getOriginalId()).filter(oid -> !oid.isEmpty()).map(oid -> oid.get(0)).orElse(null),
Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
.map(
(MapFunction, Tuple2>, OrgSimRel>) r -> {
OrgSimRel orgSimRel = r._1()._2();
- orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+ orgSimRel
+ .setLocal_id(
+ Optional.ofNullable(r._2()._2().getOriginalId()).filter(oid -> !oid.isEmpty()).map(oid -> oid.get(0)).orElse(null));
return orgSimRel;
},
Encoders.bean(OrgSimRel.class));
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
index e4bcf1e827..c7efce4d74 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java
@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2;
import scala.Tuple3;
@@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
StructType idsSchema = StructType
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
- Dataset allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+ Dataset allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
String entityPath = graphBasePath + '/' + entityType.name();
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
index 732471f99e..61506bc600 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@@ -50,7 +50,7 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer {
if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
@@ -61,13 +61,14 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer 12)
+ s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
+ else
+ null
+ }
+
+ /** Pairs every persistent identifier with the first URL that textually contains it.
+   *
+   * The match is a case-insensitive substring test of the PID value against each URL,
+   * scanning the URLs in the order they are given.
+   *
+   * @param pidValue the persistent identifiers to resolve
+   * @param urls     the candidate URLs; `null` entries are ignored
+   * @return one `(pid, url)` tuple per input PID, in input order; `url` is `null` when no
+   *         URL contains the PID value or when the PID carries no value
+   */
+ def findURLForPID(
+   pidValue: List[StructuredProperty],
+   urls: List[String]
+ ): List[(StructuredProperty, String)] = {
+   // Lower-case the URLs once, instead of once per PID, and keep the original spelling for the result.
+   val candidates: List[(String, String)] = urls.filter(u => u != null).map(u => (u, u.toLowerCase))
+
+   pidValue.map { pid =>
+     val matchingUrl = Option(pid.getValue)
+       .map(value => value.toLowerCase)
+       .flatMap(needle => candidates.find(candidate => candidate._2.contains(needle)))
+       .map(candidate => candidate._1)
+     (pid, matchingUrl.orNull)
+   }
+ }
+
+ /** Collects the typed identifiers (value, schema, resolving URL) exposed by the instances of a result.
+   *
+   * Only instances declaring both PIDs and at least one URL are considered; each PID is paired
+   * with a URL through `findURLForPID`. Null instances and PIDs without a qualifier are skipped
+   * rather than failing the whole record, since a qualifier is needed to fill the schema.
+   *
+   * @param r the result whose instances are inspected
+   * @return the distinct identifiers in encounter order; empty when the result has no usable instance
+   */
+ def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
+   Option(r.getInstance())
+     .map(instances => instances.asScala.toList)
+     .getOrElse(List.empty)
+     .filter(i => i != null && i.getPid != null && i.getUrl != null && !i.getUrl.isEmpty)
+     .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
+     .filter(pidWithUrl => pidWithUrl._1.getQualifier != null)
+     .map(pidWithUrl =>
+       new ScholixIdentifier(pidWithUrl._1.getValue, pidWithUrl._1.getQualifier.getClassid, pidWithUrl._2)
+     )
+     .distinct
+ }
+
+ /** Converts a graph `Result` into the `ScholixResource` describing it.
+   *
+   * `null` is returned when the result cannot be identified: it has no instances, it has no
+   * PIDs, or none of its instance PIDs could be turned into a typed identifier. All the other
+   * metadata is optional: when the graph counterpart is missing (result type, instance type,
+   * title, authors, acceptance date, hosting datasource, collectedfrom) the corresponding field
+   * is simply left unset or empty instead of failing the record.
+   *
+   * @param result the graph result to convert
+   * @return the populated resource, or `null` when the result cannot be identified
+   */
+ def generateScholixResourceFromResult(result: Result): ScholixResource = {
+
+   // Describes a datasource as a Scholix entity carrying its OpenAIRE identifier and URL.
+   def datasourceEntity(name: String, key: String): ScholixEntityId = {
+     val id = new ScholixIdentifier()
+     id.setIdentifier(key)
+     id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+     id.setUrl(generateDatasourceOpenAIREURLS(key))
+     val eid = new ScholixEntityId()
+     eid.setName(name)
+     eid.setIdentifiers(List(id).asJava)
+     eid
+   }
+
+   val identifiable =
+     result.getInstance() != null && !result.getInstance().isEmpty &&
+       result.getPid != null && !result.getPid.isEmpty
+   val persistentIdentifiers: List[ScholixIdentifier] =
+     if (identifiable) extractTypedIdentifierFromInstance(result) else List.empty
+
+   if (persistentIdentifiers.isEmpty)
+     null
+   else {
+     // Null instances are ignored by every extraction below, not only by the sub type one.
+     val instances = result.getInstance().asScala.filter(i => i != null)
+
+     val r = new ScholixResource
+     r.setDnetIdentifier(result.getId)
+     r.setIdentifier(persistentIdentifiers.asJava)
+
+     Option(result.getResulttype).foreach(resultType => r.setObjectType(resultType.getClassid))
+
+     // The first typed instance wins; headOption keeps a result without typed instances
+     // from throwing NoSuchElementException.
+     instances
+       .filter(i => i.getInstancetype != null)
+       .map(i => i.getInstancetype.getClassname)
+       .headOption
+       .foreach(subType => r.setObjectSubType(subType))
+
+     // The first title is the one exposed by Scholix.
+     Option(result.getTitle)
+       .flatMap(titles => titles.asScala.headOption)
+       .foreach(title => r.setTitle(title.getValue))
+
+     val authors: List[ScholixEntityId] = Option(result.getAuthor)
+       .map(as => as.asScala.toList)
+       .getOrElse(List.empty)
+       .map(a => {
+         val entity = new ScholixEntityId()
+         entity.setName(a.getFullname)
+         if (a.getPid != null && a.getPid.size() > 0)
+           entity.setIdentifiers(
+             a.getPid.asScala
+               // At most three identifiers per author are kept.
+               .take(3)
+               .map(sp => {
+                 val id = new ScholixIdentifier()
+                 id.setIdentifier(sp.getValue)
+                 id.setSchema(sp.getQualifier.getClassid)
+                 id
+               })
+               .toList
+               .asJava
+           )
+         entity
+       })
+     if (authors.nonEmpty)
+       r.setCreator(authors.asJava)
+
+     // The first available acceptance date becomes the publication date.
+     instances
+       .filter(i => i.getDateofacceptance != null)
+       .map(i => i.getDateofacceptance.getValue)
+       .headOption
+       .foreach(date => r.setPublicationDate(date))
+
+     // Hosting datasources act as publishers, except the "unknown" placeholder.
+     r.setPublisher(
+       instances
+         .map(i => i.getHostedby)
+         .filter(h => h != null && !"unknown".equalsIgnoreCase(h.getValue))
+         .map(h => datasourceEntity(h.getValue, h.getKey))
+         .distinct
+         .asJava
+     )
+
+     Option(result.getCollectedfrom).foreach { collectedFrom =>
+       r.setCollectedFrom(
+         collectedFrom.asScala
+           .map(cf => {
+             val scf = new ScholixCollectedFrom()
+             scf.setProvisionMode("collected")
+             scf.setCompletionStatus("complete")
+             scf.setProvider(datasourceEntity(cf.getValue, cf.getKey))
+             scf
+           })
+           .asJava
+       )
+     }
+
+     r
+   }
+ }
+
+ /** Creates the Scholix link described by a relation, starting from its already resolved source.
+   *
+   * The link providers are taken from the relation's `collectedfrom`; when that is missing or
+   * empty the OpenAIRE infrastructure datasource is used instead. The target is only a
+   * placeholder carrying the target dnet identifier.
+   *
+   * @param relation the relation to convert
+   * @param source   the Scholix resource of the relation's source
+   * @return the link, or `null` when the relation class has no entry in `relations`
+   */
+ def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
+
+   // Describes a datasource as a Scholix entity carrying its OpenAIRE identifier and URL.
+   def provider(name: String, key: String): ScholixEntityId = {
+     val entity = new ScholixEntityId()
+     entity.setName(name)
+     val identifier = new ScholixIdentifier()
+     identifier.setIdentifier(key)
+     identifier.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
+     identifier.setUrl(generateDatasourceOpenAIREURLS(key))
+     entity.setIdentifiers(List(identifier).asJava)
+     entity
+   }
+
+   val scholix: Scholix = new Scholix
+   scholix.setSource(source)
+
+   val hasProviders = relation.collectedfrom != null && relation.collectedfrom.nonEmpty
+   if (hasProviders) {
+     scholix.setLinkprovider(
+       relation.collectedfrom
+         .map(cf => provider(cf.value, cf.key))
+         .toList
+         .asJava
+     )
+   } else {
+     val openaire = provider("OpenAIRE", "10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
+     scholix.setLinkprovider(List(openaire).asJava)
+   }
+   scholix.setIdentifier(relation.id)
+
+   val semantics = relations.getOrElse(relation.relclass.toLowerCase, null)
+   if (semantics == null)
+     null
+   else {
+     scholix.setRelationship(
+       new ScholixRelationship(semantics.original, "datacite", semantics.inverse)
+     )
+     scholix.setPublicationDate(source.getPublicationDate)
+     scholix.setPublisher(source.getPublisher)
+     // Only the identifier of the target is known at this stage.
+     val placeholderTarget = new ScholixResource
+     placeholderTarget.setDnetIdentifier(relation.target)
+     scholix.setTarget(placeholderTarget)
+     scholix
+   }
+ }
+
+ /** Sets the resolved target on a Scholix link and serializes the link with `mapper`.
+   *
+   * The publishers of the link become the distinct concatenation of the ones it already had
+   * followed by the ones of the target, capped at ten entries.
+   *
+   * @param s the link to complete; it is modified in place
+   * @param t the resolved target resource
+   * @return the serialized link, as produced by `mapper.writeValueAsString`
+   */
+ def updateTarget(s: Scholix, t: ScholixResource): String = {
+   s.setTarget(t)
+
+   val linkPublishers: List[ScholixEntityId] =
+     Option(s.getPublisher).map(p => p.asScala.toList).getOrElse(List.empty)
+   val targetPublishers: List[ScholixEntityId] =
+     Option(t.getPublisher).map(p => p.asScala.toList).getOrElse(List.empty)
+
+   val publishers = (linkPublishers ++ targetPublishers).distinct.take(10)
+   s.setPublisher(publishers.asJava)
+
+   mapper.writeValueAsString(s)
+ }
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
new file mode 100644
index 0000000000..dd420ab956
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala
@@ -0,0 +1,141 @@
+package eu.dnetlib.dhp.sx.graph
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.schema.oaf.{
+ KeyValue,
+ OtherResearchProduct,
+ Publication,
+ Relation,
+ Result,
+ Software,
+ Dataset => OafDataset
+}
+import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
+import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+/** Spark application producing the Scholexplorer dump in three steps: bidirectional relations,
+  * Scholix resources and, joining the two, the serialized Scholix links.
+  *
+  * @param propertyPath classpath location of the JSON describing the accepted arguments
+  * @param args         the command line arguments
+  * @param log          the logger used by the application
+  */
+class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** Entry point of the job: reads `sourcePath` and `targetPath` from the parsed arguments and
+    * runs the three generation steps in order, each one writing below `targetPath`.
+    */
+  override def run(): Unit = {
+    val sourcePath = parser.get("sourcePath")
+    log.info("sourcePath: {}", sourcePath)
+    val targetPath = parser.get("targetPath")
+    log.info("targetPath: {}", targetPath)
+    generateBidirectionalRelations(sourcePath, targetPath, spark)
+    generateScholixResource(sourcePath, targetPath, spark)
+    generateScholix(targetPath, spark)
+  }
+
+  /** Converts publications, datasets, software and other research products into Scholix
+    * resources and stores them under `outputPath/resource`.
+    *
+    * Each entity type is read as JSON from `inputPath/<type>` with the schema of its own bean,
+    * then handled as a generic `Result`. Results that cannot be converted (the conversion
+    * yields `null`) are dropped.
+    *
+    * @param inputPath  base path of the graph
+    * @param outputPath base path of the dump
+    * @param spark      the session used to read and write
+    */
+  def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
+    val entityMap: Map[String, StructType] = Map(
+      "publication"          -> Encoders.bean(classOf[Publication]).schema,
+      "dataset"              -> Encoders.bean(classOf[OafDataset]).schema,
+      "software"             -> Encoders.bean(classOf[Software]).schema,
+      "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
+    )
+
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
+
+    val resDs = spark.emptyDataset[ScholixResource]
+    val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
+      println(s"adding ${item._1}")
+      res.union(
+        spark.read
+          .schema(item._2)
+          .json(s"$inputPath/${item._1}")
+          .as[Result]
+          .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
+          .filter(s => s != null)
+      )
+    })
+    scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
+  }
+
+  /** Materializes every result-to-result relation together with its inverse under
+    * `otuputPath/relation`.
+    *
+    * Relations deleted by inference, relations whose endpoints do not both start with `50`
+    * and the `merges` / `isMergedIn` relations are discarded. Each surviving relation gets an
+    * id computed as the md5 of source, relation class and target; rows sharing the same id are
+    * collapsed keeping the first value of every column.
+    *
+    * @param inputPath  base path of the graph
+    * @param otuputPath base path of the dump
+    * @param spark      the session used to read and write
+    */
+  def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
+    val relSchema = Encoders.bean(classOf[Relation]).schema
+
+    val relDF = spark.read
+      .schema(relSchema)
+      .json(s"$inputPath/relation")
+      .where(
+        "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
+        "and relClass <> 'merges' and relClass <> 'isMergedIn'"
+      )
+      .select("source", "target", "collectedfrom", "relClass")
+
+    def invRel: String => String = { s =>
+      ScholexplorerUtils.invRel(s)
+    }
+
+    import org.apache.spark.sql.functions.udf
+    val inverseRelationUDF = udf(invRel)
+    // Same relation seen from the target: endpoints swapped, relation class inverted.
+    val inverseRelation = relDF.select(
+      col("target").alias("source"),
+      col("source").alias("target"),
+      col("collectedfrom"),
+      inverseRelationUDF(col("relClass")).alias("relClass")
+    )
+
+    val bidRel = inverseRelation
+      .union(relDF)
+      .withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
+      // Keep only key and value of every collectedfrom entry.
+      .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
+      .drop("collectedfrom")
+      .withColumnRenamed("cf", "collectedfrom")
+      .groupBy(col("id"))
+      .agg(
+        first("source").alias("source"),
+        first("target").alias("target"),
+        first("relClass").alias("relClass"),
+        first("collectedfrom").alias("collectedfrom")
+      )
+
+    bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
+
+  }
+
+  /** Joins relations and resources, previously stored under `outputPath`, into Scholix links
+    * and writes them as gzip-compressed text, one serialized link per line, under
+    * `outputPath/scholix`.
+    *
+    * A link is emitted only when both its source and its target have a Scholix resource.
+    * Relations for which `ScholexplorerUtils.generateScholix` returns `null` (relation class
+    * without a known semantic) are skipped.
+    *
+    * @param outputPath base path of the dump, holding the `relation` and `resource` folders
+    * @param spark      the session used to read and write
+    */
+  def generateScholix(outputPath: String, spark: SparkSession): Unit = {
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    val scholixByIdEncoder: Encoder[(String, Scholix)] =
+      Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))
+    val resourceByIdEncoder: Encoder[(String, ScholixResource)] =
+      Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))
+
+    import spark.implicits._
+    val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
+    val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
+
+    // Links with the source resolved; flatMap over an Option drops the relations that could
+    // not be converted, which would otherwise raise a NullPointerException on getIdentifier.
+    val scholix_one_verse = relations
+      .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
+      .flatMap(res =>
+        Option(ScholexplorerUtils.generateScholix(res._1, res._2))
+          .map(s => (s.getIdentifier, s))
+          .toList
+      )(scholixByIdEncoder)
+
+    // Target resources keyed by the id of the relation pointing at them.
+    val resourceTarget = relations
+      .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
+      .map(res => (res._1.id, res._2))(resourceByIdEncoder)
+
+    scholix_one_verse
+      .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
+      .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
+      .write
+      .mode(SaveMode.Overwrite)
+      .option("compression", "gzip")
+      .text(s"$outputPath/scholix")
+  }
+}
+
+/** Command line entry point of the Scholexplorer dump job. */
+object SparkCreateScholexplorerDump {
+  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
+
+  /** Builds the application with the bundled parameter description, initializes it and runs it.
+    *
+    * @param args the command line arguments, handed over to the application as they are
+    */
+  def main(args: Array[String]): Unit = {
+    val application = new SparkCreateScholexplorerDump(
+      "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json",
+      args,
+      logger
+    )
+    application.initialize().run()
+  }
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
new file mode 100644
index 0000000000..204fe97941
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala
@@ -0,0 +1,26 @@
+package eu.dnetlib.dhp.sx.graph.scholix
+
+import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
+import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.junit.jupiter.api.Test
+import org.objenesis.strategy.StdInstantiatorStrategy
+
+/** Manual smoke test of the Scholix generation against a sample stored on the local file system. */
+class ScholixGenerationTest {
+
+  /** Runs the last step of the dump on a local sample.
+    *
+    * The sample lives in a developer-specific folder, so the test is skipped (rather than
+    * failed) on every machine where that folder does not exist.
+    */
+  @Test
+  def generateScholix(): Unit = {
+
+    val workingPath = "/home/sandro/Downloads/scholix/"
+    // Checked before starting Spark, so that a skipped test costs nothing.
+    org.junit.jupiter.api.Assumptions.assumeTrue(
+      java.nio.file.Files.isDirectory(java.nio.file.Paths.get(workingPath)),
+      s"Scholix sample not available at $workingPath"
+    )
+
+    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
+    val app = new SparkCreateScholexplorerDump(null, null, null)
+//    app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
+//    app.generateBidirectionalRelations(
+//      "/home/sandro/Downloads/scholix_sample/",
+//      "/home/sandro/Downloads/scholix/",
+//      spark
+//    )
+    app.generateScholix(workingPath, spark)
+
+  }
+}
diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml
index e62fcdf198..4b4e6c1c4c 100644
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@@ -18,7 +18,7 @@
scala-compile-first
- initialize
+ process-resources
add-source
compile
@@ -59,12 +59,6 @@
com.jayway.jsonpath
json-path
-
-
- org.slf4j
- slf4j-api
-
-
dom4j
@@ -160,6 +154,26 @@
org.apache.zookeeper
zookeeper
+
+ ant
+ org.apache.ant
+
+
+ antlr4-runtime
+ org.antlr
+
+
+ woodstox-core
+ com.fasterxml.woodstox
+
+
+ log4j
+ *
+
+
+ org.apache.logging.log4j
+ *
+
@@ -206,5 +220,90 @@
+
+
+ spark-24
+
+ true
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-34
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ spark-35
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.4.0
+
+
+ generate-sources
+
+ add-source
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
index d49a0596b7..78154e0ab2 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java
@@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
+import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD()
.map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
- SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+ DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
}
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
index befebe0bb7..e1d19b66f2 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java
@@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
-import javax.swing.text.html.Option;
-
import org.apache.commons.lang3.StringUtils;
import org.stringtemplate.v4.ST;
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java
index b4d021b683..fbd647ae4d 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java
@@ -170,30 +170,19 @@ public class XmlSerializationUtils {
return sb.toString();
}
- // infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE
+ //
+ //
public static String usageMeasureAsXmlElement(String name, Measure measure) {
- HashSet dsIds = Optional
- .ofNullable(measure.getUnit())
- .map(
- m -> m
- .stream()
- .map(KeyValue::getKey)
- .collect(Collectors.toCollection(HashSet::new)))
- .orElse(new HashSet<>());
-
StringBuilder sb = new StringBuilder();
- dsIds.forEach(dsId -> {
+ for (KeyValue kv : measure.getUnit()) {
sb
.append("<")
- .append(name);
- for (KeyValue kv : measure.getUnit()) {
- sb.append(" ").append(attr(measure.getId(), kv.getValue()));
- }
- sb
+ .append(name)
.append(" ")
- .append(attr("datasource", dsId))
- .append("/>");
- });
+ .append(attr(measure.getId(), kv.getValue()))
+ .append(attr("datasource", kv.getKey()))
+ .append(" />");
+ }
return sb.toString();
}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
index 50acb4526f..a754c7a5da 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -15,8 +15,8 @@
validateXML
- should the payload converter validate the XMLs
false
+ should the payload converter validate the XMLs
relPartitions
diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
new file mode 100644
index 0000000000..295f0f54d7
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
@@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.sparksolr;
+
+import com.lucidworks.spark.util.SolrSupport;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.spark.rdd.RDD;
+
+/**
+ * Thin facade over the spark-solr {@code SolrSupport} utility, as compiled from the
+ * {@code sparksolr-3} source folder, so that callers can index documents through one
+ * stable signature.
+ */
+public class DHPSolrSupport {
+
+	/**
+	 * Indexes the given documents by delegating to {@code SolrSupport.indexDocs}.
+	 *
+	 * @param zkhost     the ZooKeeper host string handed to {@code SolrSupport}
+	 * @param collection the name of the target Solr collection
+	 * @param batchSize  the batch size handed to {@code SolrSupport}
+	 * @param docs       the documents to index
+	 */
+	public static void indexDocs(String zkhost, String collection, int batchSize, RDD docs) {
+		SolrSupport.indexDocs(zkhost, collection, batchSize, docs);
+	}
+}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
new file mode 100644
index 0000000000..6b85176a3b
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java
@@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.sparksolr;
+
+import com.lucidworks.spark.util.SolrSupport;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.spark.rdd.RDD;
+
+/**
+ * Thin facade over the spark-solr {@code SolrSupport} utility, as compiled from the
+ * {@code sparksolr-4} source folder, so that callers can index documents through one
+ * stable signature.
+ */
+public class DHPSolrSupport {
+
+	/**
+	 * Indexes the given documents by delegating to {@code SolrSupport.indexDocs}, with the
+	 * batch size expressed as a number of documents ({@code BatchSizeType.NUM_DOCS}).
+	 *
+	 * @param zkhost     the ZooKeeper host string handed to {@code SolrSupport}
+	 * @param collection the name of the target Solr collection
+	 * @param batchSize  the number of documents per batch
+	 * @param docs       the documents to index
+	 */
+	public static void indexDocs(String zkhost, String collection, int batchSize, RDD docs) {
+		SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs);
+	}
+}
diff --git a/dhp-workflows/dhp-stats-actionsets/pom.xml b/dhp-workflows/dhp-stats-actionsets/pom.xml
index 3daa8f9959..499c598f07 100644
--- a/dhp-workflows/dhp-stats-actionsets/pom.xml
+++ b/dhp-workflows/dhp-stats-actionsets/pom.xml
@@ -16,11 +16,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-hist-snaps/pom.xml b/dhp-workflows/dhp-stats-hist-snaps/pom.xml
index b31d909f97..8961f919ac 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/pom.xml
+++ b/dhp-workflows/dhp-stats-hist-snaps/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
index 059fb90894..ca0f7a6433 100644
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,8 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+# Prints the time elapsed since a start instant, formatted as "Elapsed time: HH:MM:SS".
+# Arguments:
+#   $1 - the start instant, in seconds since the epoch (as printed by "date +%s").
+#        The NAME of a variable holding that value works as well, since the argument is only
+#        used inside an arithmetic expansion, where bash resolves identifiers to their values.
+# All working variables are local, so that calling this function does not overwrite globals.
+function print_elapsed_time()
+{
+  local start_time=$1
+  local end_time elapsed_time hours minutes seconds
+  end_time=$(date +%s)
+  elapsed_time=$((end_time - start_time))
+  hours=$((elapsed_time / 3600))
+  minutes=$(((elapsed_time % 3600) / 60))
+  seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -67,7 +70,11 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ else
+ return 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -85,17 +92,30 @@ function copydb() {
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
- # Check the exit status of the "hadoop distcp" command.
- if [ $? -eq 0 ]; then
- echo -e "\nSuccessfully copied the files of '${db}'.\n"
+ if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+ echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ else
+ return 3
+ fi
fi
- # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+ # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
+ hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
+ # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+ if [ $? -ne 0 ]; then # Check the exit status..
+ echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
+ rm -f error.log
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 4
+ else
+ return 4
+ fi
+ fi
echo -e "\nCreating schema for db: '${db}'\n"
@@ -109,17 +129,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -127,12 +143,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
fi
fi
fi
@@ -176,7 +197,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -204,11 +227,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 8
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-monitor-irish/pom.xml b/dhp-workflows/dhp-stats-monitor-irish/pom.xml
index 6ab19dced3..6006323648 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-irish/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
index 1130a684da..ca0f7a6433 100644
--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+# Prints the time elapsed since a start instant, formatted as "Elapsed time: HH:MM:SS".
+# Arguments:
+#   $1 - the start instant, in seconds since the epoch (as printed by "date +%s").
+#        The NAME of a variable holding that value works as well, since the argument is only
+#        used inside an arithmetic expansion, where bash resolves identifiers to their values.
+# All working variables are local, so that calling this function does not overwrite globals.
+function print_elapsed_time()
+{
+  local start_time=$1
+  local end_time elapsed_time hours minutes seconds
+  end_time=$(date +%s)
+  elapsed_time=$((end_time - start_time))
+  hours=$((elapsed_time / 3600))
+  minutes=$(((elapsed_time % 3600) / 60))
+  seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -66,7 +70,11 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ else
+ return 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -84,17 +92,30 @@ function copydb() {
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
- # Check the exit status of the "hadoop distcp" command.
- if [ $? -eq 0 ]; then
- echo -e "\nSuccessfully copied the files of '${db}'.\n"
+ if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+ echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ else
+ return 3
+ fi
fi
- # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db; chmod_exit_status=$? # Capture the status right away, as the "[ ... ]" test below overwrites "$?".
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $chmod_exit_status -ne 0 ]; then # Check the exit status of the "hdfs dfs -chmod" command.
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: ${chmod_exit_status}\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi
echo -e "\nCreating schema for db: '${db}'\n"
@@ -108,17 +129,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -126,12 +143,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
fi
fi
fi
@@ -175,7 +197,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -203,11 +227,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 8
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-monitor-update/pom.xml b/dhp-workflows/dhp-stats-monitor-update/pom.xml
index f2bc35f8dc..86d5135faa 100644
--- a/dhp-workflows/dhp-stats-monitor-update/pom.xml
+++ b/dhp-workflows/dhp-stats-monitor-update/pom.xml
@@ -10,11 +10,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
index de275145b3..dd2203eef3 100644
--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
-
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
+function print_elapsed_time() # Prints, as HH:MM:SS, the time elapsed since "$1" (epoch-seconds, or the name of a variable holding them).
+{
+  local start_time=$1 # Used only inside arithmetic below, where bash resolves both a plain number and a variable-name.
+  local end_time=$(date +%s)
+  local elapsed_time=$((end_time - start_time)) # All variables are "local", so the caller's globals are not overwritten.
+  local hours=$((elapsed_time / 3600))
+  local minutes=$(((elapsed_time % 3600) / 60))
+  local seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -66,7 +70,11 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ else
+ return 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -84,17 +92,30 @@ function copydb() {
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
- # Check the exit status of the "hadoop distcp" command.
- if [ $? -eq 0 ]; then
- echo -e "\nSuccessfully copied the files of '${db}'.\n"
+ if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+ echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ else
+ return 3
+ fi
fi
- # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db; chmod_exit_status=$? # Capture the status right away, as the "[ ... ]" test below overwrites "$?".
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $chmod_exit_status -ne 0 ]; then # Check the exit status of the "hdfs dfs -chmod" command.
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: ${chmod_exit_status}\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi
echo -e "\nCreating schema for db: '${db}'\n"
@@ -108,17 +129,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -126,12 +143,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
fi
fi
fi
@@ -175,7 +197,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -203,11 +227,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 8
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
index 6fc0aa7456..918775f495 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@@ -6,6 +6,8 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
+SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
+
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
@@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
- exit 1
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 1
+ fi
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
-# Set the SED command arguments for column-names with reserved words:
-DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
-DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
-
-HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
-HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
-HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
-
-LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
-LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
-LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
-
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
+function print_elapsed_time() # Prints, as HH:MM:SS, the time elapsed since "$1" (epoch-seconds, or the name of a variable holding them).
+{
+  local start_time=$1 # Used only inside arithmetic below, where bash resolves both a plain number and a variable-name.
+  local end_time=$(date +%s)
+  local elapsed_time=$((end_time - start_time)) # All variables are "local", so the caller's globals are not overwritten.
+  local hours=$((elapsed_time / 3600))
+  local minutes=$(((elapsed_time % 3600) / 60))
+  local seconds=$((elapsed_time % 60))
+  printf "\nElapsed time: %02d:%02d:%02d\n\n" "$hours" "$minutes" "$seconds"
+}
+
+
function copydb() {
db=$1
+ start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
@@ -68,7 +72,11 @@ function copydb() {
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
- exit 2
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 2
+ else
+ return 2
+ fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
@@ -86,17 +94,30 @@ function copydb() {
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
- # Check the exit status of the "hadoop distcp" command.
- if [ $? -eq 0 ]; then
- echo -e "\nSuccessfully copied the files of '${db}'.\n"
+ if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+ echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log
- exit 3
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 3
+ else
+ return 3
+ fi
fi
- # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
- #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db; chmod_exit_status=$? # Capture the status right away, as the "[ ... ]" test below overwrites "$?".
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $chmod_exit_status -ne 0 ]; then # Check the exit status of the "hdfs dfs -chmod" command.
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: ${chmod_exit_status}\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi
echo -e "\nCreating schema for db: '${db}'\n"
@@ -110,17 +131,13 @@ function copydb() {
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
+ for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
- create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
-
- create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
+ create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
+ create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
- create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
- | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
- | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
- | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
+ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@@ -128,12 +145,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
- exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 5
+ fi
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 6
+ fi
fi
fi
fi
@@ -177,7 +199,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
- exit 5
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 7
+ fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else
@@ -205,11 +229,14 @@ function copydb() {
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
- exit 6
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+ exit 8
+ fi
fi
rm -f error.log
- echo -e "\n\nFinished processing db: ${db}\n\n"
+ echo -e "\n\nFinished processing db: ${db}\n"
+ print_elapsed_time start_db_time
}
STATS_DB=$1
@@ -233,6 +260,6 @@ copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
- tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
+ tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp}
done
\ No newline at end of file
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
index eb16a161e9..c0993ef0b3 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
@@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
- lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
-select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
+ lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
+ lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
+select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
- join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
+ join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
+ join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
+
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
index 2691d4b7ec..230a077f7e 100644
--- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
+++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java
@@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
@@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
.map(
(MapFunction) t -> OBJECT_MAPPER.readValue(t, Software.class),
Encoders.bean(Software.class))
- .filter(t -> t.getCodeRepositoryUrl() != null)
+ .filter((FilterFunction) t -> t.getCodeRepositoryUrl() != null)
.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
}
diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
index a9dbb09ae1..8ce9826e2a 100644
--- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml
+++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml
@@ -39,8 +39,8 @@
UTF-8
UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
+ 1.1.0-cdh5.16.2
+ 2.6.0-cdh5.16.2
@@ -72,7 +72,13 @@
org.apache.hadoop
hadoop-common
${cdh.hadoop.version}
-
+
+
+ jdk.tools
+ jdk.tools
+
+
+
eu.dnetlib.dhp
dhp-common
diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml
index 56aec73b78..4dd987f515 100644
--- a/dhp-workflows/dhp-usage-stats-build/pom.xml
+++ b/dhp-workflows/dhp-usage-stats-build/pom.xml
@@ -39,8 +39,8 @@
UTF-8
UTF-8
- 0.13.1-cdh5.2.1
- 2.5.0-cdh5.2.1
+ 1.1.0-cdh5.16.2
+ 2.6.0-cdh5.16.2
@@ -67,11 +67,23 @@
org.apache.hive
hive-jdbc
${cdh.hive.version}
-
+
+
+ jdk.tools
+ jdk.tools
+
+
+
org.apache.hadoop
hadoop-common
${cdh.hadoop.version}
+
+
+ jdk.tools
+ jdk.tools
+
+
eu.dnetlib.dhp
diff --git a/pom.xml b/pom.xml
index bd19bda499..cc8d509f7f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,955 +1,1107 @@
- 4.0.0
- eu.dnetlib.dhp
- dhp
- 1.2.5-SNAPSHOT
- pom
-
-
-
- GNU Affero General Public License v3.0 or later
- https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText
- repo
- This program is free software: you can redistribute it and/or modify it under the terms of the
- GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
-
-
-
-
- dhp-build
- dhp-pace-core
- dhp-common
- dhp-workflows
-
-
-
- Redmine
- https://support.openaire.eu/projects/openaire
-
-
-
- jenkins
- https://jenkins-dnet.d4science.org/
-
-
-
- scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
- scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
- https://code-repo.d4science.org/D-Net/dnet-hadoop/
- HEAD
-
-
- This module is the root descriptor for the dnet-hadoop project
-
-
-
-
-
-
- dnet45-releases
- D-Net 45 releases
- https://maven.d4science.org/nexus/content/repositories/dnet45-releases
- default
-
- false
-
-
- true
-
-
-
- dnet45-snapshots
- D-Net 45 snapshots
- https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
- default
-
- true
-
-
- false
-
-
-
- dnet45-bootstrap-snapshot
- D-Net 45 Bootstrap Snapshot
- https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/
-
- false
-
-
- true
-
- default
-
-
- dnet45-bootstrap-release
- D-Net 45 Bootstrap Release
- https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/
-
- true
-
-
- false
-
- default
-
-
- cloudera
- Cloudera Repository
- https://repository.cloudera.com/artifactory/cloudera-repos
-
- true
-
-
- false
-
-
-
- dnet-deps
- dnet-dependencies
- https://maven.d4science.org/nexus/content/repositories/dnet-deps
- default
-
-
- maven-restlet
- Restlet repository
- https://maven.restlet.talend.com
-
-
- conjars
- conjars
- https://conjars.wensel.net/repo/
-
-
-
-
-
- org.junit.jupiter
- junit-jupiter
- ${junit-jupiter.version}
- test
-
-
-
- org.mockito
- mockito-core
- ${mockito-core.version}
- test
-
-
-
- org.mockito
- mockito-junit-jupiter
- ${mockito-core.version}
- test
-
-
-
-
-
-
-
- eu.dnetlib.dhp
- ${dhp-schemas.artifact}
- ${dhp-schemas.version}
-
-
- org.apache.hadoop
- hadoop-hdfs
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-common
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-client
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.hadoop
- hadoop-distcp
- ${dhp.hadoop.version}
- provided
-
-
- org.apache.spark
- spark-core_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-sql_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-graphx_${scala.binary.version}
- ${dhp.spark.version}
- provided
-
-
- org.apache.spark
- spark-hive_${scala.binary.version}
- ${dhp.spark.version}
- test
-
-
-
- org.slf4j
- jcl-over-slf4j
- 1.7.25
- provided
-
-
-
- org.apache.commons
- commons-lang3
- ${dhp.commons.lang.version}
-
-
-
- commons-validator
- commons-validator
- 1.7
-
-
-
- com.github.sisyphsu
- dateparser
- 1.0.7
-
-
-
- me.xuender
- unidecode
- 0.0.7
-
-
-
- com.google.guava
- guava
- ${dhp.guava.version}
-
-
-
-
- commons-codec
- commons-codec
- 1.9
-
-
-
- commons-io
- commons-io
- 2.4
-
-
-
- commons-cli
- commons-cli
- 1.2
- provided
-
-
-
- net.sf.saxon
- Saxon-HE
- 9.9.1-6
-
-
-
- dom4j
- dom4j
- 1.6.1
-
-
-
- xml-apis
- xml-apis
- 1.4.01
-
-
-
- jaxen
- jaxen
- 1.1.6
-
-
-
- com.mycila.xmltool
- xmltool
- 3.3
-
-
-
- org.apache.solr
- solr-solrj
- ${solr.version}
-
-
- *
- *
-
-
-
-
- com.lucidworks.spark
- spark-solr
- ${sparksolr.version}
-
-
- *
- *
-
-
-
-
- org.apache.solr
- solr-test-framework
- ${solr.version}
- test
-
-
- io.dropwizard.metrics
- metrics-core
- 3.2.6
- test
-
-
-
-
- org.apache.httpcomponents
- httpclient
- ${org.apache.httpcomponents.version}
-
-
- org.apache.httpcomponents
- httpmime
- ${org.apache.httpcomponents.version}
-
-
- org.noggit
- noggit
- 0.8
-
-
- org.apache.zookeeper
- zookeeper
- 3.4.11
-
-
-
- net.schmizz
- sshj
- 0.10.0
- test
-
-
-
- com.fasterxml.jackson.core
- jackson-core
- ${dhp.jackson.version}
- provided
-
-
-
- com.fasterxml.jackson.core
- jackson-annotations
- ${dhp.jackson.version}
- provided
-
-
- com.fasterxml.jackson.core
- jackson-databind
- ${dhp.jackson.version}
- provided
-
-
-
- eu.dnetlib
- dnet-actionmanager-common
- ${dnet-actionmanager-common.version}
-
-
- org.apache.hadoop
- hadoop-common
-
-
-
-
- eu.dnetlib
- dnet-actionmanager-api
- ${dnet-actionmanager-api.version}
-
-
- eu.dnetlib
- cnr-misc-utils
-
-
-
-
-
- eu.dnetlib
- cnr-rmi-api
- ${cnr-rmi-api.version}
-
-
-
- eu.dnetlib.dhp
- dnet-openaire-broker-common
- ${dnet-openaire-broker-common.version}
-
-
-
- org.apache.cxf
- cxf-rt-transports-http
- 3.1.5
-
-
- javax.persistence
- javax.persistence-api
- 2.2
- provided
-
-
-
- com.jayway.jsonpath
- json-path
- 2.4.0
-
-
- com.arakelian
- java-jq
- 0.10.1
-
-
- edu.cmu
- secondstring
- 1.0.0
-
-
- org.mongodb
- mongo-java-driver
- ${mongodb.driver.version}
-
-
- io.fares.junit.mongodb
- mongodb-junit-test
- 1.1.0
-
-
- org.postgresql
- postgresql
- 42.2.10
-
-
-
- org.antlr
- stringtemplate
- 3.2.1
-
-
-
- org.antlr
- ST4
- 4.3.4
-
-
-
- com.ximpleware
- vtd-xml
- ${vtd.version}
-
-
-
- org.elasticsearch
- elasticsearch-hadoop
- 7.6.0
-
-
-
-
- org.apache.oozie
- oozie-client
- ${dhp.oozie.version}
- provided
-
-
-
- slf4j-simple
- org.slf4j
-
-
-
-
-
-
- com.squareup.okhttp3
- okhttp
- ${okhttp.version}
-
-
-
- org.apache.commons
- commons-compress
- ${common.compress.version}
-
-
-
-
- org.apache.commons
- commons-csv
- ${common.csv.version}
-
-
-
-
-
- org.apache.poi
- poi-ooxml
- ${apache.poi.version}
-
-
-
- org.json
- json
- 20180813
-
-
-
- org.json4s
- json4s-jackson_${scala.binary.version}
- ${json4s.version}
-
-
-
- com.github.victools
- jsonschema-generator
- ${jsonschemagenerator.version}
-
-
-
- org.apache.commons
- commons-text
- ${common.text.version}
-
-
-
- com.opencsv
- opencsv
- 5.5
-
-
- io.github.classgraph
- classgraph
- 4.8.71
-
-
-
- com.fasterxml.jackson.dataformat
- jackson-dataformat-xml
- ${jackson.version}
- provided
-
-
- com.fasterxml.jackson.module
- jackson-module-jsonSchema
- ${jackson.version}
- provided
-
-
-
-
- org.apache.commons
- commons-math3
- 3.6.1
-
-
-
-
- com.google.code.gson
- gson
- ${google.gson.version}
-
-
-
- commons-collections
- commons-collections
- ${commons.collections.version}
-
-
- commons-logging
- commons-logging
- ${commons.logging.version}
-
-
-
- org.reflections
- reflections
- 0.9.10
-
-
-
- org.scala-lang
- scala-library
- ${scala.version}
-
-
-
- com.ibm.icu
- icu4j
- 70.1
-
-
-
-
-
- target
- target/classes
- ${project.artifactId}-${project.version}
- target/test-classes
-
-
-
- org.apache.maven.plugins
- maven-plugin-plugin
- 3.3
-
-
- org.apache.maven.plugins
- maven-project-info-reports-plugin
- 3.0.0
-
-
- org.apache.maven.plugins
- maven-site-plugin
- 3.9.1
-
- ${dhp.site.skip}
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- ${maven.compiler.plugin.version}
-
-
- 1.8
- ${project.build.sourceEncoding}
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 3.0.2
-
-
-
- org.apache.maven.plugins
- maven-source-plugin
- 3.0.1
-
-
- attach-sources
- verify
-
- jar-no-fork
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M4
-
- true
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
-
- true
- none
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
- 3.6.0
-
-
-
- net.revelc.code.formatter
- formatter-maven-plugin
- 2.11.0
-
-
- eu.dnetlib.dhp
- dhp-code-style
- ${project.version}
-
-
-
-
- org.antipathy
- mvn-scalafmt_${scala.binary.version}
- 1.0.1640073709.733712b
-
-
- eu.dnetlib.dhp
- dhp-code-style
- ${project.version}
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-site-plugin
-
-
- org.apache.maven.plugins
- maven-project-info-reports-plugin
-
-
- net.revelc.code.formatter
- formatter-maven-plugin
-
-
-
- format
-
-
- eclipse/formatter_dnet.xml
-
-
-
-
-
- net.revelc.code
- impsort-maven-plugin
- 1.4.1
-
- java.,javax.,org.,com.
- java,*
-
- **/thrift/*.java
-
-
-
-
- sort-imports
-
- sort
-
-
-
-
-
- org.antipathy
- mvn-scalafmt_${scala.binary.version}
-
- https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
- false
- false
-
- ${project.basedir}/src/main/scala
-
-
- ${project.basedir}/src/test/scala
-
- false
- false
- : git rev-parse --abbrev-ref HEAD
- false
-
-
-
- validate
-
- format
-
-
-
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 2.5.3
-
-
- org.jacoco
- jacoco-maven-plugin
- 0.7.9
-
-
- **/schemas/*
- **/com/cloudera/**/*
- **/org/apache/avro/io/**/*
-
-
-
-
- default-prepare-agent
-
- prepare-agent
-
-
-
- default-report
- prepare-package
-
- report
-
-
-
-
-
-
-
-
-
- org.apache.maven.wagon
- wagon-ssh
- 2.10
-
-
-
-
-
- dnet45-snapshots
- DNet45 Snapshots
- https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
- default
-
-
- dnet45-releases
- https://maven.d4science.org/nexus/content/repositories/dnet45-releases
-
-
- DHPSite
- ${dhp.site.stage.path}/
-
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
-
- true
- none
-
-
-
-
-
-
- sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop
- UTF-8
- UTF-8
- 3.6.0
- 1.8
- 1.8
- 2.22.2
- 2.0.1
- cdh5.9.2
- 2.6.0-${dhp.cdh.version}
- 4.1.0-${dhp.cdh.version}
- dhp-schemas
- 3.6.0
- 2.4.0.cloudera2
- 2.9.6
- 3.5
- true
- 11.0.2
- 2.11.12
- 2.11
- 1.3.0
- 5.6.1
- 3.3.3
- 3.4.2
- [2.12,3.0)
- [6.1.2]
- [4.0.3]
- [6.0.5]
- [3.1.6]
- [2.6.1]
- 7.5.0
- 4.7.2
- 1.20
- 3.5.3
- 4.13.0
- 1.8
- 4.1.2
- 1.8
- 4.5.3
- 4.0.1
- 2.2.2
- 1.1.3
- 3.2.1
-
-
-
-
-
- scala-2.12
-
- 2.12
- 2.12.18
-
-
-
- 4.0.2
- 3.4.1
- 2.14.2
- 3.12.0
- 3.7.0-M11
- 4.8.1
-
-
-
-
-
-
-
- arm-silicon-mac
-
-
- aarch64
- mac
-
-
-
-
-
- org.xerial.snappy
- snappy-java
- 1.1.8.4
-
-
-
-
+ 4.0.0
+ eu.dnetlib.dhp
+ dhp
+ 1.2.5-SNAPSHOT
+ pom
+
+
+
+ GNU Affero General Public License v3.0 or later
+ https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText
+ repo
+ This program is free software: you can redistribute it and/or modify it under the terms of the
+ GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+
+
+
+
+ dhp-build
+ dhp-pace-core
+ dhp-common
+ dhp-workflows
+ dhp-shade-package
+
+
+
+ Redmine
+ https://support.openaire.eu/projects/openaire
+
+
+
+ jenkins
+ https://jenkins-dnet.d4science.org/
+
+
+
+ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
+ scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git
+ https://code-repo.d4science.org/D-Net/dnet-hadoop/
+ HEAD
+
+
+ This module is the root descriptor for the dnet-hadoop project
+
+
+
+
+
+
+
+ Openaire-third-parties-snaphot
+ Openaire third parties Snapshot
+ https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/
+
+ false
+
+
+ true
+
+
+
+
+ dnet45-releases
+ D-Net 45 releases
+ https://maven.d4science.org/nexus/content/repositories/dnet45-releases
+ default
+
+ false
+
+
+ true
+
+
+
+ dnet45-snapshots
+ D-Net 45 snapshots
+ https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
+ default
+
+ true
+
+
+ false
+
+
+
+ dnet45-bootstrap-snapshot
+ D-Net 45 Bootstrap Snapshot
+ https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/
+
+ false
+
+
+ true
+
+ default
+
+
+ dnet45-bootstrap-release
+ D-Net 45 Bootstrap Release
+ https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/
+
+ true
+
+
+ false
+
+ default
+
+
+ cloudera
+ Cloudera Repository
+ https://repository.cloudera.com/artifactory/cloudera-repos
+
+ true
+
+
+ false
+
+
+
+ dnet-deps
+ dnet-dependencies
+ https://maven.d4science.org/nexus/content/repositories/dnet-deps
+ default
+
+
+ maven-restlet
+ Restlet repository
+ https://maven.restlet.talend.com
+
+
+ conjars
+ conjars
+ https://conjars.wensel.net/repo/
+
+
+
+
+
+
+ org.projectlombok
+ lombok
+ 1.18.28
+ provided
+
+
+ org.junit.jupiter
+ junit-jupiter
+ ${junit-jupiter.version}
+ test
+
+
+
+ org.mockito
+ mockito-core
+ ${mockito-core.version}
+ test
+
+
+
+ org.mockito
+ mockito-junit-jupiter
+ ${mockito-core.version}
+ test
+
+
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${dhp-schemas.version}
+
+
+ org.apache.hadoop
+ hadoop-hdfs
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-client
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.hadoop
+ hadoop-distcp
+ ${dhp.hadoop.version}
+ provided
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-graphx_${scala.binary.version}
+ ${dhp.spark.version}
+ provided
+
+
+ org.apache.spark
+ spark-hive_${scala.binary.version}
+ ${dhp.spark.version}
+ test
+
+
+
+ org.slf4j
+ slf4j-api
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.slf4j
+ slf4j-log4j12
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.slf4j
+ jcl-over-slf4j
+ ${org.slf4j.version}
+ provided
+
+
+
+ org.apache.logging.log4j
+ log4j-slf4j2-impl
+ ${log4j.version}
+
+
+ org.apache.logging.log4j
+ log4j-api
+ ${log4j.version}
+
+
+ org.apache.logging.log4j
+ log4j-core
+ ${log4j.version}
+
+
+
+ org.apache.logging.log4j
+ log4j-1.2-api
+ ${log4j.version}
+
+
+
+ org.apache.commons
+ commons-lang3
+ ${dhp.commons.lang.version}
+
+
+
+ org.apache.commons
+ commons-beanutils
+ ${commons-beanutils.version}
+
+
+
+
+ commons-validator
+ commons-validator
+ ${commons-validator.version}
+
+
+
+ com.github.sisyphsu
+ dateparser
+ ${dateparser.version}
+
+
+
+ me.xuender
+ unidecode
+ ${unidecode.version}
+
+
+
+ com.google.guava
+ guava
+ ${dhp.guava.version}
+
+
+
+
+ commons-codec
+ commons-codec
+ ${commons-codec.version}
+
+
+
+ commons-io
+ commons-io
+ ${commons-io.version}
+
+
+
+ commons-cli
+ commons-cli
+ 1.2
+ provided
+
+
+
+ net.sf.saxon
+ Saxon-HE
+ 9.9.1-6
+
+
+
+ dom4j
+ dom4j
+ 1.6.1
+
+
+
+ xml-apis
+ xml-apis
+ 1.4.01
+
+
+
+ jaxen
+ jaxen
+ 1.1.6
+
+
+
+ com.mycila.xmltool
+ xmltool
+ 3.3
+
+
+
+ org.apache.solr
+ solr-solrj
+ ${solr.version}
+
+
+ *
+ *
+
+
+
+
+ com.lucidworks.spark
+ spark-solr
+ ${sparksolr.version}
+
+
+ *
+ *
+
+
+
+
+ org.apache.solr
+ solr-test-framework
+ ${solr.version}
+ test
+
+
+ io.dropwizard.metrics
+ metrics-core
+ 3.2.6
+ test
+
+
+
+
+ org.apache.httpcomponents
+ httpclient
+ ${org.apache.httpcomponents.version}
+
+
+ org.apache.httpcomponents
+ httpmime
+ ${org.apache.httpcomponents.version}
+
+
+ org.noggit
+ noggit
+ 0.8
+
+
+ org.apache.zookeeper
+ zookeeper
+ ${zookeeper.version}
+
+
+
+ net.schmizz
+ sshj
+ 0.10.0
+ test
+
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ ${dhp.jackson.version}
+ provided
+
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ ${dhp.jackson.version}
+ provided
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ ${dhp.jackson.version}
+ provided
+
+
+
+ eu.dnetlib
+ dnet-actionmanager-common
+ ${dnet-actionmanager-common.version}
+
+
+ org.apache.hadoop
+ hadoop-common
+
+
+
+
+ eu.dnetlib
+ dnet-actionmanager-api
+ ${dnet-actionmanager-api.version}
+
+
+ eu.dnetlib
+ cnr-misc-utils
+
+
+
+
+
+ eu.dnetlib
+ cnr-rmi-api
+ ${cnr-rmi-api.version}
+
+
+
+ eu.dnetlib.dhp
+ dnet-openaire-broker-common
+ ${dnet-openaire-broker-common.version}
+
+
+
+ org.apache.cxf
+ cxf-rt-transports-http
+ 3.1.5
+
+
+
+ javax.persistence
+ javax.persistence-api
+ 2.2
+ provided
+
+
+
+ com.jayway.jsonpath
+ json-path
+ 2.4.0
+
+
+ com.arakelian
+ java-jq
+ 0.10.1
+
+
+ edu.cmu
+ secondstring
+ 1.0.0
+
+
+ org.mongodb
+ mongo-java-driver
+ ${mongodb.driver.version}
+
+
+ io.fares.junit.mongodb
+ mongodb-junit-test
+ 1.1.0
+
+
+ org.postgresql
+ postgresql
+ 42.2.10
+
+
+
+ org.antlr
+ stringtemplate
+ 3.2.1
+
+
+
+ org.antlr
+ ST4
+ 4.3.4
+
+
+
+ com.ximpleware
+ vtd-xml
+ ${vtd.version}
+
+
+
+ org.elasticsearch
+ elasticsearch-hadoop
+ 7.6.0
+
+
+
+
+ org.apache.oozie
+ oozie-client
+ ${dhp.oozie.version}
+ provided
+
+
+
+ slf4j-simple
+ org.slf4j
+
+
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ ${okhttp.version}
+
+
+
+ org.apache.commons
+ commons-compress
+ ${common.compress.version}
+
+
+ org.apache.commons
+ commons-csv
+ ${common.csv.version}
+
+
+ org.apache.poi
+ poi-ooxml
+ ${apache.poi.version}
+
+
+
+ org.json
+ json
+ 20180813
+
+
+
+ org.json4s
+ json4s-jackson_${scala.binary.version}
+ ${json4s.version}
+
+
+
+ com.github.victools
+ jsonschema-generator
+ ${jsonschemagenerator.version}
+
+
+
+ org.apache.commons
+ commons-text
+ ${common.text.version}
+
+
+
+ com.opencsv
+ opencsv
+ 5.5
+
+
+ io.github.classgraph
+ classgraph
+ 4.8.71
+
+
+
+ com.fasterxml.jackson.dataformat
+ jackson-dataformat-xml
+ ${jackson.version}
+ provided
+
+
+ com.fasterxml.jackson.module
+ jackson-module-jsonSchema
+ ${jackson.version}
+ provided
+
+
+
+ org.apache.commons
+ commons-math3
+ 3.6.1
+
+
+
+ com.google.code.gson
+ gson
+ ${google.gson.version}
+
+
+
+ commons-collections
+ commons-collections
+ ${commons.collections.version}
+
+
+ commons-logging
+ commons-logging
+ ${commons.logging.version}
+
+
+
+ org.reflections
+ reflections
+ ${reflections.version}
+
+
+
+ org.scala-lang
+ scala-library
+ ${scala.version}
+
+
+
+ com.ibm.icu
+ icu4j
+ 70.1
+
+
+
+ org.javassist
+ javassist
+ ${javassist.version}
+
+
+
+
+
+ target
+ target/classes
+ ${project.artifactId}-${project.version}
+ target/test-classes
+
+
+
+ org.apache.maven.plugins
+ maven-plugin-plugin
+ 3.3
+
+
+ org.apache.maven.plugins
+ maven-project-info-reports-plugin
+ 3.0.0
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+ 3.9.1
+
+ ${dhp.site.skip}
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ ${maven.compiler.plugin.version}
+
+
+ 1.8
+ ${project.build.sourceEncoding}
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.0.2
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.0.1
+
+
+ attach-sources
+ verify
+
+ jar-no-fork
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M4
+
+ true
+ false
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.2.0
+
+ true
+ none
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.6.0
+
+
+
+ net.revelc.code.formatter
+ formatter-maven-plugin
+ 2.11.0
+
+
+ eu.dnetlib.dhp
+ dhp-code-style
+ ${project.version}
+
+
+
+
+ org.antipathy
+ mvn-scalafmt_${scala.binary.version}
+ 1.0.1640073709.733712b
+
+
+ eu.dnetlib.dhp
+ dhp-code-style
+ ${project.version}
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-site-plugin
+
+
+ org.apache.maven.plugins
+ maven-project-info-reports-plugin
+
+
+ net.revelc.code.formatter
+ formatter-maven-plugin
+
+
+
+ format
+
+
+ eclipse/formatter_dnet.xml
+
+
+
+
+
+ net.revelc.code
+ impsort-maven-plugin
+ 1.6.2
+
+ java.,javax.,org.,com.
+ java,*
+
+ **/thrift/*.java
+
+
+
+
+ sort-imports
+
+ sort
+
+
+
+
+
+ org.antipathy
+ mvn-scalafmt_${scala.binary.version}
+
+
+ https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
+
+ false
+ false
+
+ ${project.basedir}/src/main/scala
+
+
+ ${project.basedir}/src/test/scala
+
+ false
+ false
+ : git rev-parse --abbrev-ref HEAD
+ false
+
+
+
+ validate
+
+ format
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-release-plugin
+ 2.5.3
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.10
+
+
+ **/schemas/*
+ **/com/cloudera/**/*
+ **/org/apache/avro/io/**/*
+
+
+
+
+ default-prepare-agent
+
+ prepare-agent
+
+
+
+ default-report
+ prepare-package
+
+ report
+
+
+
+
+
+
+
+
+
+ org.apache.maven.wagon
+ wagon-ssh
+ 2.10
+
+
+
+
+
+ dnet45-snapshots
+ DNet45 Snapshots
+ https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots
+ default
+
+
+ dnet45-releases
+ https://maven.d4science.org/nexus/content/repositories/dnet45-releases
+
+
+ DHPSite
+ ${dhp.site.stage.path}/
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+ true
+ none
+
+
+
+
+
+
+ sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop
+ UTF-8
+ UTF-8
+ 1.8
+ 1.8
+
+
+ 2.11.12
+ 2.11
+
+
+ 3.6.0
+ 2.22.2
+ 2.0.1
+ 4.0.1
+
+
+ 4.1.2
+ [2.6.1]
+ 1.20
+ 1.8
+ 1.8
+ 1.9.4
+ 1.9
+ 3.2.1
+ 2.4
+ 1.1.3
+ 1.7
+ 1.0.7
+ [6.1.2]
+ cdh5.9.2
+ 3.5
+ 11.0.2
+ 2.6.0-${dhp.cdh.version}
+ 2.9.6
+ 4.1.0-${dhp.cdh.version}
+ true
+ 2.4.0.cloudera2
+ [4.0.3]
+ [6.0.5]
+ [3.1.6]
+ 2.2.2
+ 1.2.17
+ 3.19.0-GA
+ 3.5.3
+ 4.13.0
+ 5.6.1
+ 3.3.3
+ 3.4.2
+ 4.7.2
+ 4.5.3
+ 1.7.25
+ 0.9.10
+ 1.3.0
+ 7.5.0
+ 3.6.0
+ 0.0.7
+ [2.12,3.0)
+ 3.4.6
+
+
+
+
+
+ spark-34
+
+ 2.12
+ 2.12.18
+ 1.3.0
+
+
+ 4.8.1
+
+
+ 1.22
+ 1.8
+ 1.10.0
+ 1.9.4
+ 1.15
+ 3.2.2
+ 2.11.0
+ 1.1.3
+ 1.7
+
+ 14.0.1
+ 8.11.0
+ 4.0.4
+ 3.4.2.openaire
+ 2.14.2
+ 3.12.0
+ 2.19.0
+ 3.7.0-M11
+ 3.25.0-GA
+ 4.10.0
+ 2.0.6
+ 0.10.2
+ 3.6.3
+
+
+
+
+ spark-35
+
+ 2.12
+ 2.12.18
+ 1.3.0
+
+
+ 4.8.1
+
+
+ 1.23.0
+ 1.8
+ 1.10.0
+ 1.9.4
+ 1.16.0
+ 3.2.2
+ 2.13.0
+ 1.1.3
+ 1.7
+
+ 14.0.1
+ 8.11.0
+ 4.0.4
+ 3.5.1.openaire-SNAPSHOT
+ 2.15.2
+ 3.12.0
+ 2.20.0
+ 3.7.0-M11
+ 3.25.0-GA
+ 4.10.0
+ 2.0.7
+ 0.10.2
+ 3.6.3
+
+
+
+
+ java11
+
+ [11
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M4
+
+
+ --add-opens=java.base/java.lang=ALL-UNNAMED
+ --add-opens=java.base/java.lang.invoke=ALL-UNNAMED
+ --add-opens=java.base/java.lang.reflect=ALL-UNNAMED
+ --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED
+ --add-opens=java.base/java.nio=ALL-UNNAMED
+ --add-opens=java.base/java.util=ALL-UNNAMED
+ --add-opens=java.base/java.util.concurrent=ALL-UNNAMED
+ --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
+ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+ --add-opens=java.base/sun.nio.cs=ALL-UNNAMED
+ --add-opens=java.base/sun.security.action=ALL-UNNAMED
+ --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+
+ true
+ false
+
+
+
+
+
+
\ No newline at end of file