forked from D-Net/dnet-hadoop
Merge branch 'beta' into dblp_collection_plugin
commit 05ee783c07
@@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
@@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
 mojo.outputFile = testFolder;

 // execute
-Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+try {
+mojo.execute();
+Assertions.assertTrue(false); // not reached
+} catch (Exception e) {
+Assertions
+.assertTrue(
+MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+}
 }

 @Test
@@ -70,10 +70,7 @@
 <groupId>com.ibm.icu</groupId>
 <artifactId>icu4j</artifactId>
 </dependency>
-<dependency>
-<groupId>org.apache.hadoop</groupId>
-<artifactId>hadoop-common</artifactId>
-</dependency>
 <dependency>
 <groupId>com.github.sisyphsu</groupId>
 <artifactId>dateparser</artifactId>
@@ -163,7 +160,7 @@

 <dependency>
 <groupId>eu.dnetlib.dhp</groupId>
-<artifactId>${dhp-schemas.artifact}</artifactId>
+<artifactId>dhp-schemas</artifactId>
 </dependency>

 <dependency>
@@ -172,4 +169,23 @@
 </dependency>
 </dependencies>

+<!-- dependencies required on JDK9+ because J2EE has been removed -->
+<profiles>
+<profile>
+<id>spark-34</id>
+<dependencies>
+<dependency>
+<groupId>javax.xml.bind</groupId>
+<artifactId>jaxb-api</artifactId>
+<version>2.2.11</version>
+</dependency>
+<dependency>
+<groupId>com.sun.xml.ws</groupId>
+<artifactId>jaxws-ri</artifactId>
+<version>2.3.3</version>
+<type>pom</type>
+</dependency>
+</dependencies>
+</profile>
+</profiles>
 </project>
@@ -38,7 +38,7 @@ public class PacePerson {
 PacePerson.class
 .getResourceAsStream(
 "/eu/dnetlib/dhp/common/name_particles.txt")));
-} catch (IOException e) {
+} catch (Exception e) {
 throw new RuntimeException(e);
 }
 }
@@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
 * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
 * concept_rec_id = 656930
 * @return response code
-* @throws IOException
-* @throws MissingConceptDoiException
 */
 public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
 setDepositionId(concept_rec_id, 1);
@@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -154,5 +154,13 @@
 "unknown":{
 "original":"Unknown",
 "inverse":"Unknown"
+},
+"isamongtopnsimilardocuments": {
+"original": "IsAmongTopNSimilarDocuments",
+"inverse": "HasAmongTopNSimilarDocuments"
+},
+"hasamongtopnsimilardocuments": {
+"original": "HasAmongTopNSimilarDocuments",
+"inverse": "IsAmongTopNSimilarDocuments"
 }
 }
@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
 val conf: SparkConf = new SparkConf()
 val master = parser.get("master")
 log.info(s"Creating Spark session: Master: $master")
-SparkSession
+val b = SparkSession
 .builder()
 .config(conf)
 .appName(getClass.getSimpleName)
-.master(master)
-.getOrCreate()
+if (master != null)
+b.master(master)
+b.getOrCreate()
 }

 def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
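Note: the hunk above makes the Spark master optional, so the same launcher works both locally (explicit master) and when spark-submit/YARN already injects one. A minimal sketch of the same pattern, not the project's actual helper:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Sketch only: force the master only when one was parsed from the arguments,
// otherwise defer to the configuration provided by spark-submit.
def createSparkSession(conf: SparkConf, appName: String, master: String): SparkSession = {
  val builder = SparkSession.builder().config(conf).appName(appName)
  if (master != null) builder.master(master)
  builder.getOrCreate()
}
```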
@@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
 }

 def generateScholixResourceFromResult(r: Result): ScholixResource = {
+val sum = ScholixUtils.resultToSummary(r)
+if (sum != null)
 generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+else
+null
 }

 val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {

 }

+def invRel(rel: String): String = {
+val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
+if (semanticRelation != null)
+semanticRelation.inverse
+else
+null
+}
+
 def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
 if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
 val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
 if (persistentIdentifiers.isEmpty)
 return null
 s.setLocalIdentifier(persistentIdentifiers.asJava)
-if (r.isInstanceOf[Publication])
-s.setTypology(Typology.publication)
-else
-s.setTypology(Typology.dataset)
+// s.setTypology(r.getResulttype.getClassid)

 s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

@@ -24,7 +24,7 @@
 <executions>
 <execution>
 <id>scala-compile-first</id>
-<phase>initialize</phase>
+<phase>process-resources</phase>
 <goals>
 <goal>add-source</goal>
 <goal>compile</goal>
@@ -59,14 +59,6 @@
 <groupId>edu.cmu</groupId>
 <artifactId>secondstring</artifactId>
 </dependency>
-<dependency>
-<groupId>com.google.guava</groupId>
-<artifactId>guava</artifactId>
-</dependency>
-<dependency>
-<groupId>com.google.code.gson</groupId>
-<artifactId>gson</artifactId>
-</dependency>
 <dependency>
 <groupId>org.apache.commons</groupId>
 <artifactId>commons-lang3</artifactId>
@@ -91,10 +83,6 @@
 <groupId>com.fasterxml.jackson.core</groupId>
 <artifactId>jackson-databind</artifactId>
 </dependency>
-<dependency>
-<groupId>org.apache.commons</groupId>
-<artifactId>commons-math3</artifactId>
-</dependency>
 <dependency>
 <groupId>com.jayway.jsonpath</groupId>
 <artifactId>json-path</artifactId>
@@ -113,4 +101,90 @@
 </dependency>
 </dependencies>

+<profiles>
+<profile>
+<id>spark-24</id>
+<activation>
+<activeByDefault>true</activeByDefault>
+</activation>
+
+<build>
+<plugins>
+<plugin>
+<groupId>org.codehaus.mojo</groupId>
+<artifactId>build-helper-maven-plugin</artifactId>
+<version>3.4.0</version>
+<executions>
+<execution>
+<phase>generate-sources</phase>
+<goals>
+<goal>add-source</goal>
+</goals>
+<configuration>
+<sources>
+<source>src/main/spark-2</source>
+</sources>
+</configuration>
+</execution>
+</executions>
+</plugin>
+</plugins>
+</build>
+</profile>
+
+<profile>
+<id>spark-34</id>
+
+<build>
+<plugins>
+<plugin>
+<groupId>org.codehaus.mojo</groupId>
+<artifactId>build-helper-maven-plugin</artifactId>
+<version>3.4.0</version>
+<executions>
+<execution>
+<phase>generate-sources</phase>
+<goals>
+<goal>add-source</goal>
+</goals>
+<configuration>
+<sources>
+<source>src/main/spark-2</source>
+</sources>
+</configuration>
+</execution>
+</executions>
+</plugin>
+</plugins>
+</build>
+</profile>
+
+<profile>
+<id>spark-35</id>
+
+<build>
+<plugins>
+<plugin>
+<groupId>org.codehaus.mojo</groupId>
+<artifactId>build-helper-maven-plugin</artifactId>
+<version>3.4.0</version>
+<executions>
+<execution>
+<phase>generate-sources</phase>
+<goals>
+<goal>add-source</goal>
+</goals>
+<configuration>
+<sources>
+<source>src/main/spark-35</source>
+</sources>
+</configuration>
+</execution>
+</executions>
+</plugin>
+</plugins>
+</build>
+</profile>
+</profiles>
+
 </project>
@@ -1,12 +1,6 @@

 package eu.dnetlib.pace.common;

-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@@ -15,6 +9,13 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
 /**
 * Set of common functions for the framework
 *
@@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
 val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

 val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-df.map(r => rowFromJson(r))(RowEncoder(schema))
+df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
 }

 def rowFromJson(json: String): Row = {
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+RowEncoder(schema)
+}
+}
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+ExpressionEncoder(schema)
+}
+}
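Note: the two new files above define the same `SparkCompatUtils.encoderFor` in different source trees; the build-helper profiles added earlier (spark-2 vs spark-35 source directories) pick which one is compiled, presumably because `RowEncoder(schema)` is no longer usable on Spark 3.5 while `ExpressionEncoder(schema)` is. A sketch of a version-agnostic call site under that assumption (`rowFromJson` here stands for any String-to-Row parser):

```scala
import eu.dnetlib.pace.util.SparkCompatUtils
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.types.StructType

// Sketch only: the caller never references RowEncoder or ExpressionEncoder directly,
// so the same code compiles against whichever SparkCompatUtils variant is selected.
def parseJson(df: Dataset[String], schema: StructType, rowFromJson: String => Row): Dataset[Row] =
  df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
```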
@@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

@@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<modelVersion>4.0.0</modelVersion>
+<parent>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp</artifactId>
+<version>1.2.5-SNAPSHOT</version>
+<relativePath>../pom.xml</relativePath>
+
+</parent>
+
+<artifactId>dhp-shade-package</artifactId>
+<packaging>jar</packaging>
+
+<distributionManagement>
+<site>
+<id>DHPSite</id>
+<url>${dhp.site.stage.path}/dhp-common</url>
+</site>
+</distributionManagement>
+
+<description>This module create a jar of all module dependencies</description>
+
+
+<dependencies>
+
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-actionmanager</artifactId>
+<version>${project.version}</version>
+</dependency>
+<!-- <dependency>-->
+<!-- <groupId>eu.dnetlib.dhp</groupId>-->
+<!-- <artifactId>dhp-aggregation</artifactId>-->
+<!-- <version>${project.version}</version>-->
+<!-- </dependency>-->
+<!-- <dependency>-->
+<!-- <groupId>eu.dnetlib.dhp</groupId>-->
+<!-- <artifactId>dhp-blacklist</artifactId>-->
+<!-- <version>${project.version}</version>-->
+<!-- </dependency>-->
+<!-- <dependency>-->
+<!-- <groupId>eu.dnetlib.dhp</groupId>-->
+<!-- <artifactId>dhp-broker-events</artifactId>-->
+<!-- <version>${project.version}</version>-->
+<!-- </dependency>-->
+<!-- <dependency>-->
+<!-- <groupId>eu.dnetlib.dhp</groupId>-->
+<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
+<!-- <version>${project.version}</version>-->
+<!-- </dependency>-->
+<!-- <dependency>-->
+<!-- <groupId>eu.dnetlib.dhp</groupId>-->
+<!-- <artifactId>dhp-enrichment</artifactId>-->
+<!-- <version>${project.version}</version>-->
+<!-- </dependency>-->
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-graph-mapper</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-graph-provision</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-impact-indicators</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-stats-actionsets</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-stats-hist-snaps</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-stats-monitor-irish</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-stats-promote</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-stats-update</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-swh</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-usage-raw-data-update</artifactId>
+<version>${project.version}</version>
+</dependency>
+<dependency>
+<groupId>eu.dnetlib.dhp</groupId>
+<artifactId>dhp-usage-stats-build</artifactId>
+<version>${project.version}</version>
+</dependency>
+</dependencies>
+
+
+<build>
+<plugins>
+<plugin>
+<groupId>org.apache.maven.plugins</groupId>
+<artifactId>maven-shade-plugin</artifactId>
+<executions>
+<execution>
+<phase>package</phase>
+<goals>
+<goal>shade</goal>
+</goals>
+<configuration>
+<transformers>
+<transformer
+implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+</transformer>
+<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
+<transformer
+implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+<transformer
+implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+<resource>META-INF/cxf/bus-extensions.txt</resource>
+</transformer>
+</transformers>
+<filters>
+<filter>
+<artifact>*:*</artifact>
+<excludes>
+<exclude>META-INF/maven/**</exclude>
+<exclude>META-INF/*.SF</exclude>
+<exclude>META-INF/*.DSA</exclude>
+<exclude>META-INF/*.RSA</exclude>
+</excludes>
+</filter>
+</filters>
+<relocations>
+<relocation>
+<pattern>com</pattern>
+<shadedPattern>repackaged.com.google.common</shadedPattern>
+<includes>
+<include>com.google.common.**</include>
+</includes>
+</relocation>
+</relocations>
+</configuration>
+</execution>
+</executions>
+</plugin>
+</plugins>
+</build>
+
+</project>
@@ -9,6 +9,7 @@ import java.util.List;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
 .union(openAPCRelations)
 .union(dataciteRelations)
 .saveAsHadoopFile(
-outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);

 });
 }
@@ -10,6 +10,7 @@ import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
@@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
 resultsRDD
 .union(projectsRDD)
 .saveAsHadoopFile(
-outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 });
 }

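Note: both jobs above switch the action-set output from Gzip to BZip2 compression; the likely rationale (stated here as an assumption, not taken from the commit) is that bzip2-compressed sequence files can be split across tasks while gzip output cannot. A minimal sketch of the write pattern, assuming a pair RDD of Text keys and values:

```scala
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD

// Sketch only: persist serialized atomic actions as a BZip2-compressed sequence file.
def writeActions(actions: RDD[(Text, Text)], outputPath: String): Unit =
  actions.saveAsHadoopFile(
    outputPath,
    classOf[Text],
    classOf[Text],
    classOf[SequenceFileOutputFormat[Text, Text]],
    classOf[BZip2Codec])
```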
@@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.types.StructType;
@@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
 final String outputPath = parser.get("outputPath");
 log.info("outputPath: {}", outputPath);

+final String blackListInputPath = parser.get("blackListPath");
+log.info("blackListInputPath: {}", blackListInputPath);
+
 SparkConf conf = new SparkConf();

 runWithSparkSession(
@@ -77,19 +81,25 @@ public class CreateActionSetFromWebEntries implements Serializable {
 isSparkSessionManaged,
 spark -> {

-createActionSet(spark, inputPath, outputPath);
+createActionSet(spark, inputPath, outputPath, blackListInputPath);

 });
 }

 public static void createActionSet(SparkSession spark, String inputPath,
-String outputPath) {
+String outputPath, String blackListInputPath) {

 final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
-.filter("publication_year <= 2020 or country_code=='IE'")
+.filter("country_code=='IE'")
 .drop("publication_year");

-dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
+final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
+
+dataset
+.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
+.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
+.drop("OpenAlexId")
+.flatMap((FlatMapFunction<Row, Relation>) row -> {
 List<Relation> ret = new ArrayList<>();
 final String ror = ROR_PREFIX
 + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
@@ -136,6 +146,15 @@ public class CreateActionSetFromWebEntries implements Serializable {

 }

+private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
+
+return spark
+.read()
+.option("header", true)
+.csv(inputPath)
+.select("OpenAlexId");
+}
+
 private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
 if (pmcid == null)
 return new ArrayList<>();
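Note: the blacklist filter above is expressed as a left join on OpenAlexId followed by a null check, which keeps only rows without a match. A sketch of the same idea (my formulation, not project code) written as a left_anti join, the equivalent and slightly more direct Spark form:

```scala
import org.apache.spark.sql.DataFrame

// Sketch only: drop every row of `dataset` whose id appears in the blacklist column OpenAlexId.
def dropBlacklisted(dataset: DataFrame, blackList: DataFrame): DataFrame =
  dataset.join(blackList, dataset("id") === blackList("OpenAlexId"), "left_anti")
```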
@@ -1,6 +1,7 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.util.Map;
 import java.util.Optional;
 import java.util.Spliterator;
 import java.util.Spliterators;
@@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;

 import org.apache.commons.lang3.StringUtils;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
 final String entityXpath = api.getParams().get("entityXpath");
 final String authMethod = api.getParams().get("authMethod");
 final String authToken = api.getParams().get("authToken");
+final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+Gson gson = new Gson();
+Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
 final String resultSizeValue = Optional
 .ofNullable(api.getParams().get("resultSizeValue"))
 .filter(StringUtils::isNotBlank)
@@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
 if (StringUtils.isBlank(resultFormatValue)) {
 throw new CollectorException("Param 'resultFormatValue' is null or empty");
 }
-if (StringUtils.isBlank(queryParams)) {
-throw new CollectorException("Param 'queryParams' is null or empty");
-}
 if (StringUtils.isBlank(entityXpath)) {
 throw new CollectorException("Param 'entityXpath' is null or empty");
 }
@@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
 entityXpath,
 authMethod,
 authToken,
-resultOutputFormat);
+resultOutputFormat,
+requestHeaders);

 return StreamSupport
 .stream(
@@ -9,6 +9,7 @@ import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;

@@ -34,6 +35,8 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;

+import com.google.common.collect.Maps;
+
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;
@@ -55,7 +58,7 @@ public class RestIterator implements Iterator<String> {

 private final HttpClientParams clientParams;

-private final String BASIC = "basic";
+private final String AUTHBASIC = "basic";

 private final String baseUrl;
 private final String resumptionType;
@@ -89,6 +92,11 @@ public class RestIterator implements Iterator<String> {
 */
 private final String resultOutputFormat;

+/*
+* Can be used to set additional request headers, like for content negotiation
+*/
+private Map<String, String> requestHeaders;
+
 /**
 * RestIterator class compatible to version 1.3.33
 */
@@ -107,7 +115,8 @@ public class RestIterator implements Iterator<String> {
 final String entityXpath,
 final String authMethod,
 final String authToken,
-final String resultOutputFormat) {
+final String resultOutputFormat,
+final Map<String, String> requestHeaders) {

 this.clientParams = clientParams;
 this.baseUrl = baseUrl;
@@ -119,6 +128,7 @@ public class RestIterator implements Iterator<String> {
 this.authMethod = authMethod;
 this.authToken = authToken;
 this.resultOutputFormat = resultOutputFormat;
+this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();

 this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
 : "";
@@ -231,25 +241,20 @@

 final URL qUrl = new URL(query);
 log.debug("authMethod: {}", this.authMethod);
-if ("bearer".equalsIgnoreCase(this.authMethod)) {
-log.trace("authMethod before inputStream: {}", resultXml);
-final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
-conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
-conn.setRequestMethod("GET");
-theHttpInputStream = conn.getInputStream();
-} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
-log.trace("authMethod before inputStream: {}", resultXml);
-final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
-conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
-conn.setRequestMethod("GET");
-theHttpInputStream = conn.getInputStream();
-} else {
-theHttpInputStream = qUrl.openStream();
+if (this.authMethod == "bearer") {
+log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+requestHeaders.put("Authorization", "Bearer " + authToken);
+// requestHeaders.put("Content-Type", "application/json");
+} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+requestHeaders.put("Authorization", "Basic " + authToken);
+// requestHeaders.put("accept", "application/xml");
 }
+HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+conn.setRequestMethod("GET");
+this.setRequestHeader(conn);
+resultStream = conn.getInputStream();

-this.resultStream = theHttpInputStream;
 if ("json".equals(this.resultOutputFormat)) {
 resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
 resultXml = JsonUtils.convertToXML(resultJson);
@@ -380,7 +385,8 @@
 try {
 if (this.resultTotal == -1) {
 this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
-if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
+if ("page".equalsIgnoreCase(this.resumptionType)
+&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
 this.resultTotal += 1;
 } // to correct the upper bound
 log.info("resultTotal was -1 is now: " + this.resultTotal);
@@ -433,6 +439,22 @@
 }
 }

+/**
+* setRequestHeader
+*
+* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
+* @param conn
+*/
+private void setRequestHeader(HttpURLConnection conn) {
+if (requestHeaders != null) {
+for (String key : requestHeaders.keySet()) {
+conn.setRequestProperty(key, requestHeaders.get(key));
+}
+log.debug("Set Request Header with: " + requestHeaders);
+}
+
+}
+
 public String getResultFormatValue() {
 return this.resultFormatValue;
 }
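Note: the net effect of the RestIterator changes is that authentication is no longer handled by per-branch HttpURLConnection setup; instead an Authorization entry is put into the shared requestHeaders map (supplied as JSON through the new requestHeaderMap parameter) and setRequestHeader() applies every entry to the connection. A minimal sketch of that flow with a hypothetical helper, using the same java.net API:

```scala
import java.net.{HttpURLConnection, URL}

// Sketch only: open a GET connection and copy every configured header onto it,
// including any Authorization entry added for bearer or basic auth.
def openWithHeaders(query: String, requestHeaders: java.util.Map[String, String]): HttpURLConnection = {
  val conn = new URL(query).openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestMethod("GET")
  val it = requestHeaders.entrySet().iterator()
  while (it.hasNext) {
    val e = it.next()
    conn.setRequestProperty(e.getKey, e.getValue)
  }
  conn
}
```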
@@ -16,5 +16,10 @@
 "paramLongName": "isSparkSessionManaged",
 "paramDescription": "the hdfs name node",
 "paramRequired": false
-}
+},{
+"paramName": "bl",
+"paramLongName": "blackListPath",
+"paramDescription": "the working path",
+"paramRequired": true
+}
 ]
@@ -1,2 +1,3 @@
 sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
 outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList
@@ -45,6 +45,7 @@
 </spark-opts>
 <arg>--sourcePath</arg><arg>${sourcePath}</arg>
 <arg>--outputPath</arg><arg>${outputPath}</arg>
+<arg>--blackListPath</arg><arg>${blackListPath}</arg>
 </spark>
 <ok to="End"/>
 <error to="Kill"/>
@@ -625,12 +625,6 @@
 "name": "Alimentary Health",
 "synonym": []
 },
-{
-"id": "501100011103",
-"uri": "http://dx.doi.org/10.13039/501100011103",
-"name": "Rann\u00eds",
-"synonym": []
-},
 {
 "id": "501100012354",
 "uri": "http://dx.doi.org/10.13039/501100012354",
@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
 tp._1 match {
 case "electronic" => journal.setIssnOnline(tp._2)
 case "print" => journal.setIssnPrinted(tp._2)
+case _ =>
 }
 })
 }
@@ -79,23 +79,6 @@ object MagUtility extends Serializable {
 private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)

 private val MAGDataInfo: DataInfo = {
-val di = new DataInfo
-di.setDeletedbyinference(false)
-di.setInferred(false)
-di.setInvisible(false)
-di.setTrust("0.9")
-di.setProvenanceaction(
-OafMapperUtils.qualifier(
-ModelConstants.SYSIMPORT_ACTIONSET,
-ModelConstants.SYSIMPORT_ACTIONSET,
-ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS
-)
-)
-di
-}
-
-private val MAGDataInfoInvisible: DataInfo = {
 val di = new DataInfo
 di.setDeletedbyinference(false)
 di.setInferred(false)
@@ -453,7 +436,6 @@ object MagUtility extends Serializable {

 case "repository" =>
 result = new Publication()
-result.setDataInfo(MAGDataInfoInvisible)
 qualifier(
 "0038",
 "Other literature type",
@@ -488,7 +470,6 @@ object MagUtility extends Serializable {
 }

 if (result != null) {
-if (result.getDataInfo == null)
 result.setDataInfo(MAGDataInfo)
 val i = new Instance
 i.setInstancetype(tp)
@@ -512,7 +493,7 @@ object MagUtility extends Serializable {
 return null

 result.setCollectedfrom(List(MAGCollectedFrom).asJava)
-val pidList = List(
+var pidList = List(
 structuredProperty(
 paper.paperId.get.toString,
 qualifier(
@@ -525,8 +506,6 @@ object MagUtility extends Serializable {
 )
 )

-result.setPid(pidList.asJava)
-
 result.setOriginalId(pidList.map(s => s.getValue).asJava)

 result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@@ -618,10 +597,9 @@ object MagUtility extends Serializable {
 }

 val instance = result.getInstance().get(0)
-instance.setPid(pidList.asJava)
-if (paper.doi.orNull != null)
-instance.setAlternateIdentifier(
-List(
+if (paper.doi.orNull != null) {
+pidList = pidList ::: List(
 structuredProperty(
 paper.doi.get,
 qualifier(
@@ -632,8 +610,10 @@ object MagUtility extends Serializable {
 ),
 null
 )
-).asJava
 )
+}
+instance.setPid(pidList.asJava)
+result.setPid(pidList.asJava)
 instance.setUrl(paper.urls.get.asJava)
 instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
 instance.setCollectedfrom(MAGCollectedFrom)
@@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
 spark.read
 .load(s"$magBasePath/mag_denormalized")
 .as[MAGPaper]
+.filter(col("doi").isNotNull)
 .map(s => MagUtility.convertMAGtoOAF(s))
 .filter(s => s != null)
 .write
@@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi

 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
 import org.apache.http.impl.client.HttpClientBuilder
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
 import org.slf4j.{Logger, LoggerFactory}

-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory

 object SparkCreateBaselineDataFrame {

@@ -86,7 +83,7 @@
 if (response.getStatusLine.getStatusCode > 400) {
 tries -= 1
 } else
-return IOUtils.toString(response.getEntity.getContent)
+return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
 } catch {
 case e: Throwable =>
 println(s"Error on requesting ${r.getURI}")
@@ -158,7 +155,8 @@
 IOUtils.toString(
 SparkEBILinksToOaf.getClass.getResourceAsStream(
 "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-)
+),
+Charset.defaultCharset()
 )
 )
 parser.parseArgument(args)
@@ -167,15 +165,11 @@
 val workingPath = parser.get("workingPath")
 log.info("workingPath: {}", workingPath)

-val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
-log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
+val targetPath = parser.get("targetPath")
+log.info("targetPath: {}", targetPath)

-val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
-val outputBasePath = cleanedMdStoreVersion.getHdfsPath
-log.info("outputBasePath: {}", outputBasePath)
-
 val hdfsServerUri = parser.get("hdfsServerUri")
-log.info("hdfsServerUri: {}", hdfsServerUri)
+log.info("hdfsServerUri: {}", targetPath)

 val skipUpdate = parser.get("skipUpdate")
 log.info("skipUpdate: {}", skipUpdate)
@@ -201,10 +195,11 @@
 if (!"true".equalsIgnoreCase(skipUpdate)) {
 downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
 val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
+val inputFactory = XMLInputFactory.newInstance
 val ds: Dataset[PMArticle] = spark.createDataset(
 k.filter(i => i._1.endsWith(".gz"))
 .flatMap(i => {
-val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
 new PMParser(xml)
 })
 )
@@ -223,11 +218,8 @@
 .map(a => PubMedToOaf.convert(a, vocabularies))
 .as[Oaf]
 .filter(p => p != null),
-s"$outputBasePath/$MDSTORE_DATA_PATH"
+targetPath
 )

-val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
-val mdStoreSize = df.count
-writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
 }
 }
@@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.sx.bio.pubmed

 import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}

 /** @param xml
 */
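Note: the baseline/PMParser hunks replace the scala.xml.pull event reader (dropped from newer scala-xml releases) with the StAX reader from javax.xml.stream, built per file by an XMLInputFactory. A minimal sketch of how such a reader is created from an in-memory XML payload, mirroring the flatMap above (hypothetical helper, not project code):

```scala
import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import javax.xml.stream.{XMLEventReader, XMLInputFactory}

// Sketch only: build a StAX event reader over an XML string held in memory.
def xmlEventReader(xml: String): XMLEventReader = {
  val inputFactory = XMLInputFactory.newInstance
  inputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)))
}
```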
@@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -119,7 +119,9 @@ public class ReadCOCITest {
 workingDir.toString() + "/COCI",
 "-outputPath",
 workingDir.toString() + "/COCI_json/",
-"-inputFile", "input1;input2;input3;input4;input5"
+"-inputFile", "input1;input2;input3;input4;input5",
+"-format",
+"COCI"
 });

 final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@@ -75,7 +75,11 @@ public class CreateASTest {

 String inputPath = getClass()
 .getResource(
-"/eu/dnetlib/dhp/actionmanager/webcrawl/")
+"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
+.getPath();
+String blackListPath = getClass()
+.getResource(
+"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
 .getPath();

 CreateActionSetFromWebEntries
@@ -86,7 +90,8 @@ public class CreateASTest {
 "-sourcePath",
 inputPath,
 "-outputPath",
-workingDir.toString() + "/actionSet1"
+workingDir.toString() + "/actionSet1",
+"-blackListPath", blackListPath
 });

 final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -96,7 +101,7 @@ public class CreateASTest {
 .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 .map(aa -> ((Relation) aa.getPayload()));

-Assertions.assertEquals(64, tmp.count());
+Assertions.assertEquals(58, tmp.count());

 }

@@ -109,6 +114,10 @@ public class CreateASTest {
 .getResource(
 "/eu/dnetlib/dhp/actionmanager/webcrawl/")
 .getPath();
+String blackListPath = getClass()
+.getResource(
+"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+.getPath();

 CreateActionSetFromWebEntries
 .main(
@@ -118,7 +127,8 @@ public class CreateASTest {
 "-sourcePath",
 inputPath,
 "-outputPath",
-workingDir.toString() + "/actionSet1"
+workingDir.toString() + "/actionSet1",
+"-blackListPath", blackListPath
 });

 final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -184,7 +194,7 @@

 Assertions
 .assertEquals(
-5, tmp
+2, tmp
 .filter(
 r -> r
 .getSource()
@@ -197,7 +207,7 @@

 Assertions
 .assertEquals(
-5, tmp
+2, tmp
 .filter(
 r -> r
 .getTarget()
@@ -210,7 +220,7 @@

 Assertions
 .assertEquals(
-2, tmp
+1, tmp
 .filter(
 r -> r
 .getTarget()
@@ -224,7 +234,7 @@

 Assertions
 .assertEquals(
-2, tmp
+1, tmp
 .filter(
 r -> r
 .getTarget()
@@ -238,7 +248,7 @@

 Assertions
 .assertEquals(
-1, tmp
+0, tmp
 .filter(
 r -> r
 .getTarget()
@@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest {
   private final String resumptionType = "page";
   private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";

-  private final String resultSizeParam = "";
-  private final String resultSizeValue = "";
+  private final String resultSizeParam = "page[size]";
+  private final String resultSizeValue = "100";

   private final String resultFormatParam = "format";
   private final String resultFormatValue = "json";
@@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest {
     final AtomicInteger i = new AtomicInteger(0);
     final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());

-    stream.limit(200).forEach(s -> {
+    stream.limit(2000).forEach(s -> {
       Assertions.assertTrue(s.length() > 0);
       i.incrementAndGet();
       log.info(s);
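Review note on the two hunks above: the OSF collector now pages explicitly (page[size]=100) and the smoke test walks up to 2000 records instead of 200. A minimal, illustrative sketch of the resulting parameter map; the keys mirror the ones these tests put into the collector params, and nothing here goes beyond the values visible in the diff.

// Illustrative sketch only (not part of the change): the OSF paging configuration
// exercised by OsfPreprintCollectorTest, assuming the usual param keys of the REST plugin.
import java.util.HashMap;
import java.util.Map;

class OsfPagingParamsSketch {
    static Map<String, String> osfParams() {
        final Map<String, String> params = new HashMap<>();
        params.put("resumptionType", "page");
        params.put("resumptionXpath", "/*/*[local-name()='links']/*[local-name()='next']");
        params.put("resultSizeParam", "page[size]"); // was "" before this change
        params.put("resultSizeValue", "100");        // was "" before this change
        params.put("resultFormatParam", "format");
        params.put("resultFormatValue", "json");
        return params;
    }
}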
@@ -4,6 +4,11 @@

 package eu.dnetlib.dhp.collection.plugin.rest;

+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Stream;
@@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@@ -25,18 +32,18 @@ class RestCollectorPluginTest {

   private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);

-  private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
-  private final String resumptionType = "count";
-  private final String resumptionParam = "from";
-  private final String entityXpath = "//hits/hits";
-  private final String resumptionXpath = "//hits";
-  private final String resultTotalXpath = "//hits/total";
-  private final String resultFormatParam = "format";
+  private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+  private final String resumptionType = "discover";
+  private final String resumptionParam = "skip";
+  private final String entityXpath = "//*[local-name()='data']";
+  private final String resumptionXpath = "";
+  private final String resultTotalXpath = "//*[local-name()='count']";
+  private final String resultFormatParam = "";
   private final String resultFormatValue = "json";
-  private final String resultSizeParam = "size";
+  private final String resultSizeParam = "top";
   private final String resultSizeValue = "10";
   // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
-  private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+  private final String query = "";
   // private String query = "=(sources:engrXiv AND type:preprint)";

   private final String protocolDescriptor = "rest_json2xml";
@@ -56,6 +63,7 @@ class RestCollectorPluginTest {
     params.put("resultSizeValue", resultSizeValue);
     params.put("queryParams", query);
     params.put("entityXpath", entityXpath);
+    params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");

     api.setBaseUrl(baseUrl);
     api.setParams(params);
@@ -78,4 +86,19 @@ class RestCollectorPluginTest {
     log.info("{}", i.intValue());
     Assertions.assertTrue(i.intValue() > 0);
   }

+  @Disabled
+  @Test
+  void testUrl() throws IOException {
+    String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
+    URL url = new URL(url_s);
+    final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+    conn.setRequestMethod("GET");
+    conn.setRequestProperty("User-Agent", "OpenAIRE");
+    Gson gson = new Gson();
+    System.out.println("Request header");
+    System.out.println(gson.toJson(conn.getHeaderFields()));
+    InputStream inputStream = conn.getInputStream();
+
+  }
 }
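Review note: the RestCollectorPluginTest changes above switch the fixture from the retired SHARE endpoint to the World Bank DDH API and introduce an optional request-header map (the extra null argument in RestIteratorTest below). A minimal sketch of the resulting descriptor, using only the setters and param keys that appear in the test:

// Illustrative sketch only: ApiDescriptor wiring for the World Bank DDH endpoint,
// mirroring the values set in RestCollectorPluginTest above.
import java.util.HashMap;

import eu.dnetlib.dhp.collection.ApiDescriptor;

class DdhApiDescriptorSketch {
    static ApiDescriptor ddhApi() {
        final HashMap<String, String> params = new HashMap<>();
        params.put("resumptionType", "discover");
        params.put("resumptionParam", "skip");
        params.put("entityXpath", "//*[local-name()='data']");
        params.put("resultTotalXpath", "//*[local-name()='count']");
        params.put("resultSizeParam", "top");
        params.put("resultSizeValue", "10");
        params.put("resultFormatValue", "json");
        params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");

        final ApiDescriptor api = new ApiDescriptor();
        api.setBaseUrl("https://ddh-openapi.worldbank.org/search");
        api.setParams(params);
        return api;
    }
}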
@@ -44,7 +44,7 @@ public class RestIteratorTest {

     final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
       resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-      query, entityXpath, authMethod, authToken, resultOffsetParam);
+      query, entityXpath, authMethod, authToken, resultOffsetParam, null);
     int i = 20;
     while (iterator.hasNext() && i > 0) {
       String result = iterator.next();
@@ -789,10 +789,6 @@
       "value": "2227-9717",
       "type": "electronic"
     },
-    {
-      "value": "VALUE",
-      "type": "PIPPO"
-    },
     {
       "value": "1063-4584",
       "type": "pu"
@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref

 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
     super.setUpVocabulary()
   }

+  @Test
+  def mappingRecord(): Unit = {
+    val input =
+      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
+
+    println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
+
+  }
+
 }
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test

@@ -18,10 +19,8 @@ class MAGMappingTest {
       .master("local[*]")
       .getOrCreate()

-    val s = new SparkMagOrganizationAS(null, null, null)
+    val s = new SparkMAGtoOAF(null, null, null)
+    s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")

-    s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")

   }

   @Test
@@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension

 import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
@@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {

   @Test
   def testEBIData() = {
-    val inputXML = Source
-      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-      .mkString
-    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
     new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
   }

@@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {

   @Test
   def testParsingPubmedXML(): Unit = {
-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))

     val parser = new PMParser(xml)
     parser.foreach(checkPMArticle)
   }
@@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
   @Test
   def testPubmedMapping(): Unit = {

-    val xml = new XMLEventReader(
-      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-    )
+    val inputFactory = XMLInputFactory.newInstance
+    val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))

     val parser = new PMParser(xml)
     val results = ListBuffer[Oaf]()
     parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
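Note on the three test rewrites above: the scala-xml pull parser (XMLEventReader) is replaced with the StAX reader from javax.xml.stream, presumably because the pull parser is no longer available on the newer Scala/Spark stack targeted by the build profiles. For reference, the same pattern expressed in plain Java; this is a sketch, not project code:

// Minimal StAX sketch of the pattern adopted above: build an XMLEventReader from an
// XMLInputFactory and walk the events (PMParser reacts to element starts in the test).
import java.io.InputStream;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;

class PubmedStaxSketch {
    static void readEvents(InputStream xml) throws XMLStreamException {
        final XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        final XMLEventReader reader = inputFactory.createXMLEventReader(xml);
        while (reader.hasNext()) {
            final XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                // e.g. print element names such as PubmedArticle
                System.out.println(event.asStartElement().getName().getLocalPart());
            }
        }
    }
}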
@@ -53,24 +53,10 @@
     <artifactId>dhp-pace-core</artifactId>
     <version>${project.version}</version>
   </dependency>

   <dependency>
     <groupId>org.apache.commons</groupId>
     <artifactId>commons-lang3</artifactId>
   </dependency>

-  <dependency>
-    <groupId>org.scala-lang.modules</groupId>
-    <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
-    <version>1.0.2</version>
-  </dependency>
-
-  <dependency>
-    <groupId>org.scala-lang.modules</groupId>
-    <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
-    <version>2.11.0</version>
-  </dependency>
-
   <dependency>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -79,16 +65,10 @@
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-sql_${scala.binary.version}</artifactId>
   </dependency>

   <dependency>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-graphx_${scala.binary.version}</artifactId>
   </dependency>

-  <dependency>
-    <groupId>com.arakelian</groupId>
-    <artifactId>java-jq</artifactId>
-  </dependency>
   <dependency>
     <groupId>dom4j</groupId>
     <artifactId>dom4j</artifactId>
@@ -101,10 +81,6 @@
     <groupId>com.fasterxml.jackson.core</groupId>
     <artifactId>jackson-databind</artifactId>
   </dependency>
-  <dependency>
-    <groupId>com.fasterxml.jackson.core</groupId>
-    <artifactId>jackson-core</artifactId>
-  </dependency>
   <dependency>
     <groupId>org.apache.httpcomponents</groupId>
     <artifactId>httpclient</artifactId>
@@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple3;
 import scala.collection.JavaConversions;

@@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
       Dataset<Row> pivotHistory = spark
         .createDataset(
           Collections.emptyList(),
-          RowEncoder
-            .apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+          SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));

       if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
         pivotHistory = spark
@@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.Field;
 import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
       .map(
         (MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
           "",
-          r._1()._2().getOriginalId().get(0),
-          r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
-          r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
-          r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
-          r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
-          r._1()._2().getCollectedfrom().get(0).getValue(),
+          Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
+          Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
+          Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
+          Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
+          Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
+          Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
           "",
           structuredPropertyListToString(r._1()._2().getPid()),
           parseECField(r._1()._2().getEclegalbody()),
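The hunk above swaps ternary null checks for Optional chains. The pattern in isolation, as a sketch (Field, Organization and Qualifier are the oaf model classes imported in this file; the helper class itself is only an illustration):

// Null-safe field access via Optional instead of chained null checks.
import java.util.Optional;

import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

class NullSafeAccessSketch {
    static String legalName(Organization o) {
        // returns "" when legalname is null, instead of risking a NullPointerException
        return Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse("");
    }

    static String countryClassId(Organization o) {
        return Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse("");
    }
}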
@@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
           final Organization o = r._2()._2();
           return new OrgSimRel(
             r._1()._1(),
-            o.getOriginalId().get(0),
+            Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
             Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
             Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
             Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
@@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
       .map(
         (MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
           OrgSimRel orgSimRel = r._1()._2();
-          orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+          orgSimRel
+            .setLocal_id(
+              Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
           return orgSimRel;
         },
         Encoders.bean(OrgSimRel.class));
@@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.ReduceFunction;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.SparkCompatUtils;
 import scala.Tuple2;
 import scala.Tuple3;

@@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
     StructType idsSchema = StructType
       .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");

-    Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+    Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

     for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
       String entityPath = graphBasePath + '/' + entityType.name();
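Context for the RowEncoder replacements above: the direct RowEncoder.apply(...) call is not stable across the Spark versions targeted by the build profiles, so the dedup code now goes through eu.dnetlib.pace.util.SparkCompatUtils. A minimal sketch of the call site, assuming only the encoderFor(StructType) helper visible in this diff:

// Sketch: creating an empty Dataset<Row> for a DDL-declared schema via the compat helper.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;

import eu.dnetlib.pace.util.SparkCompatUtils;

class EmptyIdsDatasetSketch {
    static Dataset<Row> emptyIds(SparkSession spark) {
        final StructType idsSchema = StructType
            .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
        return spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
    }
}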
@@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
         subject.getQualifier().setClassname(vocabulary.getName());
       }
     } else {
-      final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
+      final String provenanceActionClassId = Optional
+        .ofNullable(subject.getDataInfo())
         .map(DataInfo::getProvenanceaction)
         .map(Qualifier::getClassid)
         .orElse(null);
@@ -0,0 +1,5 @@
+[
+  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
+  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
+  {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
+]
@@ -0,0 +1,166 @@
+{
+  "cites":{
+    "original":"Cites",
+    "inverse":"IsCitedBy"
+  },
+  "compiles":{
+    "original":"Compiles",
+    "inverse":"IsCompiledBy"
+  },
+  "continues":{
+    "original":"Continues",
+    "inverse":"IsContinuedBy"
+  },
+  "derives":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "describes":{
+    "original":"Describes",
+    "inverse":"IsDescribedBy"
+  },
+  "documents":{
+    "original":"Documents",
+    "inverse":"IsDocumentedBy"
+  },
+  "hasmetadata":{
+    "original":"HasMetadata",
+    "inverse":"IsMetadataOf"
+  },
+  "hasassociationwith":{
+    "original":"HasAssociationWith",
+    "inverse":"HasAssociationWith"
+  },
+  "haspart":{
+    "original":"HasPart",
+    "inverse":"IsPartOf"
+  },
+  "hasversion":{
+    "original":"HasVersion",
+    "inverse":"IsVersionOf"
+  },
+  "iscitedby":{
+    "original":"IsCitedBy",
+    "inverse":"Cites"
+  },
+  "iscompiledby":{
+    "original":"IsCompiledBy",
+    "inverse":"Compiles"
+  },
+  "iscontinuedby":{
+    "original":"IsContinuedBy",
+    "inverse":"Continues"
+  },
+  "isderivedfrom":{
+    "original":"IsDerivedFrom",
+    "inverse":"IsSourceOf"
+  },
+  "isdescribedby":{
+    "original":"IsDescribedBy",
+    "inverse":"Describes"
+  },
+  "isdocumentedby":{
+    "original":"IsDocumentedBy",
+    "inverse":"Documents"
+  },
+  "isidenticalto":{
+    "original":"IsIdenticalTo",
+    "inverse":"IsIdenticalTo"
+  },
+  "ismetadatafor":{
+    "original":"IsMetadataFor",
+    "inverse":"IsMetadataOf"
+  },
+  "ismetadataof":{
+    "original":"IsMetadataOf",
+    "inverse":"IsMetadataFor"
+  },
+  "isnewversionof":{
+    "original":"IsNewVersionOf",
+    "inverse":"IsPreviousVersionOf"
+  },
+  "isobsoletedby":{
+    "original":"IsObsoletedBy",
+    "inverse":"Obsoletes"
+  },
+  "isoriginalformof":{
+    "original":"IsOriginalFormOf",
+    "inverse":"IsVariantFormOf"
+  },
+  "ispartof":{
+    "original":"IsPartOf",
+    "inverse":"HasPart"
+  },
+  "ispreviousversionof":{
+    "original":"IsPreviousVersionOf",
+    "inverse":"IsNewVersionOf"
+  },
+  "isreferencedby":{
+    "original":"IsReferencedBy",
+    "inverse":"References"
+  },
+  "isrelatedto":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "isrequiredby":{
+    "original":"IsRequiredBy",
+    "inverse":"Requires"
+  },
+  "isreviewedby":{
+    "original":"IsReviewedBy",
+    "inverse":"Reviews"
+  },
+  "issourceof":{
+    "original":"IsSourceOf",
+    "inverse":"IsDerivedFrom"
+  },
+  "issupplementedby":{
+    "original":"IsSupplementedBy",
+    "inverse":"IsSupplementTo"
+  },
+  "issupplementto":{
+    "original":"IsSupplementTo",
+    "inverse":"IsSupplementedBy"
+  },
+  "isvariantformof":{
+    "original":"IsVariantFormOf",
+    "inverse":"IsOriginalFormOf"
+  },
+  "isversionof":{
+    "original":"IsVersionOf",
+    "inverse":"HasVersion"
+  },
+  "obsoletes":{
+    "original":"Obsoletes",
+    "inverse":"IsObsoletedBy"
+  },
+  "references":{
+    "original":"References",
+    "inverse":"IsReferencedBy"
+  },
+  "requires":{
+    "original":"Requires",
+    "inverse":"IsRequiredBy"
+  },
+  "related":{
+    "original":"IsRelatedTo",
+    "inverse":"IsRelatedTo"
+  },
+  "reviews":{
+    "original":"Reviews",
+    "inverse":"IsReviewedBy"
+  },
+  "unknown":{
+    "original":"Unknown",
+    "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
+  }
+}
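The new vocabulary above maps a lower-cased relation class to its canonical label and inverse, which is what lets the Scholexplorer dump materialise every relation in both directions. Illustrative only (the module itself loads this file from Scala in ScholexplorerUtils below), a Java sketch of reading the file and resolving an inverse with Jackson:

// Sketch: load /eu/dnetlib/dhp/sx/relation/relations.json and look up an inverse label.
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

class RelationVocabularySketch {
    static class RelationVocabulary {
        public String original;
        public String inverse;
    }

    static String inverseOf(String relClass) throws IOException {
        try (InputStream in = RelationVocabularySketch.class
            .getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")) {
            final Map<String, RelationVocabulary> vocabulary = new ObjectMapper()
                .readValue(in, new TypeReference<Map<String, RelationVocabulary>>() {});
            final RelationVocabulary v = vocabulary.get(relClass.toLowerCase());
            return v == null ? null : v.inverse; // e.g. "IsSupplementTo" -> "IsSupplementedBy"
        }
    }
}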
@ -0,0 +1,258 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.{
|
||||||
|
Scholix,
|
||||||
|
ScholixCollectedFrom,
|
||||||
|
ScholixEntityId,
|
||||||
|
ScholixIdentifier,
|
||||||
|
ScholixRelationship,
|
||||||
|
ScholixResource
|
||||||
|
}
|
||||||
|
import org.json4s
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
case class RelationInfo(
|
||||||
|
source: String,
|
||||||
|
target: String,
|
||||||
|
relclass: String,
|
||||||
|
id: String,
|
||||||
|
collectedfrom: Seq[RelKeyValue]
|
||||||
|
) {}
|
||||||
|
case class RelKeyValue(key: String, value: String) {}
|
||||||
|
|
||||||
|
object ScholexplorerUtils {
|
||||||
|
|
||||||
|
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
case class RelationVocabulary(original: String, inverse: String) {}
|
||||||
|
|
||||||
|
val relations: Map[String, RelationVocabulary] = {
|
||||||
|
val input = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
|
||||||
|
json.extract[Map[String, RelationVocabulary]]
|
||||||
|
}
|
||||||
|
|
||||||
|
def invRel(rel: String): String = {
|
||||||
|
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
|
||||||
|
if (semanticRelation != null)
|
||||||
|
semanticRelation.inverse
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateDatasourceOpenAIREURLS(id: String): String = {
|
||||||
|
if (id != null && id.length > 12)
|
||||||
|
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
def findURLForPID(
|
||||||
|
pidValue: List[StructuredProperty],
|
||||||
|
urls: List[String]
|
||||||
|
): List[(StructuredProperty, String)] = {
|
||||||
|
pidValue.map { p =>
|
||||||
|
val pv = p.getValue
|
||||||
|
|
||||||
|
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||||
|
(p, r.orNull)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||||
|
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||||
|
return List()
|
||||||
|
r.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||||
|
.filter(i => i.getPid != null && i.getUrl != null)
|
||||||
|
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||||
|
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||||
|
.distinct
|
||||||
|
.toList
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholixResourceFromResult(result: Result): ScholixResource = {
|
||||||
|
|
||||||
|
if (result.getInstance() == null || result.getInstance().size() == 0)
|
||||||
|
return null
|
||||||
|
|
||||||
|
if (result.getPid == null || result.getPid.isEmpty)
|
||||||
|
return null
|
||||||
|
|
||||||
|
val r = new ScholixResource
|
||||||
|
r.setDnetIdentifier(result.getId)
|
||||||
|
|
||||||
|
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
|
||||||
|
if (persistentIdentifiers.isEmpty)
|
||||||
|
return null
|
||||||
|
|
||||||
|
r.setIdentifier(persistentIdentifiers.asJava)
|
||||||
|
|
||||||
|
r.setObjectType(result.getResulttype.getClassid)
|
||||||
|
|
||||||
|
r.setObjectSubType(
|
||||||
|
result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i != null && i.getInstancetype != null)
|
||||||
|
.map(i => i.getInstancetype.getClassname)
|
||||||
|
.distinct
|
||||||
|
.head
|
||||||
|
)
|
||||||
|
|
||||||
|
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
|
||||||
|
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
|
||||||
|
if (titles.nonEmpty)
|
||||||
|
r.setTitle(titles.head)
|
||||||
|
else
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
|
||||||
|
val authors: List[ScholixEntityId] =
|
||||||
|
result.getAuthor.asScala
|
||||||
|
.map(a => {
|
||||||
|
val entity = new ScholixEntityId()
|
||||||
|
entity.setName(a.getFullname)
|
||||||
|
if (a.getPid != null && a.getPid.size() > 0)
|
||||||
|
entity.setIdentifiers(
|
||||||
|
a.getPid.asScala
|
||||||
|
.map(sp => {
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(sp.getValue)
|
||||||
|
id.setSchema(sp.getQualifier.getClassid)
|
||||||
|
id
|
||||||
|
})
|
||||||
|
.take(3)
|
||||||
|
.toList
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
entity
|
||||||
|
})
|
||||||
|
.toList
|
||||||
|
if (authors.nonEmpty)
|
||||||
|
r.setCreator(authors.asJava)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
val dt: List[String] = result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getDateofacceptance != null)
|
||||||
|
.map(i => i.getDateofacceptance.getValue)
|
||||||
|
.toList
|
||||||
|
if (dt.nonEmpty)
|
||||||
|
r.setPublicationDate(dt.distinct.head)
|
||||||
|
|
||||||
|
r.setPublisher(
|
||||||
|
result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.map(i => i.getHostedby)
|
||||||
|
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
|
||||||
|
.map(h => {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(h.getValue)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(h.getKey)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
eid
|
||||||
|
})
|
||||||
|
.distinct
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
|
r.setCollectedFrom(
|
||||||
|
result.getCollectedfrom.asScala
|
||||||
|
.map(cf => {
|
||||||
|
val scf = new ScholixCollectedFrom()
|
||||||
|
scf.setProvisionMode("collected")
|
||||||
|
scf.setCompletionStatus("complete")
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(cf.getValue)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(cf.getKey)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
scf.setProvider(eid)
|
||||||
|
scf
|
||||||
|
})
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
|
r
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
|
||||||
|
val s: Scholix = new Scholix
|
||||||
|
s.setSource(source)
|
||||||
|
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
|
||||||
|
s.setLinkprovider(
|
||||||
|
relation.collectedfrom
|
||||||
|
.map(cf => {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(cf.value)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(cf.key)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
eid
|
||||||
|
})
|
||||||
|
.toList
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
else {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName("OpenAIRE")
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
s.setLinkprovider(List(eid).asJava)
|
||||||
|
}
|
||||||
|
s.setIdentifier(relation.id)
|
||||||
|
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
|
||||||
|
if (semanticRelation == null)
|
||||||
|
return null
|
||||||
|
s.setRelationship(
|
||||||
|
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||||
|
)
|
||||||
|
s.setPublicationDate(source.getPublicationDate)
|
||||||
|
s.setPublisher(source.getPublisher)
|
||||||
|
val mockTarget = new ScholixResource
|
||||||
|
mockTarget.setDnetIdentifier(relation.target)
|
||||||
|
s.setTarget(mockTarget)
|
||||||
|
s
|
||||||
|
}
|
||||||
|
|
||||||
|
def updateTarget(s: Scholix, t: ScholixResource): String = {
|
||||||
|
|
||||||
|
s.setTarget(t)
|
||||||
|
val spublishers: Seq[ScholixEntityId] =
|
||||||
|
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
|
||||||
|
val tpublishers: Seq[ScholixEntityId] =
|
||||||
|
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
|
||||||
|
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
|
||||||
|
s.setPublisher(mergedPublishers.asJava)
|
||||||
|
mapper.writeValueAsString(s)
|
||||||
|
}
|
||||||
|
}
|
|
@@ -0,0 +1,141 @@
+package eu.dnetlib.dhp.sx.graph
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.schema.oaf.{
+  KeyValue,
+  OtherResearchProduct,
+  Publication,
+  Relation,
+  Result,
+  Software,
+  Dataset => OafDataset
+}
+import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
+import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log: Logger) {
+
+  /** Here all the spark applications runs this method
+    * where the whole logic of the spark node is defined
+    */
+  override def run(): Unit = {
+    val sourcePath = parser.get("sourcePath")
+    log.info("sourcePath: {}", sourcePath)
+    val targetPath = parser.get("targetPath")
+    log.info("targetPath: {}", targetPath)
+    generateBidirectionalRelations(sourcePath, targetPath, spark)
+    generateScholixResource(sourcePath, targetPath, spark)
+    generateScholix(targetPath, spark)
+  }
+
+  def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
+    val entityMap: Map[String, StructType] = Map(
+      "publication" -> Encoders.bean(classOf[Publication]).schema,
+      "dataset" -> Encoders.bean(classOf[OafDataset]).schema,
+      "software" -> Encoders.bean(classOf[Software]).schema,
+      "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
+    )
+
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
+
+    val resDs = spark.emptyDataset[ScholixResource]
+    val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
+      println(s"adding ${item._1}")
+      res.union(
+        spark.read
+          .schema(item._2)
+          .json(s"$inputPath/${item._1}")
+          .as[Result]
+          .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
+          .filter(s => s != null)
+      )
+    })
+    scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
+  }
+
+  def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
+    val relSchema = Encoders.bean(classOf[Relation]).schema
+
+    val relDF = spark.read
+      .schema(relSchema)
+      .json(s"$inputPath/relation")
+      .where(
+        "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
+          "and relClass <> 'merges' and relClass <> 'isMergedIn'"
+      )
+      .select("source", "target", "collectedfrom", "relClass")
+
+    def invRel: String => String = { s =>
+      ScholexplorerUtils.invRel(s)
+    }
+
+    import org.apache.spark.sql.functions.udf
+    val inverseRelationUDF = udf(invRel)
+    val inverseRelation = relDF.select(
+      col("target").alias("source"),
+      col("source").alias("target"),
+      col("collectedfrom"),
+      inverseRelationUDF(col("relClass")).alias("relClass")
+    )
+
+    val bidRel = inverseRelation
+      .union(relDF)
+      .withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
+      .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
+      .drop("collectedfrom")
+      .withColumnRenamed("cf", "collectedfrom")
+      .groupBy(col("id"))
+      .agg(
+        first("source").alias("source"),
+        first("target").alias("target"),
+        first("relClass").alias("relClass"),
+        first("collectedfrom").alias("collectedfrom")
+      )
+
+    bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
+
+  }
+
+  def generateScholix(outputPath: String, spark: SparkSession): Unit = {
+    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
+    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
+
+    import spark.implicits._
+    val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
+    val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
+
+    val scholix_one_verse = relations
+      .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
+      .map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
+      .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
+
+    val resourceTarget = relations
+      .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
+      .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
+
+    scholix_one_verse
+      .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
+      .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
+      .write
+      .mode(SaveMode.Overwrite)
+      .option("compression", "gzip")
+      .text(s"$outputPath/scholix")
+  }
+}
+
+object SparkCreateScholexplorerDump {
+  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
+
+  def main(args: Array[String]): Unit = {
+    new SparkCreateScholexplorerDump(
+      log = logger,
+      args = args,
+      propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
+    ).initialize().run()
+  }
+}
@@ -0,0 +1,26 @@
+package eu.dnetlib.dhp.sx.graph.scholix
+
+import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
+import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
+import org.junit.jupiter.api.Test
+import org.objenesis.strategy.StdInstantiatorStrategy
+
+class ScholixGenerationTest {
+
+  @Test
+  def generateScholix(): Unit = {
+
+    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
+    val app = new SparkCreateScholexplorerDump(null, null, null)
+//    app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
+//    app.generateBidirectionalRelations(
+//      "/home/sandro/Downloads/scholix_sample/",
+//      "/home/sandro/Downloads/scholix/",
+//      spark
+//    )
+    app.generateScholix("/home/sandro/Downloads/scholix/", spark)
+
+  }
+}
@@ -18,7 +18,7 @@
       <executions>
         <execution>
           <id>scala-compile-first</id>
-          <phase>initialize</phase>
+          <phase>process-resources</phase>
           <goals>
             <goal>add-source</goal>
             <goal>compile</goal>
@@ -59,12 +59,6 @@
     <dependency>
       <groupId>com.jayway.jsonpath</groupId>
      <artifactId>json-path</artifactId>
-      <exclusions>
-        <exclusion>
-          <groupId>org.slf4j</groupId>
-          <artifactId>slf4j-api</artifactId>
-        </exclusion>
-      </exclusions>
     </dependency>
     <dependency>
       <groupId>dom4j</groupId>
@@ -160,6 +154,26 @@
         <exclusion>
           <groupId>org.apache.zookeeper</groupId>
           <artifactId>zookeeper</artifactId>
         </exclusion>
+        <exclusion>
+          <artifactId>ant</artifactId>
+          <groupId>org.apache.ant</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>antlr4-runtime</artifactId>
+          <groupId>org.antlr</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>woodstox-core</artifactId>
+          <groupId>com.fasterxml.woodstox</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>log4j</artifactId>
+          <groupId>*</groupId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.logging.log4j</groupId>
+          <artifactId>*</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -206,5 +220,90 @@

   </dependencies>

+  <profiles>
+    <profile>
+      <id>spark-24</id>
+      <activation>
+        <activeByDefault>true</activeByDefault>
+      </activation>
+
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <version>3.4.0</version>
+            <executions>
+              <execution>
+                <phase>generate-sources</phase>
+                <goals>
+                  <goal>add-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>src/main/sparksolr-3</source>
+                  </sources>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+
+    <profile>
+      <id>spark-34</id>
+
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <version>3.4.0</version>
+            <executions>
+              <execution>
+                <phase>generate-sources</phase>
+                <goals>
+                  <goal>add-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>src/main/sparksolr-4</source>
+                  </sources>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+
+    <profile>
+      <id>spark-35</id>
+
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <version>3.4.0</version>
+            <executions>
+              <execution>
+                <phase>generate-sources</phase>
+                <goals>
+                  <goal>add-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>src/main/sparksolr-4</source>
+                  </sources>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+
 </project>
@@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
 import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
 import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
 import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
+import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
         .javaRDD()
         .map(
           t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
-      SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
+      DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
     }

   }
@@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
 import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;

 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;

-import javax.swing.text.html.Option;
-
 import org.apache.commons.lang3.StringUtils;
 import org.stringtemplate.v4.ST;

@@ -170,30 +170,19 @@ public class XmlSerializationUtils {
     return sb.toString();
   }

-  // <measure downloads="0" views="0">infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE</measure>
+  // <measure views="0" datasource="infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE" />
+  // <measure downloads="0" datasource="infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE" />
   public static String usageMeasureAsXmlElement(String name, Measure measure) {
-    HashSet<String> dsIds = Optional
-      .ofNullable(measure.getUnit())
-      .map(
-        m -> m
-          .stream()
-          .map(KeyValue::getKey)
-          .collect(Collectors.toCollection(HashSet::new)))
-      .orElse(new HashSet<>());
-
     StringBuilder sb = new StringBuilder();
-    dsIds.forEach(dsId -> {
+    for (KeyValue kv : measure.getUnit()) {
       sb
         .append("<")
-        .append(name);
-      for (KeyValue kv : measure.getUnit()) {
-        sb.append(" ").append(attr(measure.getId(), kv.getValue()));
-      }
-      sb
+        .append(name)
         .append(" ")
-        .append(attr("datasource", dsId))
-        .append("/>");
-    });
+        .append(attr(measure.getId(), kv.getValue()))
+        .append(attr("datasource", kv.getKey()))
+        .append(" />");
+    }
     return sb.toString();
   }

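Review note on the rewrite above: the method now emits one element per KeyValue unit, carrying the measure id as the attribute name and the datasource key explicitly, e.g. <measure downloads="10" datasource="10|ds1::abc||OpenAIRE" />. A self-contained sketch of just that loop, with a local stand-in for KeyValue and for the attr() helper (assumed to emit name="value" plus a trailing space); this is illustrative only, not the project class:

import java.util.Arrays;
import java.util.List;

class UsageMeasureSketch {
    static final class Unit {
        final String key, value;
        Unit(String k, String v) { key = k; value = v; }
    }

    // mirrors the new loop: one element per unit, datasource taken from the unit key
    static String usageMeasureAsXmlElement(String name, String measureId, List<Unit> units) {
        final StringBuilder sb = new StringBuilder();
        for (Unit kv : units) {
            sb.append("<").append(name)
                .append(" ").append(attr(measureId, kv.value))
                .append(attr("datasource", kv.key))
                .append(" />");
        }
        return sb.toString();
    }

    // stand-in for XmlSerializationUtils.attr (assumed behaviour)
    static String attr(String name, String value) {
        return name + "=\"" + value + "\" ";
    }

    public static void main(String[] args) {
        System.out.println(usageMeasureAsXmlElement("measure", "downloads",
            Arrays.asList(new Unit("10|ds1::abc||DS One", "10"), new Unit("10|ds2::def||DS Two", "3"))));
    }
}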
@@ -15,8 +15,8 @@
     </property>
     <property>
         <name>validateXML</name>
-        <description>should the payload converter validate the XMLs</description>
         <value>false</value>
+        <description>should the payload converter validate the XMLs</description>
     </property>
     <property>
         <name>relPartitions</name>
@@ -0,0 +1,12 @@
+package eu.dnetlib.dhp.sparksolr;
+
+import com.lucidworks.spark.util.SolrSupport;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.spark.rdd.RDD;
+
+public class DHPSolrSupport {
+
+    static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
+        SolrSupport.indexDocs(zkhost, collection, batchSize, docs);
+    }
+}
@ -0,0 +1,12 @@
|
|
package eu.dnetlib.dhp.sparksolr;
|
|
|
|
import com.lucidworks.spark.util.SolrSupport;
|
|
import org.apache.solr.common.SolrInputDocument;
|
|
import org.apache.spark.rdd.RDD;
|
|
|
|
public class DHPSolrSupport {
|
|
|
|
static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
|
|
SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs);
|
|
}
|
|
}
|
|
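Both copies of DHPSolrSupport above expose the same indexDocs signature while delegating to whichever spark-solr call the active build provides (the second variant also passes com.lucidworks.spark.BatchSizeType.NUM_DOCS, presumably for the newer library), so code that calls the wrapper stays identical across profiles. A hedged usage sketch, with placeholder ZooKeeper address, collection name, and field name:

import java.util.Collections;

import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;

public class IndexDocsExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("index-docs-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("__indexrecordidentifier", "doc-1"); // field name is illustrative

            JavaRDD<SolrInputDocument> docs = sc.parallelize(Collections.singletonList(doc));

            // zkHost and collection are placeholders; documents are sent in batches of 1000
            DHPSolrSupport.indexDocs("zk1:2181,zk2:2181/solr", "openaire_index", 1000, docs.rdd());
        }
    }
}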
@ -16,11 +16,11 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -10,11 +10,11 @@
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -8,6 +8,8 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -67,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -85,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -109,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -127,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -176,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -204,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,11 +10,11 @@
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -8,6 +8,9 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -66,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -84,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -108,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -126,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -175,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -203,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,11 +10,11 @@
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<build>
|
<build>
|
||||||
|
|
|
@ -8,6 +8,9 @@ fi
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$2
|
export HADOOP_USER_NAME=$2
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||||
|
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||||
|
|
||||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
|
|
||||||
|
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
function print_elapsed_time()
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
{
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
elapsed_time=$(($end_time-$start_time))
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
hours=$((elapsed_time / 3600))
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -66,7 +70,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -84,17 +92,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -108,17 +129,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -126,12 +143,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -175,7 +197,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -203,11 +227,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,8 @@ then
|
||||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
|
||||||
|
|
||||||
|
|
||||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||||
|
@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
|
||||||
done
|
done
|
||||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||||
|
|
||||||
|
@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||||
# Set sed arguments.
|
# Set sed arguments.
|
||||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||||
|
|
||||||
# Set the SED command arguments for column-names with reserved words:
|
|
||||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
|
||||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
|
||||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
|
||||||
|
|
||||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
|
||||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
|
||||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
|
||||||
|
|
||||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
|
||||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
|
||||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
|
||||||
|
|
||||||
|
|
||||||
export HADOOP_USER_NAME=$6
|
export HADOOP_USER_NAME=$6
|
||||||
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
||||||
|
|
||||||
|
|
||||||
|
function print_elapsed_time()
|
||||||
|
{
|
||||||
|
start_time=$1
|
||||||
|
end_time=$(date +%s)
|
||||||
|
elapsed_time=$(($end_time-$start_time))
|
||||||
|
hours=$((elapsed_time / 3600))
|
||||||
|
minutes=$(((elapsed_time % 3600) / 60))
|
||||||
|
seconds=$((elapsed_time % 60))
|
||||||
|
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function copydb() {
|
function copydb() {
|
||||||
db=$1
|
db=$1
|
||||||
|
start_db_time=$(date +%s)
|
||||||
echo -e "\nStart processing db: '${db}'..\n"
|
echo -e "\nStart processing db: '${db}'..\n"
|
||||||
|
|
||||||
# Delete the old DB from Impala cluster (if exists).
|
# Delete the old DB from Impala cluster (if exists).
|
||||||
|
@ -68,7 +72,11 @@ function copydb() {
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 2
|
exit 2
|
||||||
|
else
|
||||||
|
return 2
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||||
|
@ -86,17 +94,30 @@ function copydb() {
|
||||||
-pb \
|
-pb \
|
||||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||||
|
|
||||||
# Check the exit status of the "hadoop distcp" command.
|
if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
|
||||||
if [ $? -eq 0 ]; then
|
echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
|
||||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
exit 3
|
exit 3
|
||||||
|
else
|
||||||
|
return 3
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
# Give WRITE and EXECUTE permissions to the DBs' directory only, in order to be able to create more tables later, on top of that DB.
|
||||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
|
||||||
|
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
|
||||||
|
if [ $? -ne 0 ]; then # Check the exit status..
|
||||||
|
echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
|
||||||
|
rm -f error.log
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 4
|
||||||
|
else
|
||||||
|
return 4
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo -e "\nCreating schema for db: '${db}'\n"
|
echo -e "\nCreating schema for db: '${db}'\n"
|
||||||
|
|
||||||
|
@ -110,17 +131,13 @@ function copydb() {
|
||||||
num_tables=0
|
num_tables=0
|
||||||
|
|
||||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
|
||||||
|
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
|
||||||
if [ -n "$create_view_statement_test" ]; then
|
if [ -n "$create_view_statement_test" ]; then
|
||||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
|
||||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
|
||||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
|
||||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
|
||||||
all_create_view_statements+=("$create_view_statement")
|
all_create_view_statements+=("$create_view_statement")
|
||||||
else
|
else
|
||||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||||
|
@ -128,12 +145,17 @@ function copydb() {
|
||||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 5
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||||
if [ -n "$log_errors" ]; then
|
if [ -n "$log_errors" ]; then
|
||||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||||
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
@ -177,7 +199,9 @@ function copydb() {
|
||||||
|
|
||||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||||
exit 5
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 7
|
||||||
|
fi
|
||||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||||
else
|
else
|
||||||
|
@ -205,11 +229,14 @@ function copydb() {
|
||||||
else
|
else
|
||||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
exit 6
|
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
|
||||||
|
exit 8
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rm -f error.log
|
rm -f error.log
|
||||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
echo -e "\n\nFinished processing db: ${db}\n"
|
||||||
|
print_elapsed_time start_db_time
|
||||||
}
|
}
|
||||||
|
|
||||||
STATS_DB=$1
|
STATS_DB=$1
|
||||||
|
|
|
@@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
 with
 lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
 lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
-lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification')
-select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3
+lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
+lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
+select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
 from lvl1
 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
-join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4);
+join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
+join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);

 DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;

@@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
@@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
 			.map(
 				(MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class),
 				Encoders.bean(Software.class))
-			.filter(t -> t.getCodeRepositoryUrl() != null)
+			.filter((FilterFunction<Software>) t -> t.getCodeRepositoryUrl() != null)
 			.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
 	}

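The change above replaces the bare lambda passed to Dataset.filter with an explicit FilterFunction<Software> cast, so the compiler resolves the typed Java overload of filter(...) unambiguously (when building against Scala 2.12, a plain lambda can also match the scala.Function1 overload). A minimal, self-contained sketch of the same pattern, not taken from the repository (the class name and sample data are illustrative only):

import java.util.Arrays;

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class FilterFunctionExample {

	public static void main(String[] args) {
		SparkSession spark = SparkSession
			.builder()
			.appName("filter-function-example")
			.master("local[*]")
			.getOrCreate();

		// A typed Dataset<String>; in the real job this would hold JSON-encoded Software records.
		Dataset<String> lines = spark
			.createDataset(Arrays.asList("a", "", "b"), Encoders.STRING());

		Dataset<String> nonEmpty = lines
			// The explicit FilterFunction cast selects the typed filter(...) overload.
			.filter((FilterFunction<String>) s -> !s.isEmpty())
			// The MapFunction cast plays the same role for the typed map(...) overload.
			.map((MapFunction<String, String>) String::toUpperCase, Encoders.STRING());

		nonEmpty.show();
		spark.stop();
	}
}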
@@ -39,8 +39,8 @@
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-		<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
-		<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
+		<cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
+		<cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
 	</properties>

 	<dependencies>
@@ -72,6 +72,12 @@
 			<groupId>org.apache.hadoop</groupId>
 			<artifactId>hadoop-common</artifactId>
 			<version>${cdh.hadoop.version}</version>
+			<exclusions>
+				<exclusion>
+					<groupId>jdk.tools</groupId>
+					<artifactId>jdk.tools</artifactId>
+				</exclusion>
+			</exclusions>
 		</dependency>
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>

@@ -39,8 +39,8 @@
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-		<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version>
-		<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version>
+		<cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
+		<cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
 	</properties>

 	<dependencies>
@@ -67,11 +67,23 @@
 			<groupId>org.apache.hive</groupId>
 			<artifactId>hive-jdbc</artifactId>
 			<version>${cdh.hive.version}</version>
+			<exclusions>
+				<exclusion>
+					<groupId>jdk.tools</groupId>
+					<artifactId>jdk.tools</artifactId>
+				</exclusion>
+			</exclusions>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.hadoop</groupId>
 			<artifactId>hadoop-common</artifactId>
 			<version>${cdh.hadoop.version}</version>
+			<exclusions>
+				<exclusion>
+					<groupId>jdk.tools</groupId>
+					<artifactId>jdk.tools</artifactId>
+				</exclusion>
+			</exclusions>
 		</dependency>
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>

pom.xml
@@ -13,7 +13,8 @@
 			<distribution>repo</distribution>
 			<comments>This program is free software: you can redistribute it and/or modify it under the terms of the
 				GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
-				License, or (at your option) any later version.</comments>
+				License, or (at your option) any later version.
+			</comments>
 		</license>
 	</licenses>

@@ -22,6 +23,7 @@
 		<module>dhp-pace-core</module>
 		<module>dhp-common</module>
 		<module>dhp-workflows</module>
+		<module>dhp-shade-package</module>
 	</modules>

 	<issueManagement>
@@ -47,6 +49,19 @@
 	</pluginRepositories>

 	<repositories>
+
+		<repository>
+			<id>Openaire-third-parties-snaphot</id>
+			<name>Openaire third parties Snapshot</name>
+			<url>https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/</url>
+			<releases>
+				<enabled>false</enabled>
+			</releases>
+			<snapshots>
+				<enabled>true</enabled>
+			</snapshots>
+		</repository>
+
 		<repository>
 			<id>dnet45-releases</id>
 			<name>D-Net 45 releases</name>
@@ -125,6 +140,13 @@
 	</repositories>

 	<dependencies>
+		<!-- Quick FIX not to remove lombok everywhere -->
+		<dependency>
+			<groupId>org.projectlombok</groupId>
+			<artifactId>lombok</artifactId>
+			<version>1.18.28</version>
+			<scope>provided</scope>
+		</dependency>
 		<dependency>
 			<groupId>org.junit.jupiter</groupId>
 			<artifactId>junit-jupiter</artifactId>
@@ -152,7 +174,7 @@
 		<dependencies>
 			<dependency>
 				<groupId>eu.dnetlib.dhp</groupId>
-				<artifactId>${dhp-schemas.artifact}</artifactId>
+				<artifactId>dhp-schemas</artifactId>
 				<version>${dhp-schemas.version}</version>
 			</dependency>
 			<dependency>
@@ -206,33 +228,76 @@

 		<dependency>
 			<groupId>org.slf4j</groupId>
-			<artifactId>jcl-over-slf4j</artifactId>
-			<version>1.7.25</version>
+			<artifactId>slf4j-api</artifactId>
+			<version>${org.slf4j.version}</version>
 			<scope>provided</scope>
 		</dependency>

+		<dependency>
+			<groupId>org.slf4j</groupId>
+			<artifactId>slf4j-log4j12</artifactId>
+			<version>${org.slf4j.version}</version>
+			<scope>provided</scope>
+		</dependency>
+
+		<dependency>
+			<groupId>org.slf4j</groupId>
+			<artifactId>jcl-over-slf4j</artifactId>
+			<version>${org.slf4j.version}</version>
+			<scope>provided</scope>
+		</dependency>
+
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-slf4j2-impl</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-api</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-core</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<!-- API bridge between log4j 1 and 2 -->
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-1.2-api</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
 			<version>${dhp.commons.lang.version}</version>
 		</dependency>

+		<dependency>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-beanutils</artifactId>
+			<version>${commons-beanutils.version}</version>
+		</dependency>
+
 		<dependency>
 			<groupId>commons-validator</groupId>
 			<artifactId>commons-validator</artifactId>
-			<version>1.7</version>
+			<version>${commons-validator.version}</version>
 		</dependency>

 		<dependency>
 			<groupId>com.github.sisyphsu</groupId>
 			<artifactId>dateparser</artifactId>
-			<version>1.0.7</version>
+			<version>${dateparser.version}</version>
 		</dependency>

 		<dependency>
 			<groupId>me.xuender</groupId>
 			<artifactId>unidecode</artifactId>
-			<version>0.0.7</version>
+			<version>${unidecode.version}</version>
 		</dependency>

 		<dependency>
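The dependency changes above move the parent pom from the old slf4j 1.x / log4j 1.2 pairing to slf4j with the log4j-slf4j2-impl binding, plus the log4j-1.2-api bridge for libraries that still call log4j 1 directly. Application code is expected to keep logging through the slf4j API only, so it compiles unchanged regardless of the backend. A minimal sketch, not part of the repository (class name and messages are illustrative):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggingExample {

	// slf4j-api is the only logging artifact the code compiles against;
	// the binding on the runtime classpath (here log4j-slf4j2-impl) routes these calls to log4j 2.
	private static final Logger log = LoggerFactory.getLogger(LoggingExample.class);

	public static void main(String[] args) {
		log.info("processing db: {}", "stats_db");
		try {
			throw new IllegalStateException("simulated failure");
		} catch (IllegalStateException e) {
			// Parameterized messages and attached exceptions work the same with any backend.
			log.error("copy failed for db: {}", "stats_db", e);
		}
	}
}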
@@ -245,13 +310,13 @@
 		<dependency>
 			<groupId>commons-codec</groupId>
 			<artifactId>commons-codec</artifactId>
-			<version>1.9</version>
+			<version>${commons-codec.version}</version>
 		</dependency>

 		<dependency>
 			<groupId>commons-io</groupId>
 			<artifactId>commons-io</artifactId>
-			<version>2.4</version>
+			<version>${commons-io.version}</version>
 		</dependency>

 		<dependency>
@@ -345,7 +410,7 @@
 		<dependency>
 			<groupId>org.apache.zookeeper</groupId>
 			<artifactId>zookeeper</artifactId>
-			<version>3.4.11</version>
+			<version>${zookeeper.version}</version>
 		</dependency>

 		<dependency>
@@ -415,6 +480,7 @@
 			<artifactId>cxf-rt-transports-http</artifactId>
 			<version>3.1.5</version>
 		</dependency>
+
 		<dependency>
 			<groupId>javax.persistence</groupId>
 			<artifactId>javax.persistence-api</artifactId>
@@ -504,16 +570,11 @@
 			<artifactId>commons-compress</artifactId>
 			<version>${common.compress.version}</version>
 		</dependency>
-
-		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-csv</artifactId>
 			<version>${common.csv.version}</version>
 		</dependency>
-
-
-		<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
 		<dependency>
 			<groupId>org.apache.poi</groupId>
 			<artifactId>poi-ooxml</artifactId>
@@ -568,14 +629,12 @@
 			<scope>provided</scope>
 		</dependency>

-
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-math3</artifactId>
 			<version>3.6.1</version>
 		</dependency>

-
 		<dependency>
 			<groupId>com.google.code.gson</groupId>
 			<artifactId>gson</artifactId>
@@ -596,7 +655,7 @@
 		<dependency>
 			<groupId>org.reflections</groupId>
 			<artifactId>reflections</artifactId>
-			<version>0.9.10</version>
+			<version>${reflections.version}</version>
 		</dependency>

 		<dependency>
@@ -610,6 +669,12 @@
 				<artifactId>icu4j</artifactId>
 				<version>70.1</version>
 			</dependency>
+
+			<dependency>
+				<groupId>org.javassist</groupId>
+				<artifactId>javassist</artifactId>
+				<version>${javassist.version}</version>
+			</dependency>

 		</dependencies>
 	</dependencyManagement>
@@ -677,6 +742,7 @@
 				<version>3.0.0-M4</version>
 				<configuration>
 					<redirectTestOutputToFile>true</redirectTestOutputToFile>
+					<trimStackTrace>false</trimStackTrace>
 				</configuration>
 			</plugin>
 			<plugin>
@@ -746,7 +812,7 @@
 			<plugin>
 				<groupId>net.revelc.code</groupId>
 				<artifactId>impsort-maven-plugin</artifactId>
-				<version>1.4.1</version>
+				<version>1.6.2</version>
 				<configuration>
 					<groups>java.,javax.,org.,com.</groups>
 					<staticGroups>java,*</staticGroups>
@@ -767,7 +833,9 @@
 				<groupId>org.antipathy</groupId>
 				<artifactId>mvn-scalafmt_${scala.binary.version}</artifactId>
 				<configuration>
-					<configLocation>https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
+					<configLocation>
+						https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
+					</configLocation>
 					<skipTestSources>false</skipTestSources>
 					<skipSources>false</skipSources>
 					<sourceDirectories>
@@ -798,7 +866,7 @@
 			<plugin>
 				<groupId>org.jacoco</groupId>
 				<artifactId>jacoco-maven-plugin</artifactId>
-				<version>0.7.9</version>
+				<version>0.8.10</version>
 				<configuration>
 					<excludes>
 						<exclude>**/schemas/*</exclude>
@@ -866,90 +934,174 @@
 		<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
 		<maven.compiler.source>1.8</maven.compiler.source>
 		<maven.compiler.target>1.8</maven.compiler.target>
-		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
-		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
-		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
-		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
-		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
-		<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
-		<sparksolr.version>3.6.0</sparksolr.version>
-		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
-		<dhp.jackson.version>2.9.6</dhp.jackson.version>
-		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
-		<dhp.site.skip>true</dhp.site.skip>
-		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<!-- scala version -->
 		<scala.version>2.11.12</scala.version>
 		<scala.binary.version>2.11</scala.binary.version>
-		<scala-xml.version>1.3.0</scala-xml.version>
-		<junit-jupiter.version>5.6.1</junit-jupiter.version>
-		<mockito-core.version>3.3.3</mockito-core.version>
-		<mongodb.driver.version>3.4.2</mongodb.driver.version>
-		<vtd.version>[2.12,3.0)</vtd.version>
+
+		<!-- plugin versions -->
+		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
+		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
+		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
+		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
+
+		<!-- dependency versions -->
+		<apache.poi.version>4.1.2</apache.poi.version>
+		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
+		<common.compress.version>1.20</common.compress.version>
+		<common.csv.version>1.8</common.csv.version>
+		<common.text.version>1.8</common.text.version>
+		<commons-beanutils.version>1.9.4</commons-beanutils.version>
+		<commons-codec.version>1.9</commons-codec.version>
+		<commons.collections.version>3.2.1</commons.collections.version>
+		<commons-io.version>2.4</commons-io.version>
+		<commons.logging.version>1.1.3</commons.logging.version>
+		<commons-validator.version>1.7</commons-validator.version>
+		<dateparser.version>1.0.7</dateparser.version>
 		<dhp-schemas.version>[6.1.2]</dhp-schemas.version>
+		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
+		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
+		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
+		<dhp.jackson.version>2.9.6</dhp.jackson.version>
+		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
+		<dhp.site.skip>true</dhp.site.skip>
+		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
-		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
-		<solr.version>7.5.0</solr.version>
-		<okhttp.version>4.7.2</okhttp.version>
-		<common.compress.version>1.20</common.compress.version>
+		<google.gson.version>2.2.2</google.gson.version>
+		<log4j.version>1.2.17</log4j.version>
+		<javassist.version>3.19.0-GA</javassist.version>
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
-		<common.csv.version>1.8</common.csv.version>
-		<apache.poi.version>4.1.2</apache.poi.version>
-		<common.text.version>1.8</common.text.version>
+		<junit-jupiter.version>5.6.1</junit-jupiter.version>
+		<mockito-core.version>3.3.3</mockito-core.version>
+		<mongodb.driver.version>3.4.2</mongodb.driver.version>
+		<okhttp.version>4.7.2</okhttp.version>
 		<org.apache.httpcomponents.version>4.5.3</org.apache.httpcomponents.version>
-		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
-		<google.gson.version>2.2.2</google.gson.version>
-		<commons.logging.version>1.1.3</commons.logging.version>
-		<commons.collections.version>3.2.1</commons.collections.version>
+		<org.slf4j.version>1.7.25</org.slf4j.version>
+		<reflections.version>0.9.10</reflections.version>
+		<scala-xml.version>1.3.0</scala-xml.version>
+		<solr.version>7.5.0</solr.version>
+		<sparksolr.version>3.6.0</sparksolr.version>
+		<unidecode.version>0.0.7</unidecode.version>
+		<vtd.version>[2.12,3.0)</vtd.version>
+		<zookeeper.version>3.4.6</zookeeper.version>
 	</properties>

 	<!-- Build with scala 12 and Spark 3.4 -->
 	<profiles>
 		<profile>
-			<id>scala-2.12</id>
+			<id>spark-34</id>
 			<properties>
 				<scala.binary.version>2.12</scala.binary.version>
 				<scala.version>2.12.18</scala.version>
-				<!-- scala-xml.version>2.1.0</scala-xml.version -->
-				<sparksolr.version>4.0.2</sparksolr.version>
-				<dhp.spark.version>3.4.1</dhp.spark.version>
-				<dhp.jackson.version>2.14.2</dhp.jackson.version>
-				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
-				<json4s.version>3.7.0-M11</json4s.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+
+				<!-- plugin versions -->
 				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
-				<!--
-				<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
-				<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
-				-->
+
+				<!-- dependencies -->
+				<common.compress.version>1.22</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.15</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.11.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.4.2.openaire</dhp.spark.version>
+				<dhp.jackson.version>2.14.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.19.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.6</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
 			</properties>
 		</profile>

-		<!-- Activate ARM-compatible snappy dependency on new Silicon Macs -->
 		<profile>
-			<id>arm-silicon-mac</id>
+			<id>spark-35</id>
+			<properties>
+				<scala.binary.version>2.12</scala.binary.version>
+				<scala.version>2.12.18</scala.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+
+				<!-- plugin versions -->
+				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
+
+				<!-- dependencies -->
+				<common.compress.version>1.23.0</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.16.0</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.13.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.5.1.openaire-SNAPSHOT</dhp.spark.version>
+				<dhp.jackson.version>2.15.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.20.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.7</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
+			</properties>
+		</profile>
+
+		<profile>
+			<id>java11</id>
 			<activation>
-				<os>
-					<arch>aarch64</arch>
-					<family>mac</family>
-				</os>
+				<jdk>[11</jdk>
 			</activation>
-			<dependencyManagement>
-				<dependencies>
-					<dependency>
-						<groupId>org.xerial.snappy</groupId>
-						<artifactId>snappy-java</artifactId>
-						<version>1.1.8.4</version>
-					</dependency>
-				</dependencies>
-			</dependencyManagement>
+			<build>
+				<pluginManagement>
+					<plugins>
+						<plugin>
+							<groupId>org.apache.maven.plugins</groupId>
+							<artifactId>maven-surefire-plugin</artifactId>
+							<version>3.0.0-M4</version>
+							<configuration>
+								<!-- only for java 11+ to run spark in tests -->
+								<argLine>--add-opens=java.base/java.lang=ALL-UNNAMED
+									--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
+									--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
+									--add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED
+									--add-opens=java.base/java.nio=ALL-UNNAMED
+									--add-opens=java.base/java.util=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
+									--add-opens=java.base/sun.security.action=ALL-UNNAMED
+									--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+								</argLine>
+								<redirectTestOutputToFile>true</redirectTestOutputToFile>
+								<trimStackTrace>false</trimStackTrace>
+							</configuration>
+						</plugin>
+					</plugins>
+				</pluginManagement>
+			</build>
 		</profile>
 	</profiles>
 </project>
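The java11 profile above activates on JDK 11 and newer and passes a set of --add-opens flags to maven-surefire-plugin, because Spark needs reflective access to JDK internals that are closed by default since Java 9. A minimal sketch of the kind of test that relies on that argLine, not taken from the repository (class and test names are illustrative):

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

public class LocalSparkSmokeTest {

	@Test
	public void sparkStartsOnJdk11() {
		// Without the --add-opens flags configured for surefire, bootstrapping a local
		// SparkSession on JDK 11+ typically fails with reflective-access errors.
		SparkSession spark = SparkSession
			.builder()
			.appName("jdk11-smoke-test")
			.master("local[1]")
			.getOrCreate();

		Dataset<Integer> ds = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT());
		Assertions.assertEquals(3, ds.count());

		spark.stop();
	}
}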