This commit is contained in:
Claudio Atzori 2024-05-27 11:59:02 +02:00
commit 0d5bdb2db0
56 changed files with 2435 additions and 1229 deletions

1
.gitignore vendored
View File

@ -27,3 +27,4 @@ spark-warehouse
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf /**/.scalafmt.conf
/.java-version /.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder; mojo.outputFile = testFolder;
// execute // execute
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); try {
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
} }
@Test @Test

View File

@ -70,10 +70,7 @@
<groupId>com.ibm.icu</groupId> <groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId> <artifactId>icu4j</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
@ -163,7 +160,7 @@
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>${dhp-schemas.artifact}</artifactId> <artifactId>dhp-schemas</artifactId>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class PacePerson.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt"))); "/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) { } catch (Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }

View File

@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930 * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930 * concept_rec_id = 656930
* @return response code * @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/ */
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException { public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id, 1); setDepositionId(concept_rec_id, 1);

View File

@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders; import org.apache.http.HttpHeaders;
import org.joda.time.Instant;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -154,5 +154,13 @@
"unknown":{ "unknown":{
"original":"Unknown", "original":"Unknown",
"inverse":"Unknown" "inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
} }
} }

View File

@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val master = parser.get("master") val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master") log.info(s"Creating Spark session: Master: $master")
SparkSession val b = SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(master) if (master != null)
.getOrCreate() b.master(master)
b.getOrCreate()
} }
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
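The builder above now applies .master() only when a master was actually parsed from the arguments, so a master configured externally (for example by spark-submit on YARN) is no longer overridden by a null value. A minimal sketch of the same pattern, with the argument parser replaced by a plain Option purely for illustration:

// Illustrative sketch only: conf and the app name are placeholders; the point
// is that master() is set conditionally instead of unconditionally.
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

def createSession(conf: SparkConf, master: Option[String]): SparkSession = {
  val builder = SparkSession.builder().config(conf).appName("example")
  // only force the master when one was provided on the command line
  master.fold(builder)(m => builder.master(m)).getOrCreate()
}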

View File

@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
} }
def generateScholixResourceFromResult(r: Result): ScholixResource = { def generateScholixResourceFromResult(r: Result): ScholixResource = {
val sum = ScholixUtils.resultToSummary(r)
if (sum != null)
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
else
null
} }
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
} }
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication]) // s.setTypology(r.getResulttype.getClassid)
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
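The invRel helper added above resolves the inverse of a relation label through the module's relation vocabulary; the lookup key is lower-cased, and unknown labels yield null rather than an exception. A small illustrative sketch (the expected values assume the vocabulary pairs shown later in this commit, such as IsPartOf/HasPart):

// REPL-style illustration; ScholixUtils is assumed to be in scope.
val inverse = ScholixUtils.invRel("IsPartOf")     // expected: "HasPart"
val missing = ScholixUtils.invRel("NotARelation") // expected: null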

View File

@ -24,7 +24,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>initialize</phase> <phase>process-resources</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -59,14 +59,6 @@
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
@ -91,10 +83,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
@ -113,4 +101,90 @@
</dependency> </dependency>
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -1,12 +1,6 @@
package eu.dnetlib.pace.common; package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
@ -15,6 +9,13 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *

View File

@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath} import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.MapDocumentUtil import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(RowEncoder(schema)) df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
} }
def rowFromJson(json: String): Row = { def rowFromJson(json: String): Row = {

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
RowEncoder(schema)
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
ExpressionEncoder(schema)
}
}
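The two objects above share the same package and signature, so calling code compiles unchanged whichever source directory the active Maven profile (spark-24/spark-34 versus spark-35) adds to the build: up to Spark 3.4 the row encoder is obtained through RowEncoder(schema), while under Spark 3.5 it is built with ExpressionEncoder(schema). A hedged sketch of a caller, mirroring how SparkModel and the dedup jobs use the shim:

// Illustration only: an empty Dataset[Row] built through the compatibility shim.
import eu.dnetlib.pace.util.SparkCompatUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val schema = StructType.fromDDL("id STRING, lastUsage STRING")
val empty = spark.emptyDataset(SparkCompatUtils.encoderFor(schema)) // Dataset[Row]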

View File

@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest { public class UtilTest {

169
dhp-shade-package/pom.xml Normal file
View File

@ -0,0 +1,169 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module creates a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
<relocation>
<pattern>com</pattern>
<shadedPattern>repackaged.com.google.common</shadedPattern>
<includes>
<include>com.google.common.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations) .union(dataciteRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }

View File

@ -10,6 +10,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD resultsRDD
.union(projectsRDD) .union(projectsRDD)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }
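Both hunks above switch the action-set output from GzipCodec to BZip2Codec. In Hadoop, bzip2-compressed files are splittable while gzip files are not, so downstream jobs can process a single large sequence file in parallel. A hedged sketch of the call shape (the pair RDD and output path are placeholders):

// Sketch, not the project's actual job: write a pair RDD as a sequence file
// compressed with the splittable bzip2 codec.
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD

def save(pairs: RDD[(Text, Text)], outputPath: String): Unit =
  pairs.saveAsHadoopFile(
    outputPath,
    classOf[Text],
    classOf[Text],
    classOf[SequenceFileOutputFormat[Text, Text]],
    classOf[BZip2Codec])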

View File

@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
tp._1 match { tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2) case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2) case "print" => journal.setIssnPrinted(tp._2)
case _ =>
} }
}) })
} }
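The added case _ => gives the ISSN-type match a no-op default branch. A Scala match that receives a value not covered by any case throws scala.MatchError at runtime, so an unexpected ISSN type coming from Crossref would otherwise abort the whole mapping. A minimal illustration of the failure mode, independent of the Crossref model:

// Standalone sketch of why the default branch matters.
def handleIssnType(issnType: String): Unit = issnType match {
  case "electronic" => println("set online ISSN")
  case "print"      => println("set printed ISSN")
  // without the following line, handleIssnType("pu") throws scala.MatchError
  case _            => // ignore unknown types
}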

View File

@ -94,7 +94,8 @@ object MagUtility extends Serializable {
) )
di di
} }
val datatypedict = Map(
val datatypedict = Map(
"bool" -> BooleanType, "bool" -> BooleanType,
"int" -> IntegerType, "int" -> IntegerType,
"uint" -> IntegerType, "uint" -> IntegerType,
@ -505,8 +506,6 @@ val datatypedict = Map(
) )
) )
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}") result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")

View File

@ -35,8 +35,6 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = { def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
import spark.implicits._ import spark.implicits._
spark.read spark.read
.load(s"$magBasePath/mag_denormalized") .load(s"$magBasePath/mag_denormalized")
.as[MAGPaper] .as[MAGPaper]

View File

@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.InputStream import java.io.{ByteArrayInputStream, InputStream}
import scala.io.Source import java.nio.charset.Charset
import scala.xml.pull.XMLEventReader import javax.xml.stream.XMLInputFactory
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else } else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
} catch { } catch {
case e: Throwable => case e: Throwable =>
println(s"Error on requesting ${r.getURI}") println(s"Error on requesting ${r.getURI}")
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString( IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream( SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
) ),
Charset.defaultCharset()
) )
) )
parser.parseArgument(args) parser.parseArgument(args)
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath) log.info("workingPath: {}", workingPath)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion") val targetPath = parser.get("targetPath")
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion) log.info("targetPath: {}", targetPath)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val hdfsServerUri = parser.get("hdfsServerUri") val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", hdfsServerUri) log.info("hdfsServerUri: {}", targetPath)
val skipUpdate = parser.get("skipUpdate") val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate) log.info("skipUpdate: {}", skipUpdate)
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset( val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz")) k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => { .flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}) })
) )
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies)) .map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf] .as[Oaf]
.filter(p => p != null), .filter(p => p != null),
s"$outputBasePath/$MDSTORE_DATA_PATH" targetPath
) )
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
} }
} }
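The changes above replace scala.xml.pull.XMLEventReader, which is gone from current scala-xml releases, with the StAX event reader from javax.xml.stream, and pass an explicit Charset to IOUtils.toString, whose charset-less overload is deprecated. A hedged sketch of the new parsing entry point (the XML payload is a placeholder):

// Sketch of the javax.xml.stream replacement for scala.xml.pull.XMLEventReader.
import java.io.ByteArrayInputStream
import javax.xml.stream.XMLInputFactory

val xmlPayload = "<PubmedArticleSet></PubmedArticleSet>" // placeholder record
val factory = XMLInputFactory.newInstance
val events = factory.createXMLEventReader(new ByteArrayInputStream(xmlPayload.getBytes))
while (events.hasNext) {
  val event = events.nextEvent() // javax.xml.stream.events.XMLEvent
  // PMParser consumes these events to build PMArticle records
}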

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.sx.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import javax.xml.stream.XMLEventReader
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml /** @param xml
*/ */

View File

@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -119,7 +119,9 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI", workingDir.toString() + "/COCI",
"-outputPath", "-outputPath",
workingDir.toString() + "/COCI_json/", workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5" "-inputFile", "input1;input2;input3;input4;input5",
"-format",
"COCI"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -789,10 +789,6 @@
"value": "2227-9717", "value": "2227-9717",
"type": "electronic" "type": "electronic"
}, },
{
"value": "VALUE",
"type": "PIPPO"
},
{ {
"value": "1063-4584", "value": "1063-4584",
"type": "pu" "type": "pu"

View File

@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import org.junit.jupiter.api.BeforeEach import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
import org.apache.commons.io.IOUtils
import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary() super.setUpVocabulary()
} }
@Test
def mappingRecord(): Unit = {
val input =
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
}
} }

View File

@ -7,13 +7,10 @@ import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
class MAGMappingTest { class MAGMappingTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
def mappingTest(): Unit = { def mappingTest(): Unit = {
val spark = SparkSession val spark = SparkSession
@ -26,8 +23,6 @@ class MAGMappingTest {
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF") s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
} }
@Test @Test
def mappingMagType(): Unit = { def mappingMagType(): Unit = {

View File

@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import javax.xml.stream.XMLInputFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer import scala.collection.mutable.ListBuffer
import scala.io.Source import scala.io.Source
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source val inputFactory = XMLInputFactory.newInstance
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
.mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
} }
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testParsingPubmedXML(): Unit = { def testParsingPubmedXML(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
val parser = new PMParser(xml) val parser = new PMParser(xml)
parser.foreach(checkPMArticle) parser.foreach(checkPMArticle)
} }
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
@Test @Test
def testPubmedMapping(): Unit = { def testPubmedMapping(): Unit = {
val xml = new XMLEventReader( val inputFactory = XMLInputFactory.newInstance
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
)
val parser = new PMParser(xml) val parser = new PMParser(xml)
val results = ListBuffer[Oaf]() val results = ListBuffer[Oaf]()
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))

View File

@ -53,24 +53,10 @@
<artifactId>dhp-pace-core</artifactId> <artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>2.11.0</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
@ -79,16 +65,10 @@
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId> <artifactId>spark-graphx_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
<artifactId>dom4j</artifactId> <artifactId>dom4j</artifactId>
@ -101,10 +81,6 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>

View File

@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple3; import scala.Tuple3;
import scala.collection.JavaConversions; import scala.collection.JavaConversions;
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
Dataset<Row> pivotHistory = spark Dataset<Row> pivotHistory = spark
.createDataset( .createDataset(
Collections.emptyList(), Collections.emptyList(),
RowEncoder SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
if (StringUtils.isNotBlank(pivotHistoryDatabase)) { if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
pivotHistory = spark pivotHistory = spark

View File

@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils; import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.SparkCompatUtils;
import scala.Tuple2; import scala.Tuple2;
import scala.Tuple3; import scala.Tuple3;
@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
StructType idsSchema = StructType StructType idsSchema = StructType
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>"); .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema)); Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
for (EntityType entityType : ModelSupport.entityTypes.keySet()) { for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
String entityPath = graphBasePath + '/' + entityType.name(); String entityPath = graphBasePath + '/' + entityType.name();

View File

@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
subject.getQualifier().setClassname(vocabulary.getName()); subject.getQualifier().setClassname(vocabulary.getName());
} }
} else { } else {
final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo()) final String provenanceActionClassId = Optional
.ofNullable(subject.getDataInfo())
.map(DataInfo::getProvenanceaction) .map(DataInfo::getProvenanceaction)
.map(Qualifier::getClassid) .map(Qualifier::getClassid)
.orElse(null); .orElse(null);

View File

@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
]

View File

@ -0,0 +1,166 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
}
}

View File

@ -0,0 +1,258 @@
package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix.{
Scholix,
ScholixCollectedFrom,
ScholixEntityId,
ScholixIdentifier,
ScholixRelationship,
ScholixResource
}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
case class RelationInfo(
source: String,
target: String,
relclass: String,
id: String,
collectedfrom: Seq[RelKeyValue]
) {}
case class RelKeyValue(key: String, value: String) {}
object ScholexplorerUtils {
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
val mapper = new ObjectMapper()
case class RelationVocabulary(original: String, inverse: String) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def invRel(rel: String): String = {
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
if (semanticRelation != null)
semanticRelation.inverse
else
null
}
def generateDatasourceOpenAIREURLS(id: String): String = {
if (id != null && id.length > 12)
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
else
null
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def generateScholixResourceFromResult(result: Result): ScholixResource = {
if (result.getInstance() == null || result.getInstance().size() == 0)
return null
if (result.getPid == null || result.getPid.isEmpty)
return null
val r = new ScholixResource
r.setDnetIdentifier(result.getId)
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
if (persistentIdentifiers.isEmpty)
return null
r.setIdentifier(persistentIdentifiers.asJava)
r.setObjectType(result.getResulttype.getClassid)
r.setObjectSubType(
result
.getInstance()
.asScala
.filter(i => i != null && i.getInstancetype != null)
.map(i => i.getInstancetype.getClassname)
.distinct
.head
)
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
r.setTitle(titles.head)
else
return null
}
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
val authors: List[ScholixEntityId] =
result.getAuthor.asScala
.map(a => {
val entity = new ScholixEntityId()
entity.setName(a.getFullname)
if (a.getPid != null && a.getPid.size() > 0)
entity.setIdentifiers(
a.getPid.asScala
.map(sp => {
val id = new ScholixIdentifier()
id.setIdentifier(sp.getValue)
id.setSchema(sp.getQualifier.getClassid)
id
})
.take(3)
.toList
.asJava
)
entity
})
.toList
if (authors.nonEmpty)
r.setCreator(authors.asJava)
}
val dt: List[String] = result
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
r.setPublicationDate(dt.distinct.head)
r.setPublisher(
result
.getInstance()
.asScala
.map(i => i.getHostedby)
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
.map(h => {
val eid = new ScholixEntityId()
eid.setName(h.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(h.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
eid.setIdentifiers(List(id).asJava)
eid
})
.distinct
.asJava
)
r.setCollectedFrom(
result.getCollectedfrom.asScala
.map(cf => {
val scf = new ScholixCollectedFrom()
scf.setProvisionMode("collected")
scf.setCompletionStatus("complete")
val eid = new ScholixEntityId()
eid.setName(cf.getValue)
val id = new ScholixIdentifier()
id.setIdentifier(cf.getKey)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
eid.setIdentifiers(List(id).asJava)
scf.setProvider(eid)
scf
})
.asJava
)
r
}
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
val s: Scholix = new Scholix
s.setSource(source)
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
s.setLinkprovider(
relation.collectedfrom
.map(cf => {
val eid = new ScholixEntityId()
eid.setName(cf.value)
val id = new ScholixIdentifier()
id.setIdentifier(cf.key)
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
eid.setIdentifiers(List(id).asJava)
eid
})
.toList
.asJava
)
else {
val eid = new ScholixEntityId()
eid.setName("OpenAIRE")
val id = new ScholixIdentifier()
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
eid.setIdentifiers(List(id).asJava)
s.setLinkprovider(List(eid).asJava)
}
s.setIdentifier(relation.id)
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setPublicationDate(source.getPublicationDate)
s.setPublisher(source.getPublisher)
val mockTarget = new ScholixResource
mockTarget.setDnetIdentifier(relation.target)
s.setTarget(mockTarget)
s
}
def updateTarget(s: Scholix, t: ScholixResource): String = {
s.setTarget(t)
val spublishers: Seq[ScholixEntityId] =
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
val tpublishers: Seq[ScholixEntityId] =
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
s.setPublisher(mergedPublishers.asJava)
mapper.writeValueAsString(s)
}
}
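Taken together, the helpers above let a caller build a Scholix link from a RelationInfo plus the source and target ScholixResource objects, with updateTarget completing the link and serialising it to JSON. A hedged, abbreviated sketch; the identifiers are invented and the resources would normally come from generateScholixResourceFromResult:

// Minimal stand-ins for resources normally produced from graph Results.
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.{RelKeyValue, RelationInfo, ScholexplorerUtils}

val source = new ScholixResource
source.setDnetIdentifier("50|fake_source_id")
val target = new ScholixResource
target.setDnetIdentifier("50|fake_target_id")

val rel = RelationInfo(
  source = "50|fake_source_id",
  target = "50|fake_target_id",
  relclass = "IsSupplementedBy",
  id = "fake_rel_id",
  collectedfrom = Seq(RelKeyValue("10|fake_datasource_id", "Fake Datasource"))
)

val oneVerse = ScholexplorerUtils.generateScholix(rel, source)   // target is only a stub here
val json = ScholexplorerUtils.updateTarget(oneVerse, target)     // serialised Scholix record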

View File

@ -0,0 +1,141 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{
KeyValue,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
Dataset => OafDataset
}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
/** Here all the spark applications run this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
generateBidirectionalRelations(sourcePath, targetPath, spark)
generateScholixResource(sourcePath, targetPath, spark)
generateScholix(targetPath, spark)
}
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val entityMap: Map[String, StructType] = Map(
"publication" -> Encoders.bean(classOf[Publication]).schema,
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
"software" -> Encoders.bean(classOf[Software]).schema,
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
)
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
val resDs = spark.emptyDataset[ScholixResource]
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
println(s"adding ${item._1}")
res.union(
spark.read
.schema(item._2)
.json(s"$inputPath/${item._1}")
.as[Result]
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
.filter(s => s != null)
)
})
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
}
def generateBidirectionalRelations(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
val relSchema = Encoders.bean(classOf[Relation]).schema
val relDF = spark.read
.schema(relSchema)
.json(s"$inputPath/relation")
.where(
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
)
.select("source", "target", "collectedfrom", "relClass")
def invRel: String => String = { s =>
ScholexplorerUtils.invRel(s)
}
import org.apache.spark.sql.functions.udf
val inverseRelationUDF = udf(invRel)
val inverseRelation = relDF.select(
col("target").alias("source"),
col("source").alias("target"),
col("collectedfrom"),
inverseRelationUDF(col("relClass")).alias("relClass")
)
val bidRel = inverseRelation
.union(relDF)
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
.drop("collectedfrom")
.withColumnRenamed("cf", "collectedfrom")
.groupBy(col("id"))
.agg(
first("source").alias("source"),
first("target").alias("target"),
first("relClass").alias("relClass"),
first("collectedfrom").alias("collectedfrom")
)
bidRel.write.mode(SaveMode.Overwrite).save(s"$outputPath/relation")
}
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
import spark.implicits._
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
val scholix_one_verse = relations
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
val resourceTarget = relations
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
scholix_one_verse
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
.write
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.text(s"$outputPath/scholix")
}
}
object SparkCreateScholexplorerDump {
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
def main(args: Array[String]): Unit = {
new SparkCreateScholexplorerDump(
log = logger,
args = args,
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
).initialize().run()
}
}
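The companion object wires the job to the create_scholix_dump_params.json file added earlier in this commit, so the dump can also be launched programmatically. A hypothetical local invocation, assuming the single-dash long-name argument style used by the tests in this commit; the paths are placeholders:

// Hypothetical run: master is optional ("paramRequired": false), source and target are required.
SparkCreateScholexplorerDump.main(
  Array(
    "-master", "local[*]",
    "-sourcePath", "/tmp/openaire_graph",     // input graph dump (placeholder)
    "-targetPath", "/tmp/scholexplorer_dump"  // where resource/, relation/ and scholix/ are written
  )
)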

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest {
@Test
def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
val app = new SparkCreateScholexplorerDump(null, null, null)
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
// app.generateBidirectionalRelations(
// "/home/sandro/Downloads/scholix_sample/",
// "/home/sandro/Downloads/scholix/",
// spark
// )
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
}
}

View File

@ -18,7 +18,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>initialize</phase> <phase>process-resources</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -59,12 +59,6 @@
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
@ -160,6 +154,26 @@
<groupId>org.apache.zookeeper</groupId> <groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId> <artifactId>zookeeper</artifactId>
</exclusion> </exclusion>
<exclusion>
<artifactId>ant</artifactId>
<groupId>org.apache.ant</groupId>
</exclusion>
<exclusion>
<artifactId>antlr4-runtime</artifactId>
<groupId>org.antlr</groupId>
</exclusion>
<exclusion>
<artifactId>woodstox-core</artifactId>
<groupId>com.fasterxml.woodstox</groupId>
</exclusion>
<exclusion>
<artifactId>log4j</artifactId>
<groupId>*</groupId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency> <dependency>
@ -206,5 +220,90 @@
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-3</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/sparksolr-4</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD() .javaRDD()
.map( .map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson())); t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
} }
} }


@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.swing.text.html.Option;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.stringtemplate.v4.ST; import org.stringtemplate.v4.ST;


@ -0,0 +1,12 @@
package eu.dnetlib.dhp.sparksolr;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.rdd.RDD;
public class DHPSolrSupport {
static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
SolrSupport.indexDocs(zkhost, collection, batchSize, docs);
}
}


@ -0,0 +1,12 @@
package eu.dnetlib.dhp.sparksolr;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.rdd.RDD;
public class DHPSolrSupport {
static public void indexDocs(String zkhost, String collection, int batchSize, RDD<SolrInputDocument> docs) {
SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs);
}
}
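
Note: the two DHPSolrSupport copies above differ only in the indexDocs call, where the sparksolr-4 version passes an extra com.lucidworks.spark.BatchSizeType.NUM_DOCS argument, presumably required by the newer spark-solr API. The spark-24/spark-34/spark-35 profiles introduced earlier use build-helper-maven-plugin to add either src/main/sparksolr-3 or src/main/sparksolr-4 to the compiled sources, so only one copy ends up in the module. A minimal sketch of selecting a non-default profile at build time, assuming the usual Maven activation flag (spark-24 is marked activeByDefault above):

    # compile the sparksolr-4 sources, i.e. the variant that passes BatchSizeType
    mvn clean package -Pspark-34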


@ -16,11 +16,11 @@
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,8 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -30,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -39,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -67,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -91,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -109,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -127,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -176,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -204,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -66,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -90,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -203,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -10,11 +10,11 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId> <artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>


@ -8,6 +8,9 @@ fi
export HADOOP_USER_NAME=$2 export HADOOP_USER_NAME=$2
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -38,26 +43,25 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' function print_elapsed_time()
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' {
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' start_time=$1
end_time=$(date +%s)
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' elapsed_time=$(($end_time-$start_time))
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' hours=$((elapsed_time / 3600))
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -66,8 +70,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -90,8 +96,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -108,17 +116,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -126,12 +130,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -175,7 +184,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -203,11 +214,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }


@ -6,6 +6,8 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi fi
SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0
# Set the active HDFS node of OCEAN and IMPALA cluster. # Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1' OCEAN_HDFS_NODE='hdfs://nameservice1'
@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do
done done
if [ -z "$IMPALA_HDFS_NODE" ]; then if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 1 exit 1
fi
fi fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
@ -40,26 +44,26 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments. # Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
export HADOOP_USER_NAME=$6 export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function print_elapsed_time()
{
start_time=$1
end_time=$(date +%s)
elapsed_time=$(($end_time-$start_time))
hours=$((elapsed_time / 3600))
minutes=$(((elapsed_time % 3600) / 60))
seconds=$((elapsed_time % 60))
printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds
}
function copydb() { function copydb() {
db=$1 db=$1
start_db_time=$(date +%s)
echo -e "\nStart processing db: '${db}'..\n" echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
@ -68,8 +72,10 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 2 exit 2
fi fi
fi
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
@ -92,8 +98,10 @@ function copydb() {
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 3 exit 3
fi fi
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
@ -110,17 +118,13 @@ function copydb() {
num_tables=0 num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"`
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
@ -128,12 +132,17 @@ function copydb() {
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 4
fi
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 5
fi
fi fi
fi fi
fi fi
@ -177,7 +186,9 @@ function copydb() {
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
exit 5 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 6
fi
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
else else
@ -205,11 +216,14 @@ function copydb() {
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
exit 6 if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
exit 7
fi
fi fi
rm -f error.log rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n" echo -e "\n\nFinished processing db: ${db}\n"
print_elapsed_time start_db_time
} }
STATS_DB=$1 STATS_DB=$1


@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as
with with
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1 from lvl1
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
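
Note on the result_fos change above: the LIKE patterns select topics whose codes are 2, 4, 6 and now 8 characters long, so the added lvl4 CTE and the extra substr join simply extend the Fields of Science hierarchy one level deeper (e.g. a hypothetical '01010101 ...' topic would join under '010101 ...', which joins under '0101 ...' and '01 ...'), assuming the topic labels keep that numeric-prefix format.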


@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
.map( .map(
(MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class), (MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class),
Encoders.bean(Software.class)) Encoders.bean(Software.class))
.filter(t -> t.getCodeRepositoryUrl() != null) .filter((FilterFunction<Software>) t -> t.getCodeRepositoryUrl() != null)
.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl")); .select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
} }


@ -39,8 +39,8 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version> <cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version> <cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
</properties> </properties>
<dependencies> <dependencies>
@ -72,6 +72,12 @@
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version> <version>${cdh.hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>


@ -39,8 +39,8 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<cdh.hive.version>0.13.1-cdh5.2.1</cdh.hive.version> <cdh.hive.version>1.1.0-cdh5.16.2</cdh.hive.version>
<cdh.hadoop.version>2.5.0-cdh5.2.1</cdh.hadoop.version> <cdh.hadoop.version>2.6.0-cdh5.16.2</cdh.hadoop.version>
</properties> </properties>
<dependencies> <dependencies>
@ -67,11 +67,23 @@
<groupId>org.apache.hive</groupId> <groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId> <artifactId>hive-jdbc</artifactId>
<version>${cdh.hive.version}</version> <version>${cdh.hive.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId> <artifactId>hadoop-common</artifactId>
<version>${cdh.hadoop.version}</version> <version>${cdh.hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
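
The jdk.tools exclusions added in the two stats poms above are the usual workaround for the CDH hadoop/hive client artifacts, which pull in a system-scoped jdk.tools dependency pointing at the JDK's tools.jar; that jar was removed in Java 9+, so builds on newer JDKs cannot resolve it unless the dependency is excluded.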

pom.xml

@ -13,7 +13,8 @@
<distribution>repo</distribution> <distribution>repo</distribution>
<comments>This program is free software: you can redistribute it and/or modify it under the terms of the <comments>This program is free software: you can redistribute it and/or modify it under the terms of the
GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.</comments> License, or (at your option) any later version.
</comments>
</license> </license>
</licenses> </licenses>
@ -22,6 +23,7 @@
<module>dhp-pace-core</module> <module>dhp-pace-core</module>
<module>dhp-common</module> <module>dhp-common</module>
<module>dhp-workflows</module> <module>dhp-workflows</module>
<module>dhp-shade-package</module>
</modules> </modules>
<issueManagement> <issueManagement>
@ -47,6 +49,19 @@
</pluginRepositories> </pluginRepositories>
<repositories> <repositories>
<repository>
<id>Openaire-third-parties-snaphot</id>
<name>Openaire third parties Snapshot</name>
<url>https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository> <repository>
<id>dnet45-releases</id> <id>dnet45-releases</id>
<name>D-Net 45 releases</name> <name>D-Net 45 releases</name>
@ -125,6 +140,13 @@
</repositories> </repositories>
<dependencies> <dependencies>
<!-- Quick FIX not to remove lombok everywhere -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>
<dependency> <dependency>
<groupId>org.junit.jupiter</groupId> <groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId> <artifactId>junit-jupiter</artifactId>
@ -152,7 +174,7 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>${dhp-schemas.artifact}</artifactId> <artifactId>dhp-schemas</artifactId>
<version>${dhp-schemas.version}</version> <version>${dhp-schemas.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -206,33 +228,76 @@
<dependency> <dependency>
<groupId>org.slf4j</groupId> <groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId> <artifactId>slf4j-api</artifactId>
<version>1.7.25</version> <version>${org.slf4j.version}</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${org.slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>${org.slf4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<!-- API bridge between log4j 1 and 2 -->
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
<version>${dhp.commons.lang.version}</version> <version>${dhp.commons.lang.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-beanutils</artifactId>
<version>${commons-beanutils.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-validator</groupId> <groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId> <artifactId>commons-validator</artifactId>
<version>1.7</version> <version>${commons-validator.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.sisyphsu</groupId> <groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
<version>1.0.7</version> <version>${dateparser.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>me.xuender</groupId> <groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId> <artifactId>unidecode</artifactId>
<version>0.0.7</version> <version>${unidecode.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -245,13 +310,13 @@
<dependency> <dependency>
<groupId>commons-codec</groupId> <groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId> <artifactId>commons-codec</artifactId>
<version>1.9</version> <version>${commons-codec.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
<version>2.4</version> <version>${commons-io.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@@ -345,7 +410,7 @@
 			<dependency>
 				<groupId>org.apache.zookeeper</groupId>
 				<artifactId>zookeeper</artifactId>
-				<version>3.4.11</version>
+				<version>${zookeeper.version}</version>
 			</dependency>
 			<dependency>
@@ -415,6 +480,7 @@
 				<artifactId>cxf-rt-transports-http</artifactId>
 				<version>3.1.5</version>
 			</dependency>
 			<dependency>
 				<groupId>javax.persistence</groupId>
 				<artifactId>javax.persistence-api</artifactId>
@@ -504,16 +570,11 @@
 				<artifactId>commons-compress</artifactId>
 				<version>${common.compress.version}</version>
 			</dependency>
-			<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
 			<dependency>
 				<groupId>org.apache.commons</groupId>
 				<artifactId>commons-csv</artifactId>
 				<version>${common.csv.version}</version>
 			</dependency>
-			<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
 			<dependency>
 				<groupId>org.apache.poi</groupId>
 				<artifactId>poi-ooxml</artifactId>
@@ -568,14 +629,12 @@
 				<scope>provided</scope>
 			</dependency>
 			<dependency>
 				<groupId>org.apache.commons</groupId>
 				<artifactId>commons-math3</artifactId>
 				<version>3.6.1</version>
 			</dependency>
 			<dependency>
 				<groupId>com.google.code.gson</groupId>
 				<artifactId>gson</artifactId>
@@ -596,7 +655,7 @@
 			<dependency>
 				<groupId>org.reflections</groupId>
 				<artifactId>reflections</artifactId>
-				<version>0.9.10</version>
+				<version>${reflections.version}</version>
 			</dependency>
 			<dependency>
@@ -610,6 +669,12 @@
 				<artifactId>icu4j</artifactId>
 				<version>70.1</version>
 			</dependency>
+			<dependency>
+				<groupId>org.javassist</groupId>
+				<artifactId>javassist</artifactId>
+				<version>${javassist.version}</version>
+			</dependency>
 		</dependencies>
 	</dependencyManagement>
@@ -677,6 +742,7 @@
 				<version>3.0.0-M4</version>
 				<configuration>
 					<redirectTestOutputToFile>true</redirectTestOutputToFile>
+					<trimStackTrace>false</trimStackTrace>
 				</configuration>
 			</plugin>
 			<plugin>
@@ -746,7 +812,7 @@
 			<plugin>
 				<groupId>net.revelc.code</groupId>
 				<artifactId>impsort-maven-plugin</artifactId>
-				<version>1.4.1</version>
+				<version>1.6.2</version>
 				<configuration>
 					<groups>java.,javax.,org.,com.</groups>
 					<staticGroups>java,*</staticGroups>
@@ -767,7 +833,9 @@
 				<groupId>org.antipathy</groupId>
 				<artifactId>mvn-scalafmt_${scala.binary.version}</artifactId>
 				<configuration>
-					<configLocation>https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
+					<configLocation>
+						https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf
+					</configLocation>
 					<skipTestSources>false</skipTestSources>
 					<skipSources>false</skipSources>
 					<sourceDirectories>
@@ -798,7 +866,7 @@
 			<plugin>
 				<groupId>org.jacoco</groupId>
 				<artifactId>jacoco-maven-plugin</artifactId>
-				<version>0.7.9</version>
+				<version>0.8.10</version>
 				<configuration>
 					<excludes>
 						<exclude>**/schemas/*</exclude>
@@ -866,90 +934,174 @@
 		<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
 		<maven.compiler.source>1.8</maven.compiler.source>
 		<maven.compiler.target>1.8</maven.compiler.target>
-		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
-		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
-		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
-		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
-		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
-		<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
-		<sparksolr.version>3.6.0</sparksolr.version>
-		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
-		<dhp.jackson.version>2.9.6</dhp.jackson.version>
-		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
-		<dhp.site.skip>true</dhp.site.skip>
-		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<!-- scala version -->
 		<scala.version>2.11.12</scala.version>
 		<scala.binary.version>2.11</scala.binary.version>
-		<scala-xml.version>1.3.0</scala-xml.version>
-		<junit-jupiter.version>5.6.1</junit-jupiter.version>
-		<mockito-core.version>3.3.3</mockito-core.version>
-		<mongodb.driver.version>3.4.2</mongodb.driver.version>
-		<vtd.version>[2.12,3.0)</vtd.version>
+		<!-- plugin versions -->
+		<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
+		<maven.failsave.plugin.version>2.22.2</maven.failsave.plugin.version>
+		<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
+		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
+		<!-- dependency versions -->
+		<apache.poi.version>4.1.2</apache.poi.version>
+		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
+		<common.compress.version>1.20</common.compress.version>
+		<common.csv.version>1.8</common.csv.version>
+		<common.text.version>1.8</common.text.version>
+		<commons-beanutils.version>1.9.4</commons-beanutils.version>
+		<commons-codec.version>1.9</commons-codec.version>
+		<commons.collections.version>3.2.1</commons.collections.version>
+		<commons-io.version>2.4</commons-io.version>
+		<commons.logging.version>1.1.3</commons.logging.version>
+		<commons-validator.version>1.7</commons-validator.version>
+		<dateparser.version>1.0.7</dateparser.version>
 		<dhp-schemas.version>[6.1.2]</dhp-schemas.version>
+		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
+		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
+		<dhp.guava.version>11.0.2</dhp.guava.version>
+		<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
+		<dhp.jackson.version>2.9.6</dhp.jackson.version>
+		<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
+		<dhp.site.skip>true</dhp.site.skip>
+		<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
-		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
-		<solr.version>7.5.0</solr.version>
-		<okhttp.version>4.7.2</okhttp.version>
-		<common.compress.version>1.20</common.compress.version>
+		<google.gson.version>2.2.2</google.gson.version>
+		<log4j.version>1.2.17</log4j.version>
+		<javassist.version>3.19.0-GA</javassist.version>
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
-		<common.csv.version>1.8</common.csv.version>
-		<apache.poi.version>4.1.2</apache.poi.version>
-		<common.text.version>1.8</common.text.version>
+		<junit-jupiter.version>5.6.1</junit-jupiter.version>
+		<mockito-core.version>3.3.3</mockito-core.version>
+		<mongodb.driver.version>3.4.2</mongodb.driver.version>
+		<okhttp.version>4.7.2</okhttp.version>
 		<org.apache.httpcomponents.version>4.5.3</org.apache.httpcomponents.version>
-		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
-		<google.gson.version>2.2.2</google.gson.version>
-		<commons.logging.version>1.1.3</commons.logging.version>
-		<commons.collections.version>3.2.1</commons.collections.version>
+		<org.slf4j.version>1.7.25</org.slf4j.version>
+		<reflections.version>0.9.10</reflections.version>
+		<scala-xml.version>1.3.0</scala-xml.version>
+		<solr.version>7.5.0</solr.version>
+		<sparksolr.version>3.6.0</sparksolr.version>
+		<unidecode.version>0.0.7</unidecode.version>
+		<vtd.version>[2.12,3.0)</vtd.version>
+		<zookeeper.version>3.4.6</zookeeper.version>
 	</properties>
 	<!-- Build with scala 12 and Spark 3.4 -->
 	<profiles>
 		<profile>
-			<id>scala-2.12</id>
+			<id>spark-34</id>
 			<properties>
 				<scala.binary.version>2.12</scala.binary.version>
 				<scala.version>2.12.18</scala.version>
-				<!-- scala-xml.version>2.1.0</scala-xml.version -->
-				<sparksolr.version>4.0.2</sparksolr.version>
-				<dhp.spark.version>3.4.1</dhp.spark.version>
-				<dhp.jackson.version>2.14.2</dhp.jackson.version>
-				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
-				<json4s.version>3.7.0-M11</json4s.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+				<!-- plugin versions -->
 				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
-				<!--
-				<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
-				<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
-				-->
+				<!-- dependencies -->
+				<common.compress.version>1.22</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.15</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.11.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.4.2.openaire</dhp.spark.version>
+				<dhp.jackson.version>2.14.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.19.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.6</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
 			</properties>
 		</profile>
-		<!-- Activate ARM-compatible snappy dependency on new Silicon Macs -->
 		<profile>
-			<id>arm-silicon-mac</id>
+			<id>spark-35</id>
+			<properties>
+				<scala.binary.version>2.12</scala.binary.version>
+				<scala.version>2.12.18</scala.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+				<!-- plugin versions -->
+				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
+				<!-- dependencies -->
+				<common.compress.version>1.23.0</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.16.0</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.13.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.5.1.openaire-SNAPSHOT</dhp.spark.version>
+				<dhp.jackson.version>2.15.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.20.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.7</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
+			</properties>
+		</profile>
+		<profile>
+			<id>java11</id>
 			<activation>
-				<os>
-					<arch>aarch64</arch>
-					<family>mac</family>
-				</os>
+				<jdk>[11</jdk>
 			</activation>
-			<dependencyManagement>
-				<dependencies>
-					<dependency>
-						<groupId>org.xerial.snappy</groupId>
-						<artifactId>snappy-java</artifactId>
-						<version>1.1.8.4</version>
-					</dependency>
-				</dependencies>
-			</dependencyManagement>
+			<build>
+				<pluginManagement>
+					<plugins>
+						<plugin>
+							<groupId>org.apache.maven.plugins</groupId>
+							<artifactId>maven-surefire-plugin</artifactId>
+							<version>3.0.0-M4</version>
+							<configuration>
+								<!-- only for java 11+ to run spark in tests -->
+								<argLine>--add-opens=java.base/java.lang=ALL-UNNAMED
+									--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
+									--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
+									--add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED
+									--add-opens=java.base/java.nio=ALL-UNNAMED
+									--add-opens=java.base/java.util=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
+									--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
+									--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
+									--add-opens=java.base/sun.security.action=ALL-UNNAMED
+									--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+								</argLine>
+								<redirectTestOutputToFile>true</redirectTestOutputToFile>
+								<trimStackTrace>false</trimStackTrace>
+							</configuration>
+						</plugin>
+					</plugins>
+				</pluginManagement>
+			</build>
 		</profile>
 	</profiles>
 </project>
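Editorial note on the java11 profile above: the long surefire argLine exists because Spark (and some test utilities) reach into JDK internals reflectively, and the Java module system blocks that access unless the relevant java.base packages are opened to the unnamed module. A rough, hypothetical illustration of the mechanism — not code from this repository; on JDK 16+ the call below fails without the matching flag, while JDK 9–15 only print an illegal-access warning by default:

import java.lang.reflect.Field;
import java.nio.ByteBuffer;

// Hypothetical demo: reflective access into java.nio internals of the kind Spark
// performs. On JDK 16+ setAccessible(true) throws
// java.lang.reflect.InaccessibleObjectException unless the JVM is started with
// --add-opens=java.base/java.nio=ALL-UNNAMED (one of the flags in the argLine above).
public class AddOpensDemo {

	public static void main(String[] args) throws Exception {
		ByteBuffer direct = ByteBuffer.allocateDirect(16);
		Field cleaner = direct.getClass().getDeclaredField("cleaner");
		cleaner.setAccessible(true); // blocked without the matching --add-opens
		System.out.println("cleaner = " + cleaner.get(direct));
	}
}

Passing the same --add-opens flags through the surefire argLine applies them to the forked test JVMs, which is what lets the Spark-based tests run on Java 11 and later.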