forked from D-Net/dnet-hadoop
Add profiles for different spark versions: spark-24, spark-34, spark-35
This commit is contained in:
parent
52495f2cd2
commit
613ec5ffce
|
@ -38,7 +38,7 @@ public class PacePerson {
|
|||
PacePerson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
|
@ -95,4 +95,90 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>spark-24</id>
|
||||
<activation>
|
||||
<activeByDefault>true</activeByDefault>
|
||||
</activation>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-34</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-35</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-35</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -2,11 +2,10 @@ package eu.dnetlib.pace.model
|
|||
|
||||
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
|
||||
import java.util.regex.Pattern
|
||||
import scala.collection.JavaConverters._
|
||||
|
@ -49,7 +48,7 @@ case class SparkModel(conf: DedupConfig) {
|
|||
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
||||
|
||||
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
||||
df.map(r => rowFromJson(r))(RowEncoder(schema))
|
||||
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||
}
|
||||
|
||||
def rowFromJson(json: String): Row = {
|
||||
|
|
|
@ -18,7 +18,6 @@ package eu.dnetlib.pace.util;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Diff Match and Patch
|
||||
* Copyright 2018 The diff-match-patch Authors.
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
/** Helper that hides how a [[Row]] encoder is obtained from a schema.
  *
  * This variant builds the encoder through `RowEncoder(schema)`.
  * NOTE(review): presumably the copy compiled from the `src/main/spark-2`
  * source root — confirm against the module layout.
  */
object SparkCompatUtils {

  /** Returns the encoder for rows that conform to `schema`.
    *
    * @param schema structure of the rows to be encoded
    * @return the `ExpressionEncoder[Row]` produced by `RowEncoder(schema)`
    */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    RowEncoder(schema)
}
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
/** Helper that hides how a [[Row]] encoder is obtained from a schema.
  *
  * This variant builds the encoder through `ExpressionEncoder(schema)`.
  * NOTE(review): presumably the copy compiled from the `src/main/spark-35`
  * source root — confirm against the module layout.
  */
object SparkCompatUtils {

  /** Returns the encoder for rows that conform to `schema`.
    *
    * @param schema structure of the rows to be encoded
    * @return the `ExpressionEncoder[Row]` produced by `ExpressionEncoder(schema)`
    */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    ExpressionEncoder(schema)
}
|
|
@ -155,7 +155,8 @@ object SparkCreateBaselineDataFrame {
|
|||
IOUtils.toString(
|
||||
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||
),Charset.defaultCharset()
|
||||
),
|
||||
Charset.defaultCharset()
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
|
@ -198,7 +199,7 @@ object SparkCreateBaselineDataFrame {
|
|||
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||
k.filter(i => i._1.endsWith(".gz"))
|
||||
.flatMap(i => {
|
||||
val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
|
||||
val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
})
|
||||
)
|
||||
|
|
|
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
|
|
@ -119,7 +119,9 @@ public class ReadCOCITest {
|
|||
workingDir.toString() + "/COCI",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/COCI_json/",
|
||||
"-inputFile", "input1;input2;input3;input4;input5"
|
||||
"-inputFile", "input1;input2;input3;input4;input5",
|
||||
"-format",
|
||||
"COCI"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
|
|
@ -162,6 +162,18 @@
|
|||
<artifactId>antlr4-runtime</artifactId>
|
||||
<groupId>org.antlr</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>woodstox-core</artifactId>
|
||||
<groupId>com.fasterxml.woodstox</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>log4j</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -210,7 +222,7 @@
|
|||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>scala-2.11</id>
|
||||
<id>spark-24</id>
|
||||
<activation>
|
||||
<activeByDefault>true</activeByDefault>
|
||||
</activation>
|
||||
|
@ -240,7 +252,7 @@
|
|||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>scala-2.12</id>
|
||||
<id>spark-34</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
|
@ -266,6 +278,32 @@
|
|||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>spark-35</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/sparksolr-4</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
||||
</project>
|
|
@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
|
|||
.map(
|
||||
(MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.filter(t -> t.getCodeRepositoryUrl() != null)
|
||||
.filter((FilterFunction<Software>) t -> t.getCodeRepositoryUrl() != null)
|
||||
.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
|
||||
}
|
||||
|
||||
|
|
94
pom.xml
94
pom.xml
|
@ -174,7 +174,7 @@
|
|||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>${dhp-schemas.artifact}</artifactId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${dhp-schemas.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
@ -233,6 +233,13 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<version>${org.slf4j.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>jcl-over-slf4j</artifactId>
|
||||
|
@ -240,6 +247,28 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-api</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<!-- API bridge between log4j 1 and 2 -->
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-1.2-api</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -381,7 +410,7 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
<version>3.4.11</version>
|
||||
<version>${zookeeper.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -713,6 +742,7 @@
|
|||
<version>3.0.0-M4</version>
|
||||
<configuration>
|
||||
<redirectTestOutputToFile>true</redirectTestOutputToFile>
|
||||
<trimStackTrace>false</trimStackTrace>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
|
@ -782,7 +812,7 @@
|
|||
<plugin>
|
||||
<groupId>net.revelc.code</groupId>
|
||||
<artifactId>impsort-maven-plugin</artifactId>
|
||||
<version>1.4.1</version>
|
||||
<version>1.6.2</version>
|
||||
<configuration>
|
||||
<groups>java.,javax.,org.,com.</groups>
|
||||
<staticGroups>java,*</staticGroups>
|
||||
|
@ -918,8 +948,6 @@
|
|||
<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
|
||||
|
||||
<!-- dependency versions -->
|
||||
<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
|
||||
|
||||
<apache.poi.version>4.1.2</apache.poi.version>
|
||||
<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
|
||||
<common.compress.version>1.20</common.compress.version>
|
||||
|
@ -932,7 +960,7 @@
|
|||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
<commons-validator.version>1.7</commons-validator.version>
|
||||
<dateparser.version>1.0.7</dateparser.version>
|
||||
<dhp-schemas.version>[3.17.1]</dhp-schemas.version>
|
||||
<dhp-schemas.version>4.17.2</dhp-schemas.version>
|
||||
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<dhp.guava.version>11.0.2</dhp.guava.version>
|
||||
|
@ -945,6 +973,7 @@
|
|||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
<google.gson.version>2.2.2</google.gson.version>
|
||||
<log4j.version>1.2.17</log4j.version>
|
||||
<javassist.version>3.19.0-GA</javassist.version>
|
||||
<json4s.version>3.5.3</json4s.version>
|
||||
<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
|
||||
|
@ -960,12 +989,13 @@
|
|||
<sparksolr.version>3.6.0</sparksolr.version>
|
||||
<unidecode.version>0.0.7</unidecode.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<zookeeper.version>3.4.6</zookeeper.version>
|
||||
</properties>
|
||||
|
||||
<!-- Profiles for building with Scala 2.12 and Spark 3.4 (spark-34) or Spark 3.5 (spark-35) -->
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>scala-2.12</id>
|
||||
<id>spark-34</id>
|
||||
<properties>
|
||||
<scala.binary.version>2.12</scala.binary.version>
|
||||
<scala.version>2.12.18</scala.version>
|
||||
|
@ -988,25 +1018,60 @@
|
|||
<dhp.guava.version>14.0.1</dhp.guava.version>
|
||||
<solr.version>8.11.0</solr.version>
|
||||
<sparksolr.version>4.0.4</sparksolr.version>
|
||||
<dhp.spark.version>3.4.2.openaire-SNAPSHOT</dhp.spark.version>
|
||||
<dhp.spark.version>3.4.2.openaire</dhp.spark.version>
|
||||
<dhp.jackson.version>2.14.2</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
|
||||
<log4j.version>2.19.0</log4j.version>
|
||||
<json4s.version>3.7.0-M11</json4s.version>
|
||||
<javassist.version>3.25.0-GA</javassist.version>
|
||||
<okhttp.version>4.10.0</okhttp.version>
|
||||
<org.slf4j.version>2.0.6</org.slf4j.version>
|
||||
<reflections.version>0.10.2</reflections.version>
|
||||
<!--
|
||||
<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
|
||||
<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
|
||||
-->
|
||||
<zookeeper.version>3.6.3</zookeeper.version>
|
||||
</properties>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>java17</id>
|
||||
<id>spark-35</id>
|
||||
<properties>
|
||||
<scala.binary.version>2.12</scala.binary.version>
|
||||
<scala.version>2.12.18</scala.version>
|
||||
<scala-xml.version>1.3.0</scala-xml.version>
|
||||
|
||||
<!-- plugin versions -->
|
||||
<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
|
||||
|
||||
<!-- dependencies -->
|
||||
<common.compress.version>1.23.0</common.compress.version>
|
||||
<common.csv.version>1.8</common.csv.version>
|
||||
<common.text.version>1.10.0</common.text.version>
|
||||
<commons-beanutils.version>1.9.4</commons-beanutils.version>
|
||||
<commons-codec.version>1.16.0</commons-codec.version>
|
||||
<commons.collections.version>3.2.2</commons.collections.version>
|
||||
<commons-io.version>2.13.0</commons-io.version>
|
||||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
<commons-validator.version>1.7</commons-validator.version>
|
||||
|
||||
<dhp.guava.version>14.0.1</dhp.guava.version>
|
||||
<solr.version>8.11.0</solr.version>
|
||||
<sparksolr.version>4.0.4</sparksolr.version>
|
||||
<dhp.spark.version>3.5.1.openaire-SNAPSHOT</dhp.spark.version>
|
||||
<dhp.jackson.version>2.15.2</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
|
||||
<log4j.version>2.20.0</log4j.version>
|
||||
<json4s.version>3.7.0-M11</json4s.version>
|
||||
<javassist.version>3.25.0-GA</javassist.version>
|
||||
<okhttp.version>4.10.0</okhttp.version>
|
||||
<org.slf4j.version>2.0.7</org.slf4j.version>
|
||||
<reflections.version>0.10.2</reflections.version>
|
||||
<zookeeper.version>3.6.3</zookeeper.version>
|
||||
</properties>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>java11</id>
|
||||
<activation>
|
||||
<jdk>17</jdk>
|
||||
<jdk>[11</jdk>
|
||||
</activation>
|
||||
<build>
|
||||
<pluginManagement>
|
||||
|
@ -1031,6 +1096,7 @@
|
|||
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
|
||||
</argLine>
|
||||
<redirectTestOutputToFile>true</redirectTestOutputToFile>
|
||||
<trimStackTrace>false</trimStackTrace>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
|
Loading…
Reference in New Issue