Add profiles for different Spark versions: spark-24, spark-34, spark-35

Giambattista Bloisi 2023-09-21 14:23:37 +02:00 committed by Giambattista Bloisi
parent 52495f2cd2
commit 613ec5ffce
12 changed files with 245 additions and 32 deletions


@@ -38,7 +38,7 @@ public class PacePerson {
 				PacePerson.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/common/name_particles.txt")));
-		} catch (IOException e) {
+		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
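Note on the hunk above: widening the catch from IOException to Exception presumably lets the static initializer also wrap unchecked failures, for instance the NullPointerException raised when getResourceAsStream returns null because name_particles.txt is missing, in the same RuntimeException instead of letting them escape raw.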


@@ -24,7 +24,7 @@
 				<executions>
 					<execution>
 						<id>scala-compile-first</id>
-						<phase>initialize</phase>
+						<phase>process-resources</phase>
 						<goals>
 							<goal>add-source</goal>
 							<goal>compile</goal>
@@ -95,4 +95,90 @@
 		</dependency>
 	</dependencies>
+
+	<profiles>
+		<profile>
+			<id>spark-24</id>
+			<activation>
+				<activeByDefault>true</activeByDefault>
+			</activation>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-34</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-2</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+
+		<profile>
+			<id>spark-35</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/spark-35</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
+	</profiles>
 </project>
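These profiles use build-helper-maven-plugin to attach an extra, version-specific source root to the module; spark-24 stays active by default. Note that spark-34 still compiles src/main/spark-2 (the encoder API that root relies on is unchanged through Spark 3.4), while spark-35 swaps in src/main/spark-35. The mechanism amounts to compile-time dispatch: the same symbol is defined once per source root, and exactly one definition ends up on the compile path. A minimal sketch of the pattern, with hypothetical names:

    // Stands in for src/main/spark-2/Compat.scala; a sibling file
    // src/main/spark-35/Compat.scala would define the same object with the
    // Spark 3.5 implementation. Maven compiles exactly one of the two,
    // depending on the active profile.
    object Compat {
      def apiLabel: String = "Spark 2.x / 3.4 code path"
    }

    // Shared sources under src/main/scala compile against the single
    // Compat symbol regardless of which root provided it.
    object Caller {
      def main(args: Array[String]): Unit =
        println(Compat.apiLabel)
    }

Selecting a non-default variant is then a matter of activating the profile, e.g. mvn package -Pspark-35.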


@@ -2,11 +2,10 @@ package eu.dnetlib.pace.model

 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
+import org.apache.spark.sql.{Dataset, Row}
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
-import org.apache.spark.sql.{Dataset, Row}

 import java.util.regex.Pattern
 import scala.collection.JavaConverters._
@@ -48,8 +47,8 @@ case class SparkModel(conf: DedupConfig) {
 	val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

 	val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-		df.map(r => rowFromJson(r))(RowEncoder(schema))
+		df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
 	}

 	def rowFromJson(json: String): Row = {
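The encoder is the only version-sensitive piece of parseJsonDataset: RowEncoder.apply(StructType) exists through Spark 3.4 but is gone in 3.5, where the ExpressionEncoder is built from the schema directly, so the call is routed through the new SparkCompatUtils shim (added below). A sketch of the resulting call pattern, assuming a local SparkSession and this module on the classpath (the mapping body here is illustrative, not SparkModel's real JSON parsing):

    import org.apache.spark.sql.{Dataset, Row, SparkSession}
    import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
    import eu.dnetlib.pace.util.SparkCompatUtils

    object EncoderDemo {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").getOrCreate()
        import spark.implicits._

        val schema = StructType(Seq(StructField("identifier", DataTypes.StringType)))
        val json: Dataset[String] = Seq("""{"identifier":"id-1"}""").toDS()

        // encoderFor hides the RowEncoder-vs-ExpressionEncoder difference;
        // the mapping logic itself is identical on every Spark line.
        val rows: Dataset[Row] =
          json.map(_ => Row("id-1"))(SparkCompatUtils.encoderFor(schema))
        rows.show()

        spark.stop()
      }
    }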


@@ -18,7 +18,6 @@ package eu.dnetlib.pace.util;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 /*
  * Diff Match and Patch
  * Copyright 2018 The diff-match-patch Authors.


@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    RowEncoder(schema)
+  }
+}


@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    ExpressionEncoder(schema)
+  }
+}
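The two hunks above are two new files declaring the same package and object name: the RowEncoder variant presumably lives under src/main/spark-2 and the ExpressionEncoder variant under src/main/spark-35, matching the source roots the profiles add (the file paths are not shown in this view). Because only one root is ever compiled there is no duplicate-symbol clash, and keeping the two encoderFor signatures identical is what lets shared code such as SparkModel compile unchanged against either Spark line.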


@@ -155,7 +155,8 @@ object SparkCreateBaselineDataFrame {
       IOUtils.toString(
         SparkEBILinksToOaf.getClass.getResourceAsStream(
           "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-        ),Charset.defaultCharset()
+        ),
+        Charset.defaultCharset()
       )
     )
     parser.parseArgument(args)
@@ -198,7 +199,7 @@
     val ds: Dataset[PMArticle] = spark.createDataset(
       k.filter(i => i._1.endsWith(".gz"))
         .flatMap(i => {
-          val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
+          val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
           new PMParser(xml)
         })
     )


@@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;


@@ -119,7 +119,9 @@ public class ReadCOCITest {
 					workingDir.toString() + "/COCI",
 					"-outputPath",
 					workingDir.toString() + "/COCI_json/",
-					"-inputFile", "input1;input2;input3;input4;input5"
+					"-inputFile", "input1;input2;input3;input4;input5",
+					"-format",
+					"COCI"
 				});

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());


@@ -162,6 +162,18 @@
 					<artifactId>antlr4-runtime</artifactId>
 					<groupId>org.antlr</groupId>
 				</exclusion>
+				<exclusion>
+					<artifactId>woodstox-core</artifactId>
+					<groupId>com.fasterxml.woodstox</groupId>
+				</exclusion>
+				<exclusion>
+					<artifactId>log4j</artifactId>
+					<groupId>*</groupId>
+				</exclusion>
+				<exclusion>
+					<groupId>org.apache.logging.log4j</groupId>
+					<artifactId>*</artifactId>
+				</exclusion>
 			</exclusions>
 		</dependency>
 		<dependency>
@@ -210,7 +222,7 @@
 	<profiles>
 		<profile>
-			<id>scala-2.11</id>
+			<id>spark-24</id>
 			<activation>
 				<activeByDefault>true</activeByDefault>
 			</activation>
@@ -240,7 +252,7 @@
 		</profile>
 		<profile>
-			<id>scala-2.12</id>
+			<id>spark-34</id>
 			<build>
 				<plugins>
@@ -266,6 +278,32 @@
 			</build>
 		</profile>
+		<profile>
+			<id>spark-35</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.codehaus.mojo</groupId>
+						<artifactId>build-helper-maven-plugin</artifactId>
+						<version>3.4.0</version>
+						<executions>
+							<execution>
+								<phase>generate-sources</phase>
+								<goals>
+									<goal>add-source</goal>
+								</goals>
+								<configuration>
+									<sources>
+										<source>src/main/sparksolr-4</source>
+									</sources>
+								</configuration>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
 	</profiles>
 </project>
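The profile renames here follow the same scheme as the rest of the commit: IDs now name the Spark line they target (spark-24, spark-34) rather than the Scala binary version they pin, with spark-24 still active by default. The new spark-35 profile adds src/main/sparksolr-4, consistent with the sparksolr.version 4.0.4 that both the spark-34 and spark-35 property sets select in the parent pom below.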


@@ -17,6 +17,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
@@ -117,7 +118,7 @@ public class PrepareSWHActionsets {
 			.map(
 				(MapFunction<String, Software>) t -> OBJECT_MAPPER.readValue(t, Software.class),
 				Encoders.bean(Software.class))
-			.filter(t -> t.getCodeRepositoryUrl() != null)
+			.filter((FilterFunction<Software>) t -> t.getCodeRepositoryUrl() != null)
 			.select(col("id"), col("codeRepositoryUrl.value").as("repoUrl"));
 	}
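The explicit cast is needed because, from Java against a Scala 2.12 build of Spark, a bare lambda matches two filter overloads: scala.Function1 is itself a functional interface under 2.12, so both filter(Function1) and filter(FilterFunction) are applicable and the compiler reports an ambiguity. Casting to FilterFunction<Software> pins the overload; the MapFunction cast a few lines up exists for the same reason.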

pom.xml

@@ -174,7 +174,7 @@
 	<dependencies>
 		<dependency>
 			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>${dhp-schemas.artifact}</artifactId>
+			<artifactId>dhp-schemas</artifactId>
 			<version>${dhp-schemas.version}</version>
 		</dependency>
 		<dependency>
@@ -233,6 +233,13 @@
 			<scope>provided</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.slf4j</groupId>
+			<artifactId>slf4j-log4j12</artifactId>
+			<version>${org.slf4j.version}</version>
+			<scope>provided</scope>
+		</dependency>
 		<dependency>
 			<groupId>org.slf4j</groupId>
 			<artifactId>jcl-over-slf4j</artifactId>
@@ -240,6 +247,28 @@
 			<scope>provided</scope>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-slf4j2-impl</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-api</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-core</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+		<dependency>
+			<!-- API bridge between log4j 1 and 2 -->
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-1.2-api</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-lang3</artifactId>
@@ -381,7 +410,7 @@
 		<dependency>
 			<groupId>org.apache.zookeeper</groupId>
 			<artifactId>zookeeper</artifactId>
-			<version>3.4.11</version>
+			<version>${zookeeper.version}</version>
 		</dependency>
 		<dependency>
@@ -713,6 +742,7 @@
 				<version>3.0.0-M4</version>
 				<configuration>
 					<redirectTestOutputToFile>true</redirectTestOutputToFile>
+					<trimStackTrace>false</trimStackTrace>
 				</configuration>
 			</plugin>
 			<plugin>
@@ -782,7 +812,7 @@
 			<plugin>
 				<groupId>net.revelc.code</groupId>
 				<artifactId>impsort-maven-plugin</artifactId>
-				<version>1.4.1</version>
+				<version>1.6.2</version>
 				<configuration>
 					<groups>java.,javax.,org.,com.</groups>
 					<staticGroups>java,*</staticGroups>
@@ -918,8 +948,6 @@
 		<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
 		<!-- dependency versions -->
-		<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
 		<apache.poi.version>4.1.2</apache.poi.version>
 		<cnr-rmi-api.version>[2.6.1]</cnr-rmi-api.version>
 		<common.compress.version>1.20</common.compress.version>
@@ -932,7 +960,7 @@
 		<commons.logging.version>1.1.3</commons.logging.version>
 		<commons-validator.version>1.7</commons-validator.version>
 		<dateparser.version>1.0.7</dateparser.version>
-		<dhp-schemas.version>[3.17.1]</dhp-schemas.version>
+		<dhp-schemas.version>4.17.2</dhp-schemas.version>
 		<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
 		<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
 		<dhp.guava.version>11.0.2</dhp.guava.version>
@@ -945,6 +973,7 @@
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
 		<google.gson.version>2.2.2</google.gson.version>
+		<log4j.version>1.2.17</log4j.version>
 		<javassist.version>3.19.0-GA</javassist.version>
 		<json4s.version>3.5.3</json4s.version>
 		<jsonschemagenerator.version>4.13.0</jsonschemagenerator.version>
@@ -960,12 +989,13 @@
 		<sparksolr.version>3.6.0</sparksolr.version>
 		<unidecode.version>0.0.7</unidecode.version>
 		<vtd.version>[2.12,3.0)</vtd.version>
+		<zookeeper.version>3.4.6</zookeeper.version>
 	</properties>

 	<!-- Build with scala 12 and Spark 3.4 -->
 	<profiles>
 		<profile>
-			<id>scala-2.12</id>
+			<id>spark-34</id>
 			<properties>
 				<scala.binary.version>2.12</scala.binary.version>
 				<scala.version>2.12.18</scala.version>
@@ -988,25 +1018,60 @@
 				<dhp.guava.version>14.0.1</dhp.guava.version>
 				<solr.version>8.11.0</solr.version>
 				<sparksolr.version>4.0.4</sparksolr.version>
-				<dhp.spark.version>3.4.2.openaire-SNAPSHOT</dhp.spark.version>
+				<dhp.spark.version>3.4.2.openaire</dhp.spark.version>
 				<dhp.jackson.version>2.14.2</dhp.jackson.version>
 				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.19.0</log4j.version>
 				<json4s.version>3.7.0-M11</json4s.version>
 				<javassist.version>3.25.0-GA</javassist.version>
 				<okhttp.version>4.10.0</okhttp.version>
 				<org.slf4j.version>2.0.6</org.slf4j.version>
 				<reflections.version>0.10.2</reflections.version>
-				<!--
-				<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
-				<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
-				-->
+				<zookeeper.version>3.6.3</zookeeper.version>
 			</properties>
 		</profile>

 		<profile>
-			<id>java17</id>
+			<id>spark-35</id>
+			<properties>
+				<scala.binary.version>2.12</scala.binary.version>
+				<scala.version>2.12.18</scala.version>
+				<scala-xml.version>1.3.0</scala-xml.version>
+				<!-- plugin versions -->
+				<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
+				<!-- dependencies -->
+				<common.compress.version>1.23.0</common.compress.version>
+				<common.csv.version>1.8</common.csv.version>
+				<common.text.version>1.10.0</common.text.version>
+				<commons-beanutils.version>1.9.4</commons-beanutils.version>
+				<commons-codec.version>1.16.0</commons-codec.version>
+				<commons.collections.version>3.2.2</commons.collections.version>
+				<commons-io.version>2.13.0</commons-io.version>
+				<commons.logging.version>1.1.3</commons.logging.version>
+				<commons-validator.version>1.7</commons-validator.version>
+				<dhp.guava.version>14.0.1</dhp.guava.version>
+				<solr.version>8.11.0</solr.version>
+				<sparksolr.version>4.0.4</sparksolr.version>
+				<dhp.spark.version>3.5.1.openaire-SNAPSHOT</dhp.spark.version>
+				<dhp.jackson.version>2.15.2</dhp.jackson.version>
+				<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
+				<log4j.version>2.20.0</log4j.version>
+				<json4s.version>3.7.0-M11</json4s.version>
+				<javassist.version>3.25.0-GA</javassist.version>
+				<okhttp.version>4.10.0</okhttp.version>
+				<org.slf4j.version>2.0.7</org.slf4j.version>
+				<reflections.version>0.10.2</reflections.version>
+				<zookeeper.version>3.6.3</zookeeper.version>
+			</properties>
+		</profile>
+
+		<profile>
+			<id>java11</id>
 			<activation>
-				<jdk>17</jdk>
+				<jdk>[11</jdk>
 			</activation>
 			<build>
 				<pluginManagement>
@@ -1031,6 +1096,7 @@
 						--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
 					</argLine>
 					<redirectTestOutputToFile>true</redirectTestOutputToFile>
+					<trimStackTrace>false</trimStackTrace>
 				</configuration>
 			</plugin>
 		</plugins>
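Taken together, the logging changes make the log4j stack profile-dependent: the default spark-24 build keeps log4j 1.2.17 with slf4j-log4j12, while spark-34 and spark-35 override log4j.version to Log4j 2 (2.19.0 and 2.20.0 respectively) with slf4j 2.x, bridged by log4j-slf4j2-impl and log4j-1.2-api; the wildcard log4j exclusions added in the module pom earlier in this commit keep conflicting log4j jars from arriving transitively.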