Refactor Dedup using Spark Dataframe API, initial support for scala 2.12 and Spark 3.4 #324

Merged
claudio.atzori merged 2 commits from dedup-with-dataframe-2 into beta 2023-07-25 10:17:18 +02:00
17 changed files with 133 additions and 59 deletions
Showing only changes of commit bb5b845e3c - Show all commits

View File

@ -52,6 +52,8 @@
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -81,11 +83,11 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
@ -159,7 +161,7 @@
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<artifactId>${dhp-schemas.artifact}</artifactId>
</dependency>
<dependency>

View File

@ -20,7 +20,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -39,8 +39,9 @@
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
<addScalacArgs>-target:jvm-1.8</addScalacArgs>
</configuration>
</plugin>
</plugins>
@ -68,7 +69,6 @@
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
@ -89,17 +89,22 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,6 +1,24 @@
package eu.dnetlib.pace.util;
/*
* Diff Match and Patch
* Copyright 2018 The diff-match-patch Authors.
* https://github.com/google/diff-match-patch
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Diff Match and Patch
* Copyright 2018 The diff-match-patch Authors.

View File

@ -11,12 +11,12 @@
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>

View File

@ -38,6 +38,8 @@
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -54,11 +56,11 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
@ -75,6 +77,11 @@
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-xml_${scala.binary.version}</artifactId>
<version>${scala-xml.version}</version>
</dependency>
<dependency>
<groupId>xml-apis</groupId>

View File

@ -16,11 +16,11 @@
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
</dependencies>

View File

@ -18,11 +18,11 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>

View File

@ -13,7 +13,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -32,6 +32,8 @@
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -53,30 +55,35 @@
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-java8-compat_2.11</artifactId>
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_2.11</artifactId>
<version>2.8.0</version>
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
</dependency>
<dependency>

View File

@ -33,6 +33,8 @@
</execution>
</executions>
<configuration>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -70,12 +72,12 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>

View File

@ -12,11 +12,11 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
@ -27,7 +27,7 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>

View File

@ -14,7 +14,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -37,6 +37,8 @@
<arg>-Xmax-classfile-name</arg>
<arg>200</arg>
</args>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -64,15 +66,15 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<scope>test</scope>
<exclusions>
<exclusion>
@ -125,7 +127,7 @@
</dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
</dependency>

View File

@ -14,7 +14,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -37,6 +37,8 @@
<arg>-Xmax-classfile-name</arg>
<arg>200</arg>
</args>
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
@ -48,11 +50,11 @@
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>

View File

@ -10,11 +10,11 @@
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
</dependencies>
<build>

View File

@ -10,11 +10,11 @@
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
</dependencies>
<build>

View File

@ -46,13 +46,11 @@
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.2.0</version>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>

View File

@ -46,13 +46,11 @@
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.2.0</version>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>

53
pom.xml
View File

@ -142,7 +142,7 @@
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<artifactId>${dhp-schemas.artifact}</artifactId>
<version>${dhp-schemas.version}</version>
</dependency>
<dependency>
@ -171,25 +171,25 @@
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${dhp.spark.version}</version>
<scope>test</scope>
</dependency>
@ -295,7 +295,7 @@
<dependency>
<groupId>com.lucidworks.spark</groupId>
<artifactId>spark-solr</artifactId>
<version>3.6.0</version>
<version>${sparksolr.version}</version>
<exclusions>
<exclusion>
<artifactId>*</artifactId>
@ -518,7 +518,7 @@
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
<version>${json4s.version}</version>
</dependency>
@ -610,7 +610,11 @@
<testOutputDirectory>target/test-classes</testOutputDirectory>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-plugin-plugin</artifactId>
<version>3.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-project-info-reports-plugin</artifactId>
@ -694,7 +698,7 @@
</plugin>
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<artifactId>mvn-scalafmt_${scala.binary.version}</artifactId>
<version>1.0.1640073709.733712b</version>
<dependencies>
<dependency>
@ -751,7 +755,7 @@
</plugin>
<plugin>
<groupId>org.antipathy</groupId>
<artifactId>mvn-scalafmt_2.11</artifactId>
<artifactId>mvn-scalafmt_${scala.binary.version}</artifactId>
<configuration>
<configLocation>https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf</configLocation>
<skipTestSources>false</skipTestSources>
@ -860,12 +864,16 @@
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.hadoop.version>2.6.0-${dhp.cdh.version}</dhp.hadoop.version>
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp-schemas.artifact>dhp-schemas</dhp-schemas.artifact>
<sparksolr.version>3.6.0</sparksolr.version>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.site.skip>true</dhp.site.skip>
<dhp.guava.version>11.0.2</dhp.guava.version>
<scala.version>2.11.12</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<scala-xml.version>1.3.0</scala-xml.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
@ -889,4 +897,29 @@
<commons.logging.version>1.1.3</commons.logging.version>
<commons.collections.version>3.2.1</commons.collections.version>
</properties>
<!-- Build with scala 12 and Spark 3.4 -->
<profiles>
<profile>
<id>scala-2.12</id>
<properties>
<scala.binary.version>2.12</scala.binary.version>
<scala.version>2.12.18</scala.version>
<!-- scala-xml.version>2.1.0</scala-xml.version -->
<sparksolr.version>4.0.2</sparksolr.version>
<dhp.spark.version>3.4.1</dhp.spark.version>
<dhp.jackson.version>2.14.2</dhp.jackson.version>
<dhp.commons.lang.version>3.12.0</dhp.commons.lang.version>
<json4s.version>3.7.0-M11</json4s.version>
<net.alchim31.maven.version>4.8.1</net.alchim31.maven.version>
<!--
<dhp-schemas.artifact>dhp-schemas_${scala.binary.version}</dhp-schemas.artifact>
<dhp-schemas.version>3.17.2-SNAPSHOT</dhp-schemas.version>
-->
</properties>
</profile>
</profiles>
</project>