Implementation of the LDA tuning workflow

Michele De Bonis 2023-04-03 09:46:55 +02:00
parent 7954c27773
commit 2355f8f65e
16 changed files with 2040 additions and 95 deletions

View File

@ -0,0 +1,10 @@
workingPath = /tmp/lda_working_dir
numPartitions = 1000
entitiesPath = /tmp/publications_with_pid_pubmed
inputFieldJPath = $.description[0].value
vocabularyPath = /tmp/lda_working_dir/essential_science_vocabulary
vocabularyType = file
trainRatio = 0.8
numTopics = 5,10,15,20,25,30,35,40,45,50
maxIterations = 200
outputModelPath = /tmp/lda_working_dir/bestLdaModel

View File

@ -1,20 +1,570 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-and</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dnet-and-test</artifactId>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
<includes>
<include>**/*.java</include>
</includes>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-feature-extraction</artifactId>
<version>${project.version}</version>
</dependency>
<!--Spark Dependencies-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>
<profile>
<id>oozie-package</id>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>1.4.1</version>
<executions>
<execution>
<id>enforce-connection-properties-file-existence</id>
<phase>initialize</phase>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireFilesExist>
<files>
<file>${dhpConnectionProperties}</file>
</files>
<message>
The file with connection properties could not be found. Please create the ${dhpConnectionProperties} file or point to an already existing file with the
-DdhpConnectionProperties property.
</message>
</requireFilesExist>
</rules>
<fail>true</fail>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy dependencies</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<includeScope>${oozie.package.dependencies.include.scope}</includeScope>
<excludeScope>${oozie.package.dependencies.exclude.scope}</excludeScope>
<silent>true</silent>
</configuration>
</execution>
</executions>
</plugin>
<!-- Plugin originally defined in attach-test-resources. It was moved here to ensure that it will execute before priming -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>attach-test-resources-package</id>
<phase>prepare-package</phase>
<goals>
<goal>test-jar</goal>
</goals>
<configuration>
<skip>${oozie.package.skip.test.jar}</skip>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>eu.dnetlib.primer</groupId>
<artifactId>primer-maven-plugin</artifactId>
<version>1.2.0</version>
<executions>
<execution>
<id>priming</id>
<phase>prepare-package</phase>
<goals>
<goal>prime</goal>
</goals>
<configuration>
<classProviderFiles>
<classProviderFile>${project.build.directory}/dependency/*.jar</classProviderFile>
<classProviderFile>${project.build.directory}/*-tests.jar</classProviderFile>
<classProviderFile>${project.build.directory}/classes</classProviderFile>
</classProviderFiles>
<coansysPackageDir>${project.build.directory}/dependency</coansysPackageDir>
<destination>${project.build.directory}/${primed.dir}</destination>
<classpath>${workflow.source.dir}</classpath>
</configuration>
</execution>
</executions>
</plugin>
<!-- reading job.properties to use them in .sh scripts -->
<plugin>
<groupId>org.kuali.maven.plugins</groupId>
<artifactId>properties-maven-plugin</artifactId>
<version>${properties.maven.plugin.version}</version>
<dependencies>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-assembly-resources</artifactId>
<version>${project.version}</version>
<!-- contains project-default.properties -->
</dependency>
</dependencies>
<executions>
<execution>
<id>reading-dhp-properties</id>
<phase>initialize</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<location>${dhpConnectionProperties}</location>
</locations>
<quiet>false</quiet>
</configuration>
</execution>
<execution>
<id>read-default-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<location>classpath:project-default.properties</location>
</locations>
<quiet>true</quiet>
</configuration>
</execution>
<execution>
<id>read-job-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<param>${project.build.directory}/${primed.dir}/job.properties</param>
<param>job-override.properties</param>
</locations>
<quiet>true</quiet>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>${project.version}</version>
<executions>
<execution>
<phase>validate</phase>
<goals>
<goal>generate-properties</goal>
<!-- generates sandboxName based on workflow.source.dir when not specified as commandline parameter -->
</goals>
<configuration>
</configuration>
</execution>
<execution>
<id>write-job-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>write-project-properties</goal>
</goals>
<configuration>
<outputFile>target/${oozie.package.file.name}/job.properties</outputFile>
<!-- notice: dots are not allowed for job.properties! -->
<include>nameNode,jobTracker,queueName,importerQueueName,oozieLauncherQueueName,
workingDir,oozieTopWfApplicationPath,oozieServiceLoc,
sparkDriverMemory,sparkExecutorMemory,sparkExecutorCores,
oozie.wf.application.path,projectVersion,oozie.use.system.libpath,
oozieActionShareLibForSpark1,spark1YarnHistoryServerAddress,spark1EventLogDir,
oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir,
sparkSqlWarehouseDir
</include>
<includeSystemProperties>true</includeSystemProperties>
<includePropertyKeysFromFiles>
<!-- <param>${workflow.source.dir}/job.properties</param> -->
<param>${project.build.directory}/${primed.dir}/job.properties</param>
<param>job-override.properties</param>
</includePropertyKeysFromFiles>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<verbose>true</verbose>
<dateFormat>yyyy-MM-dd'T'HH:mm:ssZ</dateFormat>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<generateGitPropertiesFilename>target/${oozie.package.file.name}/${oozieAppDir}/version.properties</generateGitPropertiesFilename>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<dependencies>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-assembly-resources</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<executions>
<execution>
<id>assembly-oozie-installer</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<finalName>${oozie.package.file.name}_shell_scripts</finalName>
<descriptorRefs>
<descriptorRef>oozie-installer</descriptorRef>
</descriptorRefs>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- this plugin prepares oozie installer package-->
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<!-- extracting shared resources phase -->
<execution>
<id>installer-copy-custom</id>
<phase>process-resources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<tasks>
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
</tasks>
</configuration>
</execution>
<!-- packaging phase -->
<execution>
<phase>package</phase>
<configuration>
<tasks>
<!-- copying workflow resources -->
<mkdir dir="target/${oozie.package.file.name}" />
<mkdir dir="target/${oozie.package.file.name}/${oozieAppDir}" />
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}">
<!-- <fileset dir="${workflow.source.dir}/${oozieAppDir}" /> replacing with primed dir location -->
<fileset dir="target/${primed.dir}/${oozieAppDir}" />
</copy>
<!-- copying all jars to oozie lib directory -->
<mkdir dir="target/${oozie.package.file.name}/${oozieAppDir}/lib" />
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}/lib">
<fileset dir="${project.build.directory}/dependency" />
</copy>
<!-- copying current module lib -->
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}/lib">
<fileset dir="${project.build.directory}">
<include name="*.jar" />
</fileset>
</copy>
<fixcrlf srcdir="target/${oozie.package.file.name}/${oozieAppDir}/" encoding="UTF-8" outputencoding="UTF-8" includes="**/*.sh,**/*.json,**/*.py,**/*.sql" eol="lf" />
<!-- creating tar.gz package -->
<tar destfile="target/${oozie.package.file.name}.tar.gz" compression="gzip" longfile="gnu">
<tarfileset dir="target/${oozie.package.file.name}" />
<tarfileset dir="target/${oozie.package.file.name}_shell_scripts" filemode="0755">
<include name="**/*.sh" />
</tarfileset>
<tarfileset dir="target/${oozie.package.file.name}_shell_scripts" filemode="0644">
<exclude name="**/*.sh" />
</tarfileset>
</tar>
<!-- cleanup -->
<delete dir="target/${oozie.package.file.name}" />
<delete dir="target/${oozie.package.file.name}_shell_scripts" />
</tasks>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>deploy</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.5.0</version>
<executions>
<execution>
<id>create-target-dir</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>upload-oozie-package</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>scp</executable>
<arguments>
<argument>-P ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>target/${oozie.package.file.name}.tar.gz</argument>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>extract-and-upload-to-hdfs</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<!-- <outputFile>target/redirected_upload.log</outputFile> -->
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
<argument>tar -zxf oozie-package.tar.gz; </argument>
<argument>rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; </argument>
<argument>./upload_workflow.sh</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>run</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.5.0</version>
<executions>
<execution>
<id>run-job</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<!-- this file will be used by test verification profile reading job identifier -->
<outputFile>${oozie.execution.log.file.location}</outputFile>
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
<argument>./run_workflow.sh</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>show-run-log-on-stdout</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>cat</executable>
<arguments>
<argument>${oozie.execution.log.file.location}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@ -2,12 +2,9 @@ package eu.dnetlib.jobs;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
@ -49,12 +46,6 @@ public abstract class AbstractSparkJob implements Serializable {
dataset.write().option("compression", "gzip").mode(mode).json(outPath);
}
protected static DedupConfig loadDedupConfig(String dedupConfPath) throws IOException {
return DedupConfig.load(
readFileFromHDFS(dedupConfPath)
);
}
protected static String readFileFromHDFS(String filePath) throws IOException {
Path path=new Path(filePath);

View File

@ -1,2 +1,71 @@
package eu.dnetlib.jobs;public class SparkCountVectorizer {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Optional;
public class SparkCountVectorizer extends AbstractSparkJob{
private static final Logger log = LoggerFactory.getLogger(SparkCountVectorizer.class);
public SparkCountVectorizer(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCountVectorizer(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final String vocabularyPath = parser.get("vocabularyPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("workingPath: '{}'", workingPath);
log.info("vocabularyPath: '{}'", vocabularyPath);
log.info("numPartitions: '{}'", numPartitions);
//read input tokens
Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
//read vocabulary
CountVectorizerModel vocabulary = FeatureTransformer.loadVocabulary(vocabularyPath);
Dataset<Row> countVectorizedData = FeatureTransformer.countVectorizeData(inputTokensDS, vocabulary);
countVectorizedData
.write()
.mode(SaveMode.Overwrite)
.save(workingPath + "/countVectorized");
}
}
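
For reference, a minimal sketch of what FeatureTransformer.loadVocabulary and FeatureTransformer.countVectorizeData presumably wrap, expressed directly against Spark ML's CountVectorizerModel; this is an illustrative assumption, not the project's actual implementation, and the input column of the saved model is assumed to match the tokens column produced by the tokenizer step.

import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

class CountVectorizeSketch {
    // vocabularyPath and workingPath follow the layout used in job.properties above
    static void countVectorize(Dataset<Row> tokens, String vocabularyPath, String workingPath) {
        // CountVectorizerModel is MLReadable, so a vocabulary saved with write() can be reloaded as-is
        CountVectorizerModel vocabulary = CountVectorizerModel.load(vocabularyPath);
        // transform() appends the term-frequency vector column configured on the model
        Dataset<Row> vectorized = vocabulary.transform(tokens);
        vectorized.write().mode(SaveMode.Overwrite).save(workingPath + "/countVectorized");
    }
}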

View File

@ -1,2 +1,94 @@
package eu.dnetlib.jobs;public class SparkCreateVocabulary {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.Optional;
public class SparkCreateVocabulary extends AbstractSparkJob{
final static int VOCAB_SIZE = 1<<18;
final static double MIN_DF = 0.1;
final static double MIN_TF = 1;
private static final Logger log = LoggerFactory.getLogger(SparkCreateVocabulary.class);
public SparkCreateVocabulary(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateVocabulary(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final String vocabularyPath = parser.get("vocabularyPath");
final String vocabularyType = parser.get("vocabularyType"); //from file or from tokens
final double minDF = Optional
.ofNullable(parser.get("minDF"))
.map(Double::valueOf)
.orElse(MIN_DF);
final double minTF = Optional
.ofNullable(parser.get("minTF"))
.map(Double::valueOf)
.orElse(MIN_TF);
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
final int vocabSize = Optional
.ofNullable(parser.get("vocabSize"))
.map(Integer::valueOf)
.orElse(VOCAB_SIZE);
log.info("workingPath: '{}'", workingPath);
log.info("vocabularyPath: '{}'", vocabularyPath);
log.info("vocabularyType: '{}'", vocabularyType);
log.info("minDF: '{}'", minDF);
log.info("minTF: '{}'", minTF);
log.info("vocabSize: '{}'", vocabSize);
Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
CountVectorizerModel vocabulary;
if (vocabularyType.equals("file")) {
try {
vocabulary = FeatureTransformer.createVocabularyFromFile(Paths
.get(getClass().getResource("/eu/dnetlib/jobs/support/vocabulary_words.txt").toURI())
.toFile()
.getAbsolutePath());
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}
else {
vocabulary = FeatureTransformer.createVocabularyFromTokens(inputTokensDS, minDF, minTF, vocabSize);
}
vocabulary.write().overwrite().save(vocabularyPath);
}
}
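
A minimal sketch of the two vocabulary modes handled above ('file' vs 'tokens'), written against plain Spark ML; the column names "tokens" and "features" are assumptions, and FeatureTransformer.createVocabularyFromFile / createVocabularyFromTokens may differ in detail.

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class VocabularySketch {
    // "file" mode: one term per line, as in the vocabulary_words.txt shipped with this commit
    static CountVectorizerModel fromWordFile(String path) throws Exception {
        List<String> words = Files.readAllLines(Paths.get(path));
        return new CountVectorizerModel(words.toArray(new String[0]))
                .setInputCol("tokens")       // assumed column names
                .setOutputCol("features");
    }

    // "tokens" mode: fit the vocabulary on the tokenized corpus
    static CountVectorizerModel fromTokens(Dataset<Row> tokens, double minDF, double minTF, int vocabSize) {
        return new CountVectorizer()
                .setInputCol("tokens")       // assumed column names
                .setOutputCol("features")
                .setMinDF(minDF)             // minimum number (or fraction) of documents a term must appear in
                .setMinTF(minTF)             // per-document frequency threshold
                .setVocabSize(vocabSize)     // keep at most vocabSize terms
                .fit(tokens);
    }
}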

View File

@ -1,11 +1,90 @@
package eu.dnetlib.jobs;
import java.io.IOException;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.featureextraction.util.Utilities;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
public class SparkLDAHyperparametersTuning extends AbstractSparkJob{
import java.io.IOException;
import java.util.*;
import java.util.stream.Stream;
public class SparkLDATuning extends AbstractSparkJob{
private static final Logger log = LoggerFactory.getLogger(SparkLDATuning.class);
public SparkLDATuning(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkLDATuning(
parser,
getSparkSession(conf)
).run();
}
@Override
void run() throws IOException {
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final int maxIterations = Integer.parseInt(parser.get("maxIterations"));
final double trainRatio = Double.parseDouble(parser.get("trainRatio"));
int[] numTopics = Arrays.stream(parser.get("numTopics").split(",")).mapToInt(s -> Integer.parseInt(s)).toArray();
final String outputModelPath = parser.get("outputModelPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("maxIterations: '{}'", maxIterations);
log.info("numTopics: '{}'", numTopics.toString());
log.info("trainRatio: '{}'", trainRatio);
log.info("outputModelPath: '{}'", outputModelPath);
Dataset<Row> inputFeaturesDS = spark.read().load(workingPath + "/countVectorized");
Map<Integer, Tuple2<LDAModel, Double>> ldaModels =
FeatureTransformer.ldaTuning(inputFeaturesDS, trainRatio, numTopics, maxIterations);
double bestPerplexity = Double.MAX_VALUE;
LDAModel bestModel = null;
List<String> stats = new ArrayList<>();
stats.add("k,perplexity,path");
for(Integer k: ldaModels.keySet()) {
//save LDAModel
ldaModels.get(k)._1().write().overwrite().save(workingPath + "/lda_model_k" + k);
//prepare line
stats.add(k + "," + ldaModels.get(k)._2() + "," + workingPath + "/lda_model_k" + k);
//pick the best model
bestModel = (ldaModels.get(k)._2() <= bestPerplexity)? ldaModels.get(k)._1() : bestModel;
bestPerplexity = Math.min(ldaModels.get(k)._2(), bestPerplexity);
}
bestModel.write().overwrite().save(outputModelPath);
Utilities.writeLinesToHDFSFile(stats, workingPath + "/perplexity_stats.csv");
}
}
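
A minimal sketch of the perplexity-based selection that FeatureTransformer.ldaTuning presumably performs: split the vectorized data by trainRatio, fit one LDA model per value of k, and keep the model with the lowest perplexity on the held-out split. The "features" column name and the use of logPerplexity are assumptions.

import org.apache.spark.ml.clustering.LDA;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class LdaTuningSketch {
    // returns the model with the lowest perplexity on the held-out split
    static LDAModel tune(Dataset<Row> features, double trainRatio, int[] numTopics, int maxIterations) {
        Dataset<Row>[] splits = features.randomSplit(new double[]{trainRatio, 1 - trainRatio});
        Dataset<Row> train = splits[0];
        Dataset<Row> test = splits[1];

        LDAModel best = null;
        double bestPerplexity = Double.MAX_VALUE;
        for (int k : numTopics) {
            LDAModel model = new LDA()
                    .setK(k)
                    .setMaxIter(maxIterations)
                    .setFeaturesCol("features")        // assumed column name
                    .fit(train);
            double perplexity = model.logPerplexity(test); // lower is better
            if (perplexity < bestPerplexity) {
                bestPerplexity = perplexity;
                best = model;
            }
        }
        return best;
    }
}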

View File

@ -1,2 +1,69 @@
package eu.dnetlib.jobs;public class SparkTokenizer {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.featureextraction.util.Utilities;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Optional;
public class SparkTokenizer extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(SparkTokenizer.class);
public SparkTokenizer(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/tokenizer_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkTokenizer(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String inputFieldJPath = parser.get("inputFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("inputField: '{}'", inputFieldJPath);
log.info("numPartitions: '{}'", numPartitions);
JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext());
SQLContext sqlContext = SQLContext.getOrCreate(spark.sparkContext());
Dataset<Row> inputDS = Utilities.prepareDataset(sqlContext, context.textFile(entitiesPath).repartition(numPartitions), inputFieldJPath);
Dataset<Row> tokensDS = FeatureTransformer.tokenizeData(inputDS);
tokensDS
.write()
.mode(SaveMode.Overwrite)
.save(workingPath + "/tokens");
}
}
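
A minimal sketch of the tokenization step in plain Spark ML terms; FeatureTransformer.tokenizeData may use a different tokenizer or column names (both "sentence" and "tokens" are assumed below).

import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class TokenizeSketch {
    // splits the text field extracted by Utilities.prepareDataset into lowercase word tokens
    static Dataset<Row> tokenize(Dataset<Row> input) {
        return new RegexTokenizer()
                .setInputCol("sentence")   // assumed name of the column produced by Utilities.prepareDataset
                .setOutputCol("tokens")
                .setPattern("\\W+")        // split on runs of non-word characters
                .setToLowercase(true)
                .transform(input);
    }
}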

View File

@ -68,7 +68,7 @@ public class ArgumentApplicationParser implements Serializable {
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
gzip.close();
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
return Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws Exception {

View File

@ -0,0 +1,436 @@
hypothesis
experiment
control
model
graph
precision
accuracy
assumption
procedure
observation
inference
method
framework
data
prediction
quantitative
qualitative
bias
constant
variable
chart
trend
empirical
evidence
cell
chromosome
cellulose
chloroplast
cytoplasm
diffusion
lysosome
meiosis
membrane
mitochondrion
mitosis
nucleolus
nucleus
organelle
osmosis
permeable
photosynthesis
respiration
ribosome
vacuole
amphibian
arthropod
bacteria
cold-blooded
domain
eukaryote
family
fungus
genus
invertebrate
kingdom
mammal
order
phylum
prokaryote
reptile
vertebrate
virus
blood
warm-blooded
annual
bulb
chlorophyll
germinate
germination
leaf
perennial
phloem
phototropism
pollen
pollinate
root
seed
stamen
stoma
transpiration
xylem
circulation
digestion
digestive
endocrine
excretion
homeostasis
hormone
immune
immunize
infection
lymphatic
metabolism
nervous
nutrition
pathogen
reproduction
respiratory
vaccination
aorta
artery
brain
capillary
cardiac
cartilage
cerebellum
cerebrum
cranium
epidermis
esophagus
femur
gland
hemoglobin
involuntary
joint
ligament
muscle
nerve
neuron
organ
plasma
platelet
skeleton
sternum
synapse
tendon
tissue
vein
ventricle
vertebra
voluntary
autotrophic
biome
camouflage
carnivore
commensalism
community
competition
consumer
decomposer
habitat
herbivore
heterotrophic
host
interdependent
migration
mutualism
niche
nutrient
omnivore
organism
parasite
parasitism
population
predator
prey
producer
scavenger
succession
symbiosis
adaptation
allele
clone
dominant
extinction
gene
genome
genotype
heredity
heterozygous
homologous
homozygous
hybrid
inherit
mutation
natural
selection
offspring
phenotype
probability
recessive
species
trait
variation
altitude
core
crust
deposition
elevation
epoch
equator
era
erosion
fossil
geology
hydrosphere
igneous
lithosphere
mantle
metamorphic
paleontology
petrifaction
prehistoric
sedimentary
sedimentation
stratum
tide
aftershock
canyon
continent
continental
drift
desert
dormant
earthquake
epicenter
eruption
fault
geyser
glacier
iceberg
lava
magma
molten
plate
tectonics
plateau
ridge
rift
savanna
seismic
seismograph
subduction
tundra
volcano
watershed
barometer
blizzard
climate
change
condensation
convection
current
cyclone
desertification
drought
evaporation
front
humidity
hurricane
meteorology
monsoon
precipitation
pressure
sleet
temperature
thermometer
tornado
tropical
tsunami
weather
aquifer
biodegradable
biodiversity
biomass
biosphere
conservation
decay
deforestation
depletion
ecology
extraction
fission
fuel
fracking
geothermal
global
warming
irrigation
landfill
mineral
resource
ozone
pesticide
petroleum
pollutant
pollution
reclamation
recycle
renewable
reservoir
salinity
sustainable
turbine
apogee
asteroid
astronomy
atmosphere
axis
constellation
comet
corona
eclipse
elliptical
galaxy
luminosity
lunar
meteor
meteorite
nadir
nebula
observatory
orbit
perigee
pulsar
quasar
solar
stellar
supernova
vacuum
wane
zenith
alloy
anion
atom
bond
cation
compound
density
ductile
electron
element
gas
ion
isotope
liquid
malleable
mass
metal
metalloid
molecule
neutron
nonmetal
polar
proton
solid
substance
volume
acid
base
catalyst
concentration
dissolve
enzyme
oxidation
precipitate
reactant
reaction
saturate
solubility
solute
solution
solvent
substrate
synthesis
conduction
endothermic
energy
entropy
equilibrium
exothermic
heat
insulation
matter
nuclear
thermal
acceleration
axle
centripetal
deceleration
force
friction
fulcrum
gravity
inclined
inertia
kinetic
lever
machine
momentum
motion
potential
power
pulley
screw
speed
tensile
torque
velocity
wedge
amplitude
circuit
compression
crest
diffraction
emission
frequency
magnet
medium
particle
period
pole
radiation
rarefaction
reflection
refraction
spectrum
trough
ultraviolet
wavelength
x-ray
ray
biology
taxonomy
plant
ecosystem
genetic
evolution
geologic
feature
environment
space
physic
wave
electricity
magnetism

View File

@ -0,0 +1,20 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "location of the working directory",
"paramRequired": true
},
{
"paramName": "v",
"paramLongName": "vocabularyPath",
"paramDescription": "location to store the vocabulary",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -0,0 +1,44 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "location of the working directory",
"paramRequired": true
},
{
"paramName": "v",
"paramLongName": "vocabularyPath",
"paramDescription": "location to store the vocabulary",
"paramRequired": true
},
{
"paramName": "vt",
"paramLongName": "vocabularyType",
"paramDescription": "type of vocabulary: it could ben 'tokens' if generated with tokens or 'file' if generated from file of words",
"paramRequired": true
},
{
"paramName": "md",
"paramLongName": "minDF",
"paramDescription": "specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer greater than or equal to 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents",
"paramRequired": false
},
{
"paramName": "mt",
"paramLongName": "minTF",
"paramDescription": "filter to ignore rare words in a document. For each document, terms with frequency/count less than the given threshold are ignored. If this is an integer greater than or equal to 1, then this specifies a count (of times the term must appear in the document); if this is a double in [0,1), then this specifies a fraction (out of the document's token count)",
"paramRequired": false
},
{
"paramName": "s",
"paramLongName": "vocabSize",
"paramDescription": "size of the vocabulary",
"paramRequired": false
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -0,0 +1,38 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "tr",
"paramLongName": "trainRatio",
"paramDescription": "dataset percentage to be used as training set, the remaining part is the test set",
"paramRequired": true
},
{
"paramName": "nt",
"paramLongName": "numTopics",
"paramDescription": "comma separated number of topics to tune the model",
"paramRequired": true
},
{
"paramName": "mi",
"paramLongName": "maxIterations",
"paramDescription": "maximum number of iteration of the algorithm",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputModelPath",
"paramDescription": "best model in terms of perplexity",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "i",
"paramLongName": "entitiesPath",
"paramDescription": "the input data: entities that should be tokenized",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "inputFieldJPath",
"paramDescription": "the jpath of the field to be tokenized",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -1,4 +1,4 @@
<workflow-app name="Deduplication WF" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="LDA Tuning WF" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>entitiesPath</name>
@ -13,8 +13,32 @@
<description>number of partitions for the spark files</description>
</property>
<property>
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
<name>inputFieldJPath</name>
<description>json path of the input field in the entities</description>
</property>
<property>
<name>vocabularyPath</name>
<description>location of the vocabulary</description>
</property>
<property>
<name>vocabularyType</name>
<description>type of the vocabulary: file or tokens</description>
</property>
<property>
<name>trainRatio</name>
<description>fraction of the data to be used as the training set</description>
</property>
<property>
<name>numTopics</name>
<description>numbers of topics with which to test the LDA model</description>
</property>
<property>
<name>maxIterations</name>
<description>maximum number of iterations of the LDA algorithm</description>
</property>
<property>
<name>outputModelPath</name>
<description>location of the best LDA model</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -81,22 +105,22 @@
<fs>
<delete path="${workingPath}"/>
</fs>
<ok to="CreateSimRels"/>
<ok to="TokenizeData"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<action name="TokenizeData">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.jobs.SparkCreateSimRels</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<name>Tokenize Data</name>
<class>eu.dnetlib.jobs.SparkTokenizer</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=32
--executor-memory=12G
--executor-cores=4
--driver-memory=4G
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -105,27 +129,26 @@
--conf spark.dynamicAllocation.enabled=false
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--inputFieldJPath</arg><arg>${inputFieldJPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--useTree</arg><arg>${useTree}</arg>
</spark>
<ok to="CreateMergeRels"/>
<ok to="CreateVocabulary"/>
<error to="Kill"/>
</action>
<action name="CreateMergeRels">
<action name="CreateVocabulary">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Merge Relations</name>
<class>eu.dnetlib.jobs.SparkCreateMergeRels</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<name>Create Vocabulary</name>
<class>eu.dnetlib.jobs.SparkCreateVocabulary</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=32
--executor-memory=12G
--executor-cores=4
--driver-memory=4G
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -133,22 +156,22 @@
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=true
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--vocabularyPath</arg><arg>${vocabularyPath}</arg>
<arg>--vocabularyType</arg><arg>${vocabularyType}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<ok to="CreateCountVectors"/>
<error to="Kill"/>
</action>
<action name="ComputeStatistics">
<action name="CreateCountVectors">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Compute Statistics</name>
<class>eu.dnetlib.jobs.SparkComputeStatistics</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<class>eu.dnetlib.jobs.SparkCountVectorizer</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
@ -159,7 +182,36 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--vocabularyPath</arg><arg>${vocabularyPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
</spark>
<ok to="LDATuning"/>
<error to="Kill"/>
</action>
<action name="LDATuning">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>LDA Tuning</name>
<class>eu.dnetlib.jobs.SparkLDATuning</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--trainRatio</arg><arg>${trainRatio}</arg>
<arg>--numTopics</arg><arg>${numTopics}</arg>
<arg>--maxIterations</arg><arg>${maxIterations}</arg>
<arg>--outputModelPath</arg><arg>${outputModelPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
</spark>
@ -167,31 +219,5 @@
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@ -1,6 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
@ -8,6 +9,7 @@ import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
@ -15,11 +17,14 @@ import java.nio.file.Paths;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class SparkJobsTest {
public class LDAAnalysisTest {
static SparkSession spark;
static JavaSparkContext context;
final static String workingPath = "/tmp/working_dir";
final static String tokensPath = workingPath + "/tokens";
final static String vocabularyPath = workingPath + "/vocabulary";
final static String bestLDAModelPath = workingPath + "/bestLDAmodel";
final static String numPartitions = "20";
final String inputDataPath = Paths
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI())
@ -27,12 +32,11 @@ public class SparkJobsTest {
.getAbsolutePath();
final static String inputFieldJPath = "$.description[0].value";
public SparkJobsTest() throws URISyntaxException {}
public LDAAnalysisTest() throws URISyntaxException {}
public static void cleanup() throws IOException {
//remove directories and clean workspace
//TODO add directories to be removed
//FileUtils.deleteDirectory(new File(path));
FileUtils.deleteDirectory(new File(workingPath));
}
@BeforeAll
@ -48,10 +52,10 @@ public class SparkJobsTest {
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
@AfterAll
public static void finalCleanUp() throws IOException {
cleanup();
}
// @AfterAll
// public static void finalCleanUp() throws IOException {
// cleanup();
// }
@Test
@Order(1)
@ -75,6 +79,69 @@ public class SparkJobsTest {
}
@Test
@Order(2)
public void createVocabularyTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[] {
"-w", workingPath,
"-v", vocabularyPath,
"-vt", "file"
}
);
new SparkCreateVocabulary(
parser,
spark
).run();
}
@Test
@Order(3)
public void countVectorizeTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[]{
"-w", workingPath,
"-v", vocabularyPath,
"-np", numPartitions
}
);
new SparkCountVectorizer(
parser,
spark
).run();
}
@Test
@Order(4)
public void ldaTuningTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[]{
"-w", workingPath,
"-np", numPartitions,
"-tr", "0.8",
"-nt", "2,3,4,5",
"-mi", "5",
"-o", bestLDAModelPath
});
new SparkLDATuning(
parser,
spark
).run();
}
public static String readResource(String path, Class<? extends AbstractSparkJob> clazz) throws IOException {
return IOUtils.toString(clazz.getResourceAsStream(path));
}

448
pom.xml
View File

@ -1,17 +1,447 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-and</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<packaging>pom</packaging>
<url>http://www.d-net.research-infrastructures.eu</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
<comments>A business-friendly OSS license</comments>
</license>
</licenses>
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/michele.debonis/dnet-and.git</developerConnection>
<tag>dnet-and-1.0.0</tag>
</scm>
<modules>
<module>dhp-build</module>
<module>dnet-feature-extraction</module>
<module>dnet-and-test</module>
</modules>
<issueManagement>
<system>Redmine</system>
<url>https://issue.openaire.research-infrastructures.eu/projects/openaire</url>
</issueManagement>
<distributionManagement>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
<repositories>
<repository>
<id>dnet-deps</id>
<name>dnet-dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps</url>
<layout>default</layout>
</repository>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-snapshots</id>
<name>D-Net 45 Snapshots</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>cloudera</id>
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>ceon</id>
<name>Ceon Repository</name>
<url>https://maven.ceon.pl/artifactory/repo</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<build>
<directory>target</directory>
<outputDirectory>target/classes</outputDirectory>
<finalName>${project.artifactId}-${project.version}</finalName>
<testOutputDirectory>target/test-classes</testOutputDirectory>
<!--*************************************************-->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>attach-sources</id>
<phase>verify</phase>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
<configuration>
<detectLinks>true</detectLinks>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.13</version>
<executions>
<execution>
<id>integration-test</id>
<goals>
<goal>integration-test</goal>
</goals>
</execution>
<execution>
<id>verify</id>
<goals>
<goal>verify</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- <plugin>-->
<!-- <groupId>org.apache.maven.plugins</groupId>-->
<!-- <artifactId>maven-plugin-plugin</artifactId>-->
<!-- <version>3.7.1</version>-->
<!-- <configuration>-->
<!-- &lt;!&ndash; see http://jira.codehaus.org/browse/MNG-5346 &ndash;&gt;-->
<!-- <skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>-->
<!-- </configuration>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <id>mojo-descriptor</id>-->
<!-- <goals>-->
<!-- <goal>descriptor</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- </plugin>-->
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.5.3</version>
</plugin>
</plugins>
</build>
<pluginRepositories>
<pluginRepository>
<id>iis-releases</id>
<name>iis releases plugin repository</name>
<url>http://maven.ceon.pl/artifactory/iis-releases</url>
<layout>default</layout>
</pluginRepository>
</pluginRepositories>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<oozie.package.file.name>oozie-package</oozie.package.file.name>
<workflow.source.dir>src/test/resources/define/path/pointing/to/directory/holding/oozie_app</workflow.source.dir>
<oozieAppDir>oozie_app</oozieAppDir>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<google.gson.version>2.2.2</google.gson.version>
<google.guava.version>15.0</google.guava.version>
<spark.version>2.2.0</spark.version>
<sparknlp.version>2.5.5</sparknlp.version>
<jackson.version>2.6.5</jackson.version>
<mockito-core.version>3.3.3</mockito-core.version>
<commons.lang.version>3.5</commons.lang.version>
<commons.io.version>2.4</commons.io.version>
<commons.collections.version>3.2.1</commons.collections.version>
<commons.logging.version>1.1.3</commons.logging.version>
<junit.version>4.9</junit.version>
<scala.version>2.11.8</scala.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
<!-- from dhp-workflows -->
<queueName>default</queueName>
<importerQueueName>default</importerQueueName>
<oozieLauncherQueueName>default</oozieLauncherQueueName>
<primed.dir>primed</primed.dir>
<oozie.package.dependencies.include.scope>runtime</oozie.package.dependencies.include.scope>
<oozie.package.dependencies.exclude.scope />
<oozie.package.skip.test.jar>true</oozie.package.skip.test.jar>
<dhpConnectionProperties>${user.home}/.dhp/application.properties</dhpConnectionProperties>
<output.dir.name>${maven.build.timestamp}</output.dir.name>
<projectVersion>${project.version}</projectVersion>
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
</project>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>${mockito-core.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>${mockito-core.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${google.guava.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${google.gson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>${commons.collections.version}</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons.logging.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.10</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.4.0</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.ibm.icu</groupId>-->
<!-- <artifactId>icu4j</artifactId>-->
<!-- <version>70.1</version>-->
<!-- </dependency>-->
</dependencies>
</dependencyManagement>
<profiles>
</profiles>
</project>