Implementation of the LDA tuning workflow

Michele De Bonis 2023-04-03 09:46:55 +02:00
parent 7954c27773
commit 2355f8f65e
16 changed files with 2040 additions and 95 deletions

View File

@ -0,0 +1,10 @@
workingPath = /tmp/lda_working_dir
numPartitions = 1000
entitiesPath = /tmp/publications_with_pid_pubmed
inputFieldJPath = $.description[0].value
vocabularyPath = /tmp/lda_working_dir/essential_science_vocabulary
vocabularyType = file
trainRatio = 0.8
numTopics = 5,10,15,20,25,30,35,40,45,50
maxIterations = 200
outputModelPath = /tmp/lda_working_dir/bestLdaModel

View File

@ -1,20 +1,570 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-and</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dnet-and-test</artifactId>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
<includes>
<include>**/*.java</include>
</includes>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-feature-extraction</artifactId>
<version>${project.version}</version>
</dependency>
<!--Spark Dependencies-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>
<profile>
<id>oozie-package</id>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>1.4.1</version>
<executions>
<execution>
<id>enforce-connection-properties-file-existence</id>
<phase>initialize</phase>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireFilesExist>
<files>
<file>${dhpConnectionProperties}</file>
</files>
<message>
The file with connection properties could not be found. Please create the ${dhpConnectionProperties} file or point to an already existing file with the
-DdhpConnectionProperties property.
</message>
</requireFilesExist>
</rules>
<fail>true</fail>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy dependencies</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<includeScope>${oozie.package.dependencies.include.scope}</includeScope>
<excludeScope>${oozie.package.dependencies.exclude.scope}</excludeScope>
<silent>true</silent>
</configuration>
</execution>
</executions>
</plugin>
<!-- Plugin originally defined in attach-test-resources. It was moved here to ensure that it will execute before priming -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>attach-test-resources-package</id>
<phase>prepare-package</phase>
<goals>
<goal>test-jar</goal>
</goals>
<configuration>
<skip>${oozie.package.skip.test.jar}</skip>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>eu.dnetlib.primer</groupId>
<artifactId>primer-maven-plugin</artifactId>
<version>1.2.0</version>
<executions>
<execution>
<id>priming</id>
<phase>prepare-package</phase>
<goals>
<goal>prime</goal>
</goals>
<configuration>
<classProviderFiles>
<classProviderFile>${project.build.directory}/dependency/*.jar</classProviderFile>
<classProviderFile>${project.build.directory}/*-tests.jar</classProviderFile>
<classProviderFile>${project.build.directory}/classes</classProviderFile>
</classProviderFiles>
<coansysPackageDir>${project.build.directory}/dependency</coansysPackageDir>
<destination>${project.build.directory}/${primed.dir}</destination>
<classpath>${workflow.source.dir}</classpath>
</configuration>
</execution>
</executions>
</plugin>
<!-- reading job.properties to use them in .sh scripts -->
<plugin>
<groupId>org.kuali.maven.plugins</groupId>
<artifactId>properties-maven-plugin</artifactId>
<version>${properties.maven.plugin.version}</version>
<dependencies>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-assembly-resources</artifactId>
<version>${project.version}</version>
<!-- contains project-default.properties -->
</dependency>
</dependencies>
<executions>
<execution>
<id>reading-dhp-properties</id>
<phase>initialize</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<location>${dhpConnectionProperties}</location>
</locations>
<quiet>false</quiet>
</configuration>
</execution>
<execution>
<id>read-default-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<location>classpath:project-default.properties</location>
</locations>
<quiet>true</quiet>
</configuration>
</execution>
<execution>
<id>read-job-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>read-project-properties</goal>
</goals>
<configuration>
<locations>
<param>${project.build.directory}/${primed.dir}/job.properties</param>
<param>job-override.properties</param>
</locations>
<quiet>true</quiet>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>${project.version}</version>
<executions>
<execution>
<phase>validate</phase>
<goals>
<goal>generate-properties</goal>
<!-- generates sandboxName based on workflow.source.dir when not specified as commandline parameter -->
</goals>
<configuration>
</configuration>
</execution>
<execution>
<id>write-job-properties</id>
<phase>prepare-package</phase>
<goals>
<goal>write-project-properties</goal>
</goals>
<configuration>
<outputFile>target/${oozie.package.file.name}/job.properties</outputFile>
<!-- notice: dots are not allowed for job.properties! -->
<include>nameNode,jobTracker,queueName,importerQueueName,oozieLauncherQueueName,
workingDir,oozieTopWfApplicationPath,oozieServiceLoc,
sparkDriverMemory,sparkExecutorMemory,sparkExecutorCores,
oozie.wf.application.path,projectVersion,oozie.use.system.libpath,
oozieActionShareLibForSpark1,spark1YarnHistoryServerAddress,spark1EventLogDir,
oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir,
sparkSqlWarehouseDir
</include>
<includeSystemProperties>true</includeSystemProperties>
<includePropertyKeysFromFiles>
<!-- <param>${workflow.source.dir}/job.properties</param> -->
<param>${project.build.directory}/${primed.dir}/job.properties</param>
<param>job-override.properties</param>
</includePropertyKeysFromFiles>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
</execution>
</executions>
<configuration>
<verbose>true</verbose>
<dateFormat>yyyy-MM-dd'T'HH:mm:ssZ</dateFormat>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<generateGitPropertiesFilename>target/${oozie.package.file.name}/${oozieAppDir}/version.properties</generateGitPropertiesFilename>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<dependencies>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build-assembly-resources</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<executions>
<execution>
<id>assembly-oozie-installer</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<finalName>${oozie.package.file.name}_shell_scripts</finalName>
<descriptorRefs>
<descriptorRef>oozie-installer</descriptorRef>
</descriptorRefs>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- this plugin prepares oozie installer package-->
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<!-- extracting shared resources phase -->
<execution>
<id>installer-copy-custom</id>
<phase>process-resources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<tasks>
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
</tasks>
</configuration>
</execution>
<!-- packaging phase -->
<execution>
<phase>package</phase>
<configuration>
<tasks>
<!-- copying workflow resources -->
<mkdir dir="target/${oozie.package.file.name}" />
<mkdir dir="target/${oozie.package.file.name}/${oozieAppDir}" />
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}">
<!-- <fileset dir="${workflow.source.dir}/${oozieAppDir}" /> replacing with primed dir location -->
<fileset dir="target/${primed.dir}/${oozieAppDir}" />
</copy>
<!-- copying all jars to oozie lib directory -->
<mkdir dir="target/${oozie.package.file.name}/${oozieAppDir}/lib" />
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}/lib">
<fileset dir="${project.build.directory}/dependency" />
</copy>
<!-- copying current module lib -->
<copy todir="target/${oozie.package.file.name}/${oozieAppDir}/lib">
<fileset dir="${project.build.directory}">
<include name="*.jar" />
</fileset>
</copy>
<fixcrlf srcdir="target/${oozie.package.file.name}/${oozieAppDir}/" encoding="UTF-8" outputencoding="UTF-8" includes="**/*.sh,**/*.json,**/*.py,**/*.sql" eol="lf" />
<!-- creating tar.gz package -->
<tar destfile="target/${oozie.package.file.name}.tar.gz" compression="gzip" longfile="gnu">
<tarfileset dir="target/${oozie.package.file.name}" />
<tarfileset dir="target/${oozie.package.file.name}_shell_scripts" filemode="0755">
<include name="**/*.sh" />
</tarfileset>
<tarfileset dir="target/${oozie.package.file.name}_shell_scripts" filemode="0644">
<exclude name="**/*.sh" />
</tarfileset>
</tar>
<!-- cleanup -->
<delete dir="target/${oozie.package.file.name}" />
<delete dir="target/${oozie.package.file.name}_shell_scripts" />
</tasks>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>deploy</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.5.0</version>
<executions>
<execution>
<id>create-target-dir</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>upload-oozie-package</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>scp</executable>
<arguments>
<argument>-P ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>target/${oozie.package.file.name}.tar.gz</argument>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>extract-and-upload-to-hdfs</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<!-- <outputFile>target/redirected_upload.log</outputFile> -->
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
<argument>tar -zxf oozie-package.tar.gz; </argument>
<argument>rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; </argument>
<argument>./upload_workflow.sh</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>run</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.5.0</version>
<executions>
<execution>
<id>run-job</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>ssh</executable>
<!-- this file will be used by test verification profile reading job identifier -->
<outputFile>${oozie.execution.log.file.location}</outputFile>
<arguments>
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
<argument>-o StrictHostKeyChecking=no</argument>
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
<argument>./run_workflow.sh</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>show-run-log-on-stdout</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>cat</executable>
<arguments>
<argument>${oozie.execution.log.file.location}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>

View File

@ -2,12 +2,9 @@ package eu.dnetlib.jobs;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
@ -49,12 +46,6 @@ public abstract class AbstractSparkJob implements Serializable {
dataset.write().option("compression", "gzip").mode(mode).json(outPath);
}
protected static DedupConfig loadDedupConfig(String dedupConfPath) throws IOException {
return DedupConfig.load(
readFileFromHDFS(dedupConfPath)
);
}
protected static String readFileFromHDFS(String filePath) throws IOException {
Path path=new Path(filePath);

View File

@ -1,2 +1,71 @@
package eu.dnetlib.jobs;public class SparkCountVectorizer {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Optional;
public class SparkCountVectorizer extends AbstractSparkJob{
private static final Logger log = LoggerFactory.getLogger(SparkCountVectorizer.class);
public SparkCountVectorizer(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCountVectorizer(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final String vocabularyPath = parser.get("vocabularyPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("workingPath: '{}'", workingPath);
log.info("vocabularyPath: '{}'", vocabularyPath);
log.info("numPartitions: '{}'", numPartitions);
//read input tokens
Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
//read vocabulary
CountVectorizerModel vocabulary = FeatureTransformer.loadVocabulary(vocabularyPath);
Dataset<Row> countVectorizedData = FeatureTransformer.countVectorizeData(inputTokensDS, vocabulary);
countVectorizedData
.write()
.mode(SaveMode.Overwrite)
.save(workingPath + "/countVectorized");
}
}
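
For reference, a minimal sketch of what FeatureTransformer.loadVocabulary and FeatureTransformer.countVectorizeData presumably wrap, expressed directly against Spark ML's CountVectorizerModel; this is an illustrative assumption, not the project's actual implementation, and the input column of the saved model is assumed to match the tokens column produced by the tokenizer step.

import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

class CountVectorizeSketch {
    // vocabularyPath and workingPath follow the layout used in job.properties above
    static void countVectorize(Dataset<Row> tokens, String vocabularyPath, String workingPath) {
        // CountVectorizerModel is MLReadable, so a vocabulary saved with write() can be reloaded as-is
        CountVectorizerModel vocabulary = CountVectorizerModel.load(vocabularyPath);
        // transform() appends the term-frequency vector column configured on the model
        Dataset<Row> vectorized = vocabulary.transform(tokens);
        vectorized.write().mode(SaveMode.Overwrite).save(workingPath + "/countVectorized");
    }
}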

View File

@ -1,2 +1,94 @@
package eu.dnetlib.jobs;public class SparkCreateVocabulary {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.Optional;
public class SparkCreateVocabulary extends AbstractSparkJob{
final static int VOCAB_SIZE = 1<<18;
final static double MIN_DF = 0.1;
final static double MIN_TF = 1;
private static final Logger log = LoggerFactory.getLogger(SparkCreateVocabulary.class);
public SparkCreateVocabulary(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateVocabulary(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final String vocabularyPath = parser.get("vocabularyPath");
final String vocabularyType = parser.get("vocabularyType"); //from file or from tokens
final double minDF = Optional
.ofNullable(parser.get("minDF"))
.map(Double::valueOf)
.orElse(MIN_DF);
final double minTF = Optional
.ofNullable(parser.get("minTF"))
.map(Double::valueOf)
.orElse(MIN_TF);
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
final int vocabSize = Optional
.ofNullable(parser.get("vocabSize"))
.map(Integer::valueOf)
.orElse(VOCAB_SIZE);
log.info("workingPath: '{}'", workingPath);
log.info("vocabularyPath: '{}'", vocabularyPath);
log.info("vocabularyType: '{}'", vocabularyType);
log.info("minDF: '{}'", minDF);
log.info("minTF: '{}'", minTF);
log.info("vocabSize: '{}'", vocabSize);
Dataset<Row> inputTokensDS = spark.read().load(workingPath + "/tokens").repartition(numPartitions);
CountVectorizerModel vocabulary;
if (vocabularyType.equals("file")) {
try {
vocabulary = FeatureTransformer.createVocabularyFromFile(Paths
.get(getClass().getResource("/eu/dnetlib/jobs/support/vocabulary_words.txt").toURI())
.toFile()
.getAbsolutePath());
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}
else {
vocabulary = FeatureTransformer.createVocabularyFromTokens(inputTokensDS, minDF, minTF, vocabSize);
}
vocabulary.write().overwrite().save(vocabularyPath);
}
}
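
A minimal sketch of the two vocabulary modes handled above ('file' vs 'tokens'), written against plain Spark ML; the column names "tokens" and "features" are assumptions, and FeatureTransformer.createVocabularyFromFile / createVocabularyFromTokens may differ in detail.

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class VocabularySketch {
    // "file" mode: one term per line, as in the vocabulary_words.txt shipped with this commit
    static CountVectorizerModel fromWordFile(String path) throws Exception {
        List<String> words = Files.readAllLines(Paths.get(path));
        return new CountVectorizerModel(words.toArray(new String[0]))
                .setInputCol("tokens")       // assumed column names
                .setOutputCol("features");
    }

    // "tokens" mode: fit the vocabulary on the tokenized corpus
    static CountVectorizerModel fromTokens(Dataset<Row> tokens, double minDF, double minTF, int vocabSize) {
        return new CountVectorizer()
                .setInputCol("tokens")       // assumed column names
                .setOutputCol("features")
                .setMinDF(minDF)             // minimum number (or fraction) of documents a term must appear in
                .setMinTF(minTF)             // per-document frequency threshold
                .setVocabSize(vocabSize)     // keep at most vocabSize terms
                .fit(tokens);
    }
}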

View File

@ -1,11 +1,90 @@
package eu.dnetlib.jobs;
import java.io.IOException;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.featureextraction.util.Utilities;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
public class SparkLDAHyperparametersTuning extends AbstractSparkJob{
import java.io.IOException;
import java.util.*;
import java.util.stream.Stream;
public class SparkLDATuning extends AbstractSparkJob{
private static final Logger log = LoggerFactory.getLogger(SparkLDATuning.class);
public SparkLDATuning(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkLDATuning(
parser,
getSparkSession(conf)
).run();
}
@Override
void run() throws IOException {
public void run() throws IOException {
// read oozie parameters
final String workingPath = parser.get("workingPath");
final int maxIterations = Integer.parseInt(parser.get("maxIterations"));
final double trainRatio = Double.parseDouble(parser.get("trainRatio"));
int[] numTopics = Arrays.stream(parser.get("numTopics").split(",")).mapToInt(s -> Integer.parseInt(s)).toArray();
final String outputModelPath = parser.get("outputModelPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("maxIterations: '{}'", maxIterations);
log.info("numTopics: '{}'", numTopics.toString());
log.info("trainRatio: '{}'", trainRatio);
log.info("outputModelPath: '{}'", outputModelPath);
Dataset<Row> inputFeaturesDS = spark.read().load(workingPath + "/countVectorized");
Map<Integer, Tuple2<LDAModel, Double>> ldaModels =
FeatureTransformer.ldaTuning(inputFeaturesDS, trainRatio, numTopics, maxIterations);
double bestPerplexity = Double.MAX_VALUE;
LDAModel bestModel = null;
List<String> stats = new ArrayList<>();
stats.add("k,perplexity,path");
for(Integer k: ldaModels.keySet()) {
//save LDAModel
ldaModels.get(k)._1().write().overwrite().save(workingPath + "/lda_model_k" + k);
//prepare line
stats.add(k + "," + ldaModels.get(k)._2() + "," + workingPath + "/lda_model_k" + k);
//pick the best model
bestModel = (ldaModels.get(k)._2() <= bestPerplexity)? ldaModels.get(k)._1() : bestModel;
bestPerplexity = Math.min(ldaModels.get(k)._2(), bestPerplexity);
}
bestModel.write().overwrite().save(outputModelPath);
Utilities.writeLinesToHDFSFile(stats, workingPath + "/perplexity_stats.csv");
}
}
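
A minimal sketch of the perplexity-based selection that FeatureTransformer.ldaTuning presumably performs: split the vectorized data by trainRatio, fit one LDA model per value of k, and keep the model with the lowest perplexity on the held-out split. The "features" column name and the use of logPerplexity are assumptions.

import org.apache.spark.ml.clustering.LDA;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class LdaTuningSketch {
    // returns the model with the lowest perplexity on the held-out split
    static LDAModel tune(Dataset<Row> features, double trainRatio, int[] numTopics, int maxIterations) {
        Dataset<Row>[] splits = features.randomSplit(new double[]{trainRatio, 1 - trainRatio});
        Dataset<Row> train = splits[0];
        Dataset<Row> test = splits[1];

        LDAModel best = null;
        double bestPerplexity = Double.MAX_VALUE;
        for (int k : numTopics) {
            LDAModel model = new LDA()
                    .setK(k)
                    .setMaxIter(maxIterations)
                    .setFeaturesCol("features")        // assumed column name
                    .fit(train);
            double perplexity = model.logPerplexity(test); // lower is better
            if (perplexity < bestPerplexity) {
                bestPerplexity = perplexity;
                best = model;
            }
        }
        return best;
    }
}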

View File

@ -1,2 +1,69 @@
package eu.dnetlib.jobs;public class SparkTokenizer {
package eu.dnetlib.jobs;
import eu.dnetlib.featureextraction.FeatureTransformer;
import eu.dnetlib.featureextraction.util.Utilities;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Optional;
public class SparkTokenizer extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(SparkTokenizer.class);
public SparkTokenizer(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
readResource("/jobs/parameters/tokenizer_parameters.json", SparkTokenizer.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkTokenizer(
parser,
getSparkSession(conf)
).run();
}
@Override
public void run() throws IOException {
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String inputFieldJPath = parser.get("inputFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("inputField: '{}'", inputFieldJPath);
log.info("numPartitions: '{}'", numPartitions);
JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext());
SQLContext sqlContext = SQLContext.getOrCreate(spark.sparkContext());
Dataset<Row> inputDS = Utilities.prepareDataset(sqlContext, context.textFile(entitiesPath).repartition(numPartitions), inputFieldJPath);
Dataset<Row> tokensDS = FeatureTransformer.tokenizeData(inputDS);
tokensDS
.write()
.mode(SaveMode.Overwrite)
.save(workingPath + "/tokens");
}
}
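
A minimal sketch of the tokenization step in plain Spark ML terms; FeatureTransformer.tokenizeData may use a different tokenizer or column names (both "sentence" and "tokens" are assumed below).

import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class TokenizeSketch {
    // splits the text field extracted by Utilities.prepareDataset into lowercase word tokens
    static Dataset<Row> tokenize(Dataset<Row> input) {
        return new RegexTokenizer()
                .setInputCol("sentence")   // assumed name of the column produced by Utilities.prepareDataset
                .setOutputCol("tokens")
                .setPattern("\\W+")        // split on runs of non-word characters
                .setToLowercase(true)
                .transform(input);
    }
}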

View File

@ -68,7 +68,7 @@ public class ArgumentApplicationParser implements Serializable {
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
gzip.close();
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
return Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws Exception {

View File

@ -0,0 +1,436 @@
hypothesis
experiment
control
model
graph
precision
accuracy
assumption
procedure
observation
inference
method
framework
data
prediction
quantitative
qualitative
bias
constant
variable
chart
trend
empirical
evidence
cell
chromosome
cellulose
chloroplast
cytoplasm
diffusion
lysosome
meiosis
membrane
mitochondrion
mitosis
nucleolus
nucleus
organelle
osmosis
permeable
photosynthesis
respiration
ribosome
vacuole
amphibian
arthropod
bacteria
cold-blooded
domain
eukaryote
family
fungus
genus
invertebrate
kingdom
mammal
order
phylum
prokaryote
reptile
vertebrate
virus
blood
warm-blooded
annual
bulb
chlorophyll
germinate
germination
leaf
perennial
phloem
phototropism
pollen
pollinate
root
seed
stamen
stoma
transpiration
xylem
circulation
digestion
digestive
endocrine
excretion
homeostasis
hormone
immune
immunize
infection
lymphatic
metabolism
nervous
nutrition
pathogen
reproduction
respiratory
vaccination
aorta
artery
brain
capillary
cardiac
cartilage
cerebellum
cerebrum
cranium
epidermis
esophagus
femur
gland
hemoglobin
involuntary
joint
ligament
muscle
nerve
neuron
organ
plasma
platelet
skeleton
sternum
synapse
tendon
tissue
vein
ventricle
vertebra
voluntary
autotrophic
biome
camouflage
carnivore
commensalism
community
competition
consumer
decomposer
habitat
herbivore
heterotrophic
host
interdependent
migration
mutualism
niche
nutrient
omnivore
organism
parasite
parasitism
population
predator
prey
producer
scavenger
succession
symbiosis
adaptation
allele
clone
dominant
extinction
gene
genome
genotype
heredity
heterozygous
homologous
homozygous
hybrid
inherit
mutation
natural
selection
offspring
phenotype
probability
recessive
species
trait
variation
altitude
core
crust
deposition
elevation
epoch
equator
era
erosion
fossil
geology
hydrosphere
igneous
lithosphere
mantle
metamorphic
paleontology
petrifaction
prehistoric
sedimentary
sedimentation
stratum
tide
aftershock
canyon
continent
continental
drift
desert
dormant
earthquake
epicenter
eruption
fault
geyser
glacier
iceberg
lava
magma
molten
plate
tectonics
plateau
ridge
rift
savanna
seismic
seismograph
subduction
tundra
volcano
watershed
barometer
blizzard
climate
change
condensation
convection
current
cyclone
desertification
drought
evaporation
front
humidity
hurricane
meteorology
monsoon
precipitation
pressure
sleet
temperature
thermometer
tornado
tropical
tsunami
weather
aquifer
biodegradable
biodiversity
biomass
biosphere
conservation
decay
deforestation
depletion
ecology
extraction
fission
fuel
fracking
geothermal
global
warming
irrigation
landfill
mineral
resource
ozone
pesticide
petroleum
pollutant
pollution
reclamation
recycle
renewable
reservoir
salinity
sustainable
turbine
apogee
asteroid
astronomy
atmosphere
axis
constellation
comet
corona
eclipse
elliptical
galaxy
luminosity
lunar
meteor
meteorite
nadir
nebula
observatory
orbit
perigee
pulsar
quasar
solar
stellar
supernova
vacuum
wane
zenith
alloy
anion
atom
bond
cation
compound
density
ductile
electron
element
gas
ion
isotope
liquid
malleable
mass
metal
metalloid
molecule
neutron
nonmetal
polar
proton
solid
substance
volume
acid
base
catalyst
concentration
dissolve
enzyme
oxidation
precipitate
reactant
reaction
saturate
solubility
solute
solution
solvent
substrate
synthesis
conduction
endothermic
energy
entropy
equilibrium
exothermic
heat
insulation
matter
nuclear
thermal
acceleration
axle
centripetal
deceleration
force
friction
fulcrum
gravity
inclined
inertia
kinetic
lever
machine
momentum
motion
potential
power
pulley
screw
speed
tensile
torque
velocity
wedge
amplitude
circuit
compression
crest
diffraction
emission
frequency
magnet
medium
particle
period
pole
radiation
rarefaction
reflection
refraction
spectrum
trough
ultraviolet
wavelength
x-ray
ray
biology
taxonomy
plant
ecosystem
genetic
evolution
geologic
feature
environment
space
physic
wave
electricity
magnetism

View File

@ -0,0 +1,20 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "location of the working directory",
"paramRequired": true
},
{
"paramName": "v",
"paramLongName": "vocabularyPath",
"paramDescription": "location to store the vocabulary",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -0,0 +1,44 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "location of the working directory",
"paramRequired": true
},
{
"paramName": "v",
"paramLongName": "vocabularyPath",
"paramDescription": "location to store the vocabulary",
"paramRequired": true
},
{
"paramName": "vt",
"paramLongName": "vocabularyType",
"paramDescription": "type of vocabulary: it could ben 'tokens' if generated with tokens or 'file' if generated from file of words",
"paramRequired": true
},
{
"paramName": "md",
"paramLongName": "minDF",
"paramDescription": "specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer greater than or equal to 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents",
"paramRequired": false
},
{
"paramName": "mt",
"paramLongName": "minTF",
"paramDescription": "filter to ignore rare words in a document. For each document, terms with frequency/count less than the given threshold are ignored. If this is an integer greater than or equal to 1, then this specifies a count (of times the term must appear in the document); if this is a double in [0,1), then this specifies a fraction (out of the document's token count)",
"paramRequired": false
},
{
"paramName": "s",
"paramLongName": "vocabSize",
"paramDescription": "size of the vocabulary",
"paramRequired": false
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -0,0 +1,38 @@
[
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "tr",
"paramLongName": "trainRatio",
"paramDescription": "dataset percentage to be used as training set, the remaining part is the test set",
"paramRequired": true
},
{
"paramName": "nt",
"paramLongName": "numTopics",
"paramDescription": "comma separated number of topics to tune the model",
"paramRequired": true
},
{
"paramName": "mi",
"paramLongName": "maxIterations",
"paramDescription": "maximum number of iteration of the algorithm",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputModelPath",
"paramDescription": "best model in terms of perplexity",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "i",
"paramLongName": "entitiesPath",
"paramDescription": "the input data: entities that should be tokenized",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "inputFieldJPath",
"paramDescription": "the jpath of the field to be tokenized",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
}
]

View File

@ -1,4 +1,4 @@
<workflow-app name="Deduplication WF" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="LDA Tuning WF" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>entitiesPath</name>
@ -13,8 +13,32 @@
<description>number of partitions for the spark files</description>
</property>
<property>
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
<name>inputFieldJPath</name>
<description>json path of the input field in the entities</description>
</property>
<property>
<name>vocabularyPath</name>
<description>location of the vocabulary</description>
</property>
<property>
<name>vocabularyType</name>
<description>type of the vocabulary: file or tokens</description>
</property>
<property>
<name>trainRatio</name>
<description>fraction of the data to be used as the training set</description>
</property>
<property>
<name>numTopics</name>
<description>numbers of topics with which to test the LDA model</description>
</property>
<property>
<name>maxIterations</name>
<description>maximum number of iterations of the LDA algorithm</description>
</property>
<property>
<name>outputModelPath</name>
<description>location of the best LDA model</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -81,22 +105,22 @@
<fs>
<delete path="${workingPath}"/>
</fs>
<ok to="CreateSimRels"/>
<ok to="TokenizeData"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<action name="TokenizeData">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.jobs.SparkCreateSimRels</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<name>Tokenize Data</name>
<class>eu.dnetlib.jobs.SparkTokenizer</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=32
--executor-memory=12G
--executor-cores=4
--driver-memory=4G
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -105,27 +129,26 @@
--conf spark.dynamicAllocation.enabled=false
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--inputFieldJPath</arg><arg>${inputFieldJPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--useTree</arg><arg>${useTree}</arg>
</spark>
<ok to="CreateMergeRels"/>
<ok to="CreateVocabulary"/>
<error to="Kill"/>
</action>
<action name="CreateMergeRels">
<action name="CreateVocabulary">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Merge Relations</name>
<class>eu.dnetlib.jobs.SparkCreateMergeRels</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<name>Create Vocabulary</name>
<class>eu.dnetlib.jobs.SparkCreateVocabulary</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=32
--executor-memory=12G
--executor-cores=4
--driver-memory=4G
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -133,22 +156,22 @@
--conf spark.sql.shuffle.partitions=3840
--conf spark.dynamicAllocation.enabled=true
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--vocabularyPath</arg><arg>${vocabularyPath}</arg>
<arg>--vocabularyType</arg><arg>${vocabularyType}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<ok to="CreateCountVectors"/>
<error to="Kill"/>
</action>
<action name="ComputeStatistics">
<action name="CreateCountVectors">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Compute Statistics</name>
<class>eu.dnetlib.jobs.SparkComputeStatistics</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<class>eu.dnetlib.jobs.SparkCountVectorizer</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
@ -159,7 +182,36 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--vocabularyPath</arg><arg>${vocabularyPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
</spark>
<ok to="LDATuning"/>
<error to="Kill"/>
</action>
<action name="LDATuning">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>LDA Tuning</name>
<class>eu.dnetlib.jobs.SparkLDATuning</class>
<jar>dnet-and-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--trainRatio</arg><arg>${trainRatio}</arg>
<arg>--numTopics</arg><arg>${numTopics}</arg>
<arg>--maxIterations</arg><arg>${maxIterations}</arg>
<arg>--outputModelPath</arg><arg>${outputModelPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
</spark>
@ -167,31 +219,5 @@
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@ -1,6 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
@ -8,6 +9,7 @@ import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
@ -15,11 +17,14 @@ import java.nio.file.Paths;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class SparkJobsTest {
public class LDAAnalysisTest {
static SparkSession spark;
static JavaSparkContext context;
final static String workingPath = "/tmp/working_dir";
final static String tokensPath = workingPath + "/tokens";
final static String vocabularyPath = workingPath + "/vocabulary";
final static String bestLDAModelPath = workingPath + "/bestLDAmodel";
final static String numPartitions = "20";
final String inputDataPath = Paths
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI())
@ -27,12 +32,11 @@ public class SparkJobsTest {
.getAbsolutePath();
final static String inputFieldJPath = "$.description[0].value";
public SparkJobsTest() throws URISyntaxException {}
public LDAAnalysisTest() throws URISyntaxException {}
public static void cleanup() throws IOException {
//remove directories and clean workspace
//TODO add directories to be removed
//FileUtils.deleteDirectory(new File(path));
FileUtils.deleteDirectory(new File(workingPath));
}
@BeforeAll
@ -48,10 +52,10 @@ public class SparkJobsTest {
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
@AfterAll
public static void finalCleanUp() throws IOException {
cleanup();
}
// @AfterAll
// public static void finalCleanUp() throws IOException {
// cleanup();
// }
@Test
@Order(1)
@ -75,6 +79,69 @@ public class SparkJobsTest {
}
@Test
@Order(2)
public void createVocabularyTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[] {
"-w", workingPath,
"-v", vocabularyPath,
"-vt", "file"
}
);
new SparkCreateVocabulary(
parser,
spark
).run();
}
@Test
@Order(3)
public void countVectorizeTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[]{
"-w", workingPath,
"-v", vocabularyPath,
"-np", numPartitions
}
);
new SparkCountVectorizer(
parser,
spark
).run();
}
@Test
@Order(4)
public void ldaTuningTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class));
parser.parseArgument(
new String[]{
"-w", workingPath,
"-np", numPartitions,
"-tr", "0.8",
"-nt", "2,3,4,5",
"-mi", "5",
"-o", bestLDAModelPath
});
new SparkLDATuning(
parser,
spark
).run();
}
public static String readResource(String path, Class<? extends AbstractSparkJob> clazz) throws IOException {
return IOUtils.toString(clazz.getResourceAsStream(path));
}

448
pom.xml
View File

@ -1,17 +1,447 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-and</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<packaging>pom</packaging>
<url>http://www.d-net.research-infrastructures.eu</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
<comments>A business-friendly OSS license</comments>
</license>
</licenses>
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/michele.debonis/dnet-and.git</developerConnection>
<tag>dnet-and-1.0.0</tag>
</scm>
<modules>
<module>dhp-build</module>
<module>dnet-feature-extraction</module>
<module>dnet-and-test</module>
</modules>
<issueManagement>
<system>Redmine</system>
<url>https://issue.openaire.research-infrastructures.eu/projects/openaire</url>
</issueManagement>
<distributionManagement>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
</repository>
</distributionManagement>
<repositories>
<repository>
<id>dnet-deps</id>
<name>dnet-dependencies</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet-deps</url>
<layout>default</layout>
</repository>
<repository>
<id>dnet45-releases</id>
<name>D-Net 45 Releases</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>dnet45-snapshots</id>
<name>D-Net 45 Snapshots</name>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>cloudera</id>
<name>Cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>ceon</id>
<name>Ceon Repository</name>
<url>https://maven.ceon.pl/artifactory/repo</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<build>
<directory>target</directory>
<outputDirectory>target/classes</outputDirectory>
<finalName>${project.artifactId}-${project.version}</finalName>
<testOutputDirectory>target/test-classes</testOutputDirectory>
<!--*************************************************-->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>attach-sources</id>
<phase>verify</phase>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
<configuration>
<detectLinks>true</detectLinks>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.13</version>
<executions>
<execution>
<id>integration-test</id>
<goals>
<goal>integration-test</goal>
</goals>
</execution>
<execution>
<id>verify</id>
<goals>
<goal>verify</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- <plugin>-->
<!-- <groupId>org.apache.maven.plugins</groupId>-->
<!-- <artifactId>maven-plugin-plugin</artifactId>-->
<!-- <version>3.7.1</version>-->
<!-- <configuration>-->
<!-- &lt;!&ndash; see http://jira.codehaus.org/browse/MNG-5346 &ndash;&gt;-->
<!-- <skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>-->
<!-- </configuration>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <id>mojo-descriptor</id>-->
<!-- <goals>-->
<!-- <goal>descriptor</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- </plugin>-->
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.5.3</version>
</plugin>
</plugins>
</build>
<pluginRepositories>
<pluginRepository>
<id>iis-releases</id>
<name>iis releases plugin repository</name>
<url>http://maven.ceon.pl/artifactory/iis-releases</url>
<layout>default</layout>
</pluginRepository>
</pluginRepositories>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<oozie.package.file.name>oozie-package</oozie.package.file.name>
<workflow.source.dir>src/test/resources/define/path/pointing/to/directory/holding/oozie_app</workflow.source.dir>
<oozieAppDir>oozie_app</oozieAppDir>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<google.gson.version>2.2.2</google.gson.version>
<google.guava.version>15.0</google.guava.version>
<spark.version>2.2.0</spark.version>
<sparknlp.version>2.5.5</sparknlp.version>
<jackson.version>2.6.5</jackson.version>
<mockito-core.version>3.3.3</mockito-core.version>
<commons.lang.version>3.5</commons.lang.version>
<commons.io.version>2.4</commons.io.version>
<commons.collections.version>3.2.1</commons.collections.version>
<commons.logging.version>1.1.3</commons.logging.version>
<junit.version>4.9</junit.version>
<scala.version>2.11.8</scala.version>
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
<!-- from dhp-workflows -->
<queueName>default</queueName>
<importerQueueName>default</importerQueueName>
<oozieLauncherQueueName>default</oozieLauncherQueueName>
<primed.dir>primed</primed.dir>
<oozie.package.dependencies.include.scope>runtime</oozie.package.dependencies.include.scope>
<oozie.package.dependencies.exclude.scope />
<oozie.package.skip.test.jar>true</oozie.package.skip.test.jar>
<dhpConnectionProperties>${user.home}/.dhp/application.properties</dhpConnectionProperties>
<output.dir.name>${maven.build.timestamp}</output.dir.name>
<projectVersion>${project.version}</projectVersion>
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
</project>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-jsonSchema</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>${mockito-core.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>${mockito-core.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${google.guava.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${google.gson.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>${commons.collections.version}</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons.logging.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.10</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.4.0</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>com.ibm.icu</groupId>-->
<!-- <artifactId>icu4j</artifactId>-->
<!-- <version>70.1</version>-->
<!-- </dependency>-->
</dependencies>
</dependencyManagement>
<profiles>
</profiles>
</project>