package eu.dnetlib.jobs;

import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
@ExtendWith(MockitoExtension.class)
|
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
|
public class LDAAnalysisTest {
|
|
|
|
static SparkSession spark;
|
|
static JavaSparkContext context;
|
|
final static String workingPath = "/tmp/working_dir";
|
|
final static String tokensPath = workingPath + "/tokens";
|
|
final static String vocabularyPath = workingPath + "/vocabulary";
|
|
final static String bestLDAModelPath = workingPath + "/bestLDAmodel";
|
|
final static String numPartitions = "20";
|
|
final String inputDataPath = Paths
|
|
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI())
|
|
.toFile()
|
|
.getAbsolutePath();
|
|
final static String inputFieldJPath = "$.description[0].value";
|
|
|
|
public LDAAnalysisTest() throws URISyntaxException {}
|
|
|
|
public static void cleanup() throws IOException {
|
|
//remove directories and clean workspace
|
|
FileUtils.deleteDirectory(new File(workingPath));
|
|
}
|
|
|
|
@BeforeAll
|
|
public void setup() throws IOException {
|
|
cleanup();
|
|
|
|
spark = SparkSession
|
|
.builder()
|
|
.appName("Testing")
|
|
.master("local[*]")
|
|
.getOrCreate();
|
|
|
|
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
}
|
|
|
|
// @AfterAll
|
|
// public static void finalCleanUp() throws IOException {
|
|
// cleanup();
|
|
// }
|
|
|
|
@Test
|
|
@Order(1)
|
|
public void tokenizerTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/tokenizer_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[] {
|
|
"-i", inputDataPath,
|
|
"-f", inputFieldJPath,
|
|
"-w", workingPath,
|
|
"-np", numPartitions
|
|
}
|
|
);
|
|
|
|
new SparkTokenizer(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
@Test
|
|
@Order(2)
|
|
public void createVocabularyTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[] {
|
|
"-w", workingPath,
|
|
"-v", vocabularyPath,
|
|
"-vt", "file"
|
|
}
|
|
);
|
|
|
|
new SparkCreateVocabulary(
|
|
parser,
|
|
spark
|
|
).run();
|
|
}
|
|
|
|
@Test
|
|
@Order(3)
|
|
public void countVectorizeTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[]{
|
|
"-w", workingPath,
|
|
"-v", vocabularyPath,
|
|
"-np", numPartitions
|
|
}
|
|
);
|
|
|
|
new SparkCountVectorizer(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
@Test
|
|
@Order(4)
|
|
public void ldaTuningTest() throws Exception {
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[]{
|
|
"-w", workingPath,
|
|
"-np", numPartitions,
|
|
"-tr", "0.8",
|
|
"-nt", "2,3,4,5",
|
|
"-mi", "5",
|
|
"-o", bestLDAModelPath
|
|
});
|
|
|
|
new SparkLDATuning(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
public static String readResource(String path, Class<? extends AbstractSparkJob> clazz) throws IOException {
|
|
return IOUtils.toString(clazz.getResourceAsStream(path));
|
|
}
|
|
|
|
}
|