package eu.dnetlib.jobs;

import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
@ExtendWith(MockitoExtension.class)
|
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
|
public class LDAAnalysisTest {
|
|
|
|
static SparkSession spark;
|
|
static JavaSparkContext context;
|
|
final static String workingPath = "/tmp/working_dir";
|
|
final static String tokensPath = workingPath + "/tokens";
|
|
final static String vocabularyPath = workingPath + "/vocabulary";
|
|
final static String bestLDAModelPath = workingPath + "/bestLDAmodel";
|
|
final static String numPartitions = "20";
|
|
final String inputDataPath = Paths
|
|
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI())
|
|
.toFile()
|
|
.getAbsolutePath();
|
|
final static String inputFieldJPath = "$.description[0].value";
|
|
|
|
public LDAAnalysisTest() throws URISyntaxException {}
|
|
|
|
public static void cleanup() throws IOException {
|
|
//remove directories and clean workspace
|
|
FileUtils.deleteDirectory(new File(workingPath));
|
|
}
|
|
|
|
@BeforeAll
|
|
public void setup() throws IOException {
|
|
cleanup();
|
|
|
|
spark = SparkSession
|
|
.builder()
|
|
.appName("Testing")
|
|
.master("local[*]")
|
|
.getOrCreate();
|
|
|
|
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
}
|
|
|
|
// @AfterAll
|
|
// public static void finalCleanUp() throws IOException {
|
|
// cleanup();
|
|
// }
|
|
|
|
@Test
|
|
@Order(1)
|
|
public void tokenizerTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/tokenizer_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[] {
|
|
"-i", inputDataPath,
|
|
"-f", inputFieldJPath,
|
|
"-w", workingPath,
|
|
"-np", numPartitions
|
|
}
|
|
);
|
|
|
|
new SparkTokenizer(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
@Test
|
|
@Order(2)
|
|
public void createVocabularyTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createVocabulary_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[] {
|
|
"-w", workingPath,
|
|
"-v", vocabularyPath,
|
|
"-vt", "file"
|
|
}
|
|
);
|
|
|
|
new SparkCreateVocabulary(
|
|
parser,
|
|
spark
|
|
).run();
|
|
}
|
|
|
|
@Test
|
|
@Order(3)
|
|
public void countVectorizeTest() throws Exception {
|
|
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/countVectorizer_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[]{
|
|
"-w", workingPath,
|
|
"-v", vocabularyPath,
|
|
"-np", numPartitions
|
|
}
|
|
);
|
|
|
|
new SparkCountVectorizer(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
@Test
|
|
@Order(4)
|
|
public void ldaTuningTest() throws Exception {
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/ldaTuning_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[]{
|
|
"-w", workingPath,
|
|
"-np", numPartitions,
|
|
"-tr", "0.8",
|
|
"-nt", "2,3,4,5",
|
|
"-mi", "5",
|
|
"-o", bestLDAModelPath
|
|
});
|
|
|
|
new SparkLDATuning(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
public static String readResource(String path, Class<? extends AbstractSparkJob> clazz) throws IOException {
|
|
return IOUtils.toString(clazz.getResourceAsStream(path));
|
|
}
|
|
|
|
}
|