package eu.dnetlib.jobs.featureextraction;

import eu.dnetlib.jobs.AbstractSparkJob;
import eu.dnetlib.jobs.SparkTokenizer;
import eu.dnetlib.support.ArgumentApplicationParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;

import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
|
public class FeatureExtractionJobTest {
|
|
static SparkSession spark;
|
|
static JavaSparkContext context;
|
|
final static String workingPath = "/tmp/working_dir";
|
|
|
|
final String inputDataPath = Paths
|
|
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications.subset.json").toURI())
|
|
.toFile()
|
|
.getAbsolutePath();
|
|
|
|
final String ldaTopicsPath = Paths
|
|
.get(getClass().getResource("/eu/dnetlib/jobs/examples/publications_lda_topics_subset").toURI())
|
|
.toFile()
|
|
.getAbsolutePath();
|
|
|
|
public FeatureExtractionJobTest() throws URISyntaxException {}
|
|
|
|
public static void cleanup() throws IOException {
|
|
//remove directories and clean workspace
|
|
FileUtils.deleteDirectory(new File(workingPath));
|
|
}
|
|
|
|
@BeforeAll
|
|
public void setup() throws IOException {
|
|
cleanup();
|
|
|
|
spark = SparkSession
|
|
.builder()
|
|
.appName("Testing")
|
|
.master("local[*]")
|
|
.getOrCreate();
|
|
|
|
context = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
}
|
|
|
|
@AfterAll
|
|
public static void finalCleanUp() throws IOException {
|
|
cleanup();
|
|
}
|
|
|
|
@Test
|
|
@Order(1)
|
|
public void publicationFeatureExtractionTest() throws Exception {
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/publicationFeatureExtractor_parameters.json", SparkTokenizer.class));
|
|
|
|
parser.parseArgument(
|
|
new String[] {
|
|
"-p", inputDataPath,
|
|
"-w", workingPath,
|
|
"-np", "20"
|
|
}
|
|
);
|
|
|
|
new SparkPublicationFeatureExtractor(
|
|
parser,
|
|
spark
|
|
).run();
|
|
}
|
|
|
|
@Test
|
|
@Order(2)
|
|
public void authorExtractionTest() throws Exception {
|
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/authorExtractor_parameters.json", SparkAuthorExtractor.class));
|
|
|
|
parser.parseArgument(
|
|
new String[]{
|
|
"-p", inputDataPath,
|
|
"-w", workingPath,
|
|
"-np", "20",
|
|
"-t", ldaTopicsPath,
|
|
"-f", workingPath + "/publication_features",
|
|
"-o", workingPath + "/authors"
|
|
});
|
|
|
|
new SparkAuthorExtractor(
|
|
parser,
|
|
spark
|
|
).run();
|
|
|
|
}
|
|
|
|
public static String readResource(String path, Class<? extends AbstractSparkJob> clazz) throws IOException {
|
|
return IOUtils.toString(clazz.getResourceAsStream(path));
|
|
}
|
|
}
|