2020-04-27 14:52:31 +02:00
|
|
|
|
2019-04-11 15:39:29 +02:00
|
|
|
package eu.dnetlib.dhp.transformation;
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.lenient;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import javax.xml.transform.stream.StreamSource;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.collection.CollectionJobTest;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
2020-04-28 11:23:29 +02:00
|
|
|
|
2020-03-25 17:59:45 +01:00
|
|
|
@ExtendWith(MockitoExtension.class)
|
2019-04-11 15:39:29 +02:00
|
|
|
public class TransformationJobTest {
|
2019-11-04 17:41:01 +01:00
|
|
|
|
2020-05-05 12:39:04 +02:00
|
|
|
private static SparkSession spark;
|
|
|
|
|
2021-01-27 15:43:08 +01:00
|
|
|
@Mock
|
|
|
|
private ISLookUpService isLookUpService;
|
|
|
|
|
|
|
|
private VocabularyGroup vocabularies;
|
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() throws ISLookUpException, IOException {
|
|
|
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
|
|
|
|
|
|
|
lenient()
|
2021-01-28 09:51:17 +01:00
|
|
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
|
|
|
.thenReturn(synonyms());
|
2021-01-27 15:43:08 +01:00
|
|
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
|
|
|
}
|
|
|
|
|
2020-05-05 12:39:04 +02:00
|
|
|
@BeforeAll
|
|
|
|
public static void beforeAll() {
|
|
|
|
SparkConf conf = new SparkConf();
|
|
|
|
conf.setAppName(CollectionJobTest.class.getSimpleName());
|
|
|
|
conf.setMaster("local");
|
|
|
|
spark = SparkSession.builder().config(conf).getOrCreate();
|
|
|
|
}
|
|
|
|
|
|
|
|
@AfterAll
|
|
|
|
public static void afterAll() {
|
|
|
|
spark.stop();
|
|
|
|
}
|
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
@Test
|
2021-01-27 15:43:08 +01:00
|
|
|
@DisplayName("Test Transform Single XML using XSLTTransformator")
|
2020-04-27 14:52:31 +02:00
|
|
|
public void testTransformSaxonHE() throws Exception {
|
|
|
|
|
2021-01-27 15:43:08 +01:00
|
|
|
// We Set the input Record getting the XML from the classpath
|
|
|
|
final MetadataRecord mr = new MetadataRecord();
|
|
|
|
mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
// We Load the XSLT transformation Rule from the classpath
|
2021-01-27 15:43:08 +01:00
|
|
|
XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/ext_simple.xsl");
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
// Print the record
|
2021-01-27 15:43:08 +01:00
|
|
|
System.out.println(tr.call(mr).getBody());
|
2021-01-28 09:51:17 +01:00
|
|
|
// TODO Create significant Assert
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@DisplayName("Test TransformSparkJobNode.main")
|
|
|
|
@Test
|
|
|
|
public void transformTest(@TempDir Path testDir) throws Exception {
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
|
|
|
|
final String mdstore_output = testDir.toString() + "/version";
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl");
|
2021-01-27 15:43:08 +01:00
|
|
|
|
|
|
|
// final String arguments = "-issm true -i %s -o %s -d 1 -w 1 -tp XSLT_TRANSFORM -tr simpleTRule";
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
final Map<String, String> parameters = Stream.of(new String[][] {
|
|
|
|
{
|
|
|
|
"dateOfTransformation", "1234"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"transformationPlugin", "XSLT_TRANSFORM"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"transformationRuleTitle", "simpleTRule"
|
|
|
|
},
|
2021-01-27 15:43:08 +01:00
|
|
|
|
|
|
|
}).collect(Collectors.toMap(data -> data[0], data -> data[1]));
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output);
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
// TODO introduce useful assertions
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
|
|
|
|
final Dataset<MetadataRecord> mOutput = spark.read().format("parquet").load(mdstore_output).as(encoder);
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
final Long total = mOutput.count();
|
|
|
|
|
|
|
|
final long recordTs = mOutput
|
|
|
|
.filter((FilterFunction<MetadataRecord>) p -> p.getDateOfTransformation() == 1234)
|
|
|
|
.count();
|
|
|
|
|
|
|
|
final long recordNotEmpty = mOutput
|
|
|
|
.filter((FilterFunction<MetadataRecord>) p -> !StringUtils.isBlank(p.getBody()))
|
|
|
|
.count();
|
|
|
|
|
|
|
|
assertEquals(total, recordTs);
|
|
|
|
|
|
|
|
assertEquals(total, recordNotEmpty);
|
2020-05-05 12:39:04 +02:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void tryLoadFolderOnCP() throws Exception {
|
|
|
|
final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
|
|
|
|
System.out.println("path = " + path);
|
|
|
|
|
|
|
|
Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output");
|
|
|
|
|
|
|
|
System.out.println(tempDirWithPrefix.toFile().getAbsolutePath());
|
|
|
|
|
|
|
|
Files.deleteIfExists(tempDirWithPrefix);
|
|
|
|
}
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
private void mockupTrasformationRule(final String trule, final String path) throws Exception {
|
2021-01-27 15:43:08 +01:00
|
|
|
final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
|
2020-04-27 14:52:31 +02:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
lenient()
|
|
|
|
.when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule)))
|
|
|
|
.thenReturn(Collections.singletonList(trValue));
|
2021-01-27 15:43:08 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
|
|
|
|
final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
|
|
|
|
final LongAccumulator la = new LongAccumulator();
|
2021-01-28 09:51:17 +01:00
|
|
|
return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies);
|
2021-01-27 15:43:08 +01:00
|
|
|
}
|
2020-04-27 14:52:31 +02:00
|
|
|
|
2021-01-27 15:43:08 +01:00
|
|
|
private List<String> vocs() throws IOException {
|
|
|
|
return IOUtils
|
2021-01-28 09:51:17 +01:00
|
|
|
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"));
|
2021-01-27 15:43:08 +01:00
|
|
|
}
|
2020-04-27 14:52:31 +02:00
|
|
|
|
2021-01-27 15:43:08 +01:00
|
|
|
private List<String> synonyms() throws IOException {
|
|
|
|
return IOUtils
|
2021-01-28 09:51:17 +01:00
|
|
|
.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt"));
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-04-11 15:39:29 +02:00
|
|
|
}
|