From fa7930d2e2c4aeda1ee42018be065826367dc96e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Mar 2021 15:45:28 +0100 Subject: [PATCH] merging contributions from PR#97 --- .gitignore | 3 + .../common/vocabulary/VocabularyGroup.java | 24 + .../transformation/TransformationJobTest.java | 120 +- .../eu/dnetlib/dhp/transform/input_itgv4.xml | 70 ++ .../xslt_cleaning_datarepo_datacite.xsl | 432 +++++++ .../xslt_cleaning_datarepo_datacite_orig.xsl | 472 +++++++ .../scripts/xslt_cleaning_oaiOpenaire.xsl | 82 ++ ...enaire_datacite_ExchangeLandingpagePid.xsl | 791 ++++++++++++ ...e_datacite_ExchangeLandingpagePid_orig.xsl | 1081 +++++++++++++++++ .../dhp/transform/scripts/zenodo_tr.xsl | 451 +++++++ dhp-workflows/dhp-graph-mapper/pom.xml | 6 +- pom.xml | 28 +- 12 files changed, 3545 insertions(+), 15 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl diff --git a/.gitignore b/.gitignore index 2d7730711..f4fb46f2e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ *.iws *~ .vscode +.metals +.bloop .classpath /*/.classpath /*/*/.classpath @@ -24,4 
+26,5 @@ spark-warehouse /**/job-override.properties /**/*.log +/**/.factorypath diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index f81181e53..12c6279e5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -122,7 +122,31 @@ public class VocabularyGroup implements Serializable { return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); } + /** + * getSynonymAsQualifierCaseSensitive + * + * reflects the situation to check caseSensitive vocabulary + */ + public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + return vocs.get(vocId).getSynonymAsQualifier(syn); + } + + /** + * termExists + * + * two methods: without and with caseSensitive check + */ public boolean termExists(final String vocId, final String id) { + return termExists(vocId, id, Boolean.FALSE); + } + + public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) { + if (Boolean.TRUE.equals(caseSensitive)) { + return vocabularyExists(vocId) && vocs.get(vocId).termExists(id); + } return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index e29a8ac50..62a5223d9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -51,13 +51,29 @@ public class TransformationJobTest extends AbstractVocabularyTest { 
} @Test - @DisplayName("Test Transform Single XML using XSLTTransformator") + @DisplayName("Test Transform Single XML using zenodo_tr XSLTTransformator") public void testTransformSaxonHE() throws Exception { // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); + + MetadataRecord result = tr.call(mr); + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test Transform Inst.&Them.v4 record XML with zenodo_tr") + public void testTransformITGv4Zenodo() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); // We Load the XSLT transformation Rule from the classpath XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); @@ -68,6 +84,108 @@ public class TransformationJobTest extends AbstractVocabularyTest { // TODO Create significant Assert } + @Test + @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") + public void testTransformITGv4() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule( + 
"/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") + public void testTransformMostlyUsedScript() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule( + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test TransformSparkJobNode.main with oaiOpenaire_datacite (v4)") + public void transformTestITGv4OAIdatacite(@TempDir Path testDir) throws Exception { + + SparkConf conf = new SparkConf(); + conf.setAppName(TransformationJobTest.class.getSimpleName()); + conf.setMaster("local"); + + try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + + final String mdstore_input = this + .getClass() + .getResource("/eu/dnetlib/dhp/transform/mdstorenative") + .getFile(); + final String mdstore_output = testDir.toString() + "/version"; + + mockupTrasformationRule( + "simpleTRule", + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "varOfficialName", "Publications at Bielefeld University" + }, + { + "varOfficialId", "opendoar____::2294" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", 
"simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); + + // TODO introduce useful assertions + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdstore_output + MDSTORE_DATA_PATH) + .as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + } + } + @Test @DisplayName("Test TransformSparkJobNode.main") public void transformTest(@TempDir Path testDir) throws Exception { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml new file mode 100644 index 000000000..06325810b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml @@ -0,0 +1,70 @@ + + + + + od______2294::0000955eab68583ba0e07e973dd48708 + oai:pub.uni-bielefeld.de:1997560 + 2021-02-23T13:14:00.839Z + od______2294 + oai:pub.uni-bielefeld.de:1997560 + 2018-07-24T12:58:03Z + journal_article + doc-type:article + + + + Die antiken Grundlagen der europäischen Expansion. Eine epochenübergreifende kulturhistorische Unterrichtseinheit + + + Schulz, Raimund + + + + https://pub.uni-bielefeld.de/record/1997560.json + + + 0016-9056 + + ger + Friedrich + 2002 + journal article + https://pub.uni-bielefeld.de/record/1997560 + metadata only access + Schulz R. Die antiken Grundlagen der europäischen Expansion. Eine epochenübergreifende kulturhistorische Unterrichtseinheit. GWU. 2002;53(5-/6):340-360. 
+ + In Copyright + GWU + 53 + 5-/6 + + + + + + http%3A%2F%2Fpub.uni-bielefeld.de%2Foai + oai:pub.uni-bielefeld.de:1997560 + 2018-07-24T12:58:03Z + + + + + false + false + 0.9 + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl new file mode 100644 index 000000000..f815c0260 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl @@ -0,0 +1,432 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OPEN + + + + + OPEN + + + + + RESTRICTED + + + + + UNKNOWN + + + + + + + + + + + + + + + DE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl new file mode 100644 index 000000000..d8b14fadd --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl @@ -0,0 +1,472 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OPEN + + + + + OPEN + + + + + RESTRICTED + + + + + UNKNOWN + + + + + + + + + + + + + + + DE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl new file mode 100644 index 000000000..53a3466a9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl new file mode 100644 index 000000000..56451505e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl @@ -0,0 +1,791 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl new file mode 100644 index 000000000..3cfaec80b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl @@ -0,0 +1,1081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + record is not compliant, transformation is interrupted. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl new file mode 100644 index 000000000..0c3f4b1f9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl @@ -0,0 +1,451 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OPEN + + + + + CLOSED + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 5e8448182..81d93f97b 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -33,6 +33,10 @@ + + -Xmax-classfile-name + 140 + ${scala.version} @@ -67,7 +71,7 @@ test - org.apache.httpcomponents + org.apache.httpcomponents httpclient diff --git a/pom.xml 
b/pom.xml index bef649c67..45bb6bf78 100644 --- a/pom.xml +++ b/pom.xml @@ -362,7 +362,7 @@ ${dnet.openaire.broker.common} - + org.apache.cxf cxf-rt-transports-http 3.1.5 @@ -406,20 +406,20 @@ 4.0 - - com.ximpleware - vtd-xml - ${vtd.version} - + + com.ximpleware + vtd-xml + ${vtd.version} + - - org.elasticsearch - elasticsearch-hadoop - 7.6.0 - + + org.elasticsearch + elasticsearch-hadoop + 7.6.0 + - + org.apache.oozie oozie-client ${dhp.oozie.version} @@ -685,6 +685,8 @@ UTF-8 UTF-8 3.6.0 + 1.8 + 1.8 2.22.2 2.0.1 cdh5.9.2 @@ -711,4 +713,4 @@ 4.5.3 4.0.1 - + \ No newline at end of file