From d791840b8202a587ca5e7193e18d1ec9de576ebc Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 16 Dec 2022 13:18:29 +0100 Subject: [PATCH 1/7] [Clean Country] added test to verify remove of country: --- .../dhp/oa/graph/clean/CleanCountryTest.java | 39 +++++++++++++++++++ .../oa/graph/clean/dataset_clean_country.json | 0 2 files changed, 39 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java index c9f846570..c964a3e41 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import eu.dnetlib.dhp.schema.oaf.Dataset; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -147,4 +148,42 @@ public class CleanCountryTest { .size()); } + @Test + public void testDatasetClean() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json") + .getPath(); + + spark + .read() + .textFile(sourcePath) + .map( + (MapFunction) r -> OBJECT_MAPPER.readValue(r, Dataset.class), + Encoders.bean(Dataset.class)) + .write() + .json(workingDir.toString() + "/dataset"); + + CleanCountrySparkJob.main(new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--inputPath", workingDir.toString() + "/dataset", + "-graphTableClassName", Dataset.class.getCanonicalName(), + "-workingPath", workingDir.toString() + "/working", + "-country", "NL", + "-verifyParam", "10.17632", + "-collectedfrom", 
"NARCIS", + "-hostedBy", getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy") + .getPath() + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(1, tmp.count()); + + + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json new file mode 100644 index 000000000..e69de29bb From 8685eaa7064809e0ce13466b86973e76df353c49 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 16 Dec 2022 15:31:25 +0100 Subject: [PATCH 2/7] [Clean Country] added test to verify remove of country --- .../java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java | 4 +++- .../eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java index 651047b83..de9e4fc90 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanCountryTest.java @@ -167,7 +167,7 @@ public class CleanCountryTest { "--isSparkSessionManaged", Boolean.FALSE.toString(), "--inputPath", workingDir.toString() + "/dataset", "-graphTableClassName", Dataset.class.getCanonicalName(), - "-workingPath", workingDir.toString() + "/working", + "-workingDir", workingDir.toString() + "/working", "-country", "NL", "-verifyParam", "10.17632", "-collectedfrom", "NARCIS", @@ -183,6 +183,8 @@ public class CleanCountryTest { Assertions.assertEquals(1, tmp.count()); 
+ Assertions.assertEquals(0, tmp.first().getCountry().size()); + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json index e69de29bb..f5c1fc334 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json @@ -0,0 +1 @@ +{"geolocation": [], "dataInfo": {"provenanceaction": {"classid": "sysimport:dedup", "classname": "sysimport:dedup", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "dedup-result-decisiontree-v3", "invisible": false, "trust": "0.8"}, "resourcetype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:dataCite_resource", "schemename": "dnet:dataCite_resource"}, "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.2"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.3"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": 
"sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.1"}], "contributor": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "somok bhowmik"}], "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "issued", "classname": "issued", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2022-08-16"}, {"qualifier": {"classid": "available", "classname": "available", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2023-08-23"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "UNKNOWN", "classname": "UNKNOWN", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": 
"2022-08-16"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "available", "classname": "available", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2022-08-16"}], "collectedfrom": [{"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, {"key": "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "value": "DANS (Data Archiving and Networked Services)"}, {"key": "10|eurocrisdris::fe4903425d9040f680d8610d9079ea14", "value": "NARCIS"}, {"key": "10|re3data_____::84e123776089ce3c7a33db98d9cd15a8", "value": "EASY"}], "id": "50|doi_dedup___::e04c8cbefb6f0b8378a04c57e6edbd82", "subject": [{"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Interdisciplinary sciences"}], "lastupdatetimestamp": 1670362508719, "author": [{"surname": "Bhowmik", "name": "S.", "pid": [], "rank": 1, "affiliation": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "via Mendeley Data"}], "fullname": "bhowmik, S"}], "instance": [{"refereed": {"classid": "0000", "classname": "Unknown", "schemeid": "dnet:review_levels", "schemename": 
"dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::db814dc656a911b556dba42a331cebe9", "value": "Mendeley Data"}, "license": {"value": "https://creativecommons.org/licenses/by/4.0/legalcode"}, "url": ["https://dx.doi.org/10.17632/v6cgs4jpbk.2"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.2"}], "dateofacceptance": {"value": "2022-08-16"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "Unknown", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::db814dc656a911b556dba42a331cebe9", "value": "Mendeley Data"}, "url": ["https://dx.doi.org/10.17632/v6cgs4jpbk.3"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.3"}], "dateofacceptance": {"value": "2023-01-01"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, 
"accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "Unknown", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::db814dc656a911b556dba42a331cebe9", "value": "Mendeley Data"}, "url": ["https://dx.doi.org/10.17632/v6cgs4jpbk"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk"}], "dateofacceptance": {"value": "2023-01-01"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "Unknown", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::db814dc656a911b556dba42a331cebe9", "value": "Mendeley Data"}, "url": ["https://dx.doi.org/10.17632/v6cgs4jpbk.1"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": 
{"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.1"}], "dateofacceptance": {"value": "2023-01-01"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "value": "DANS (Data Archiving and Networked Services)"}, "url": ["https://doi.org/10.17632/v6cgs4jpbk.2"], "pid": [], "alternateIdentifier": [{"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "urn", "classname": "urn", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "urn:nbn:nl:ui:13-q1-3218"}, {"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.2"}], "collectedfrom": {"key": "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "value": "DANS (Data Archiving and Networked Services)"}, 
"accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|eurocrisdris::fe4903425d9040f680d8610d9079ea14", "value": "NARCIS"}, "url": ["https://doi.org/10.17632/v6cgs4jpbk.2"], "pid": [], "alternateIdentifier": [{"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "urn", "classname": "urn", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "urn:nbn:nl:ui:13-q1-3218"}, {"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.2"}], "collectedfrom": {"key": "10|eurocrisdris::fe4903425d9040f680d8610d9079ea14", "value": "NARCIS"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}, {"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemeid": "dnet:review_levels", "schemename": 
"dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::84e123776089ce3c7a33db98d9cd15a8", "value": "EASY"}, "license": {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "http://creativecommons.org/licenses/by/4.0"}, "url": ["https://doi.org/10.17632/v6cgs4jpbk.2"], "pid": [], "alternateIdentifier": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "urn", "classname": "urn", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "urn:nbn:nl:ui:13-q1-3218"}, {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.17632/v6cgs4jpbk.2"}], "dateofacceptance": {"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "2022-01-01"}, "collectedfrom": {"key": "10|re3data_____::84e123776089ce3c7a33db98d9cd15a8", "value": "EASY"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": 
"Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}], "dateofcollection": "2022-08-17T20:06:53+0000", "fulltext": [], "dateoftransformation": "2022-08-17T20:06:53+0000", "description": [{"dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "Unprocessed images for: Acinetobacter baumannii defends against oxidative stress through a Mn2+-dependent small RNA-mediated repression of type VI secretion system"}], "format": [], "measures": [{"id": "influence", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "score", "value": "4.842839E-9"}, {"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "class", "value": "C"}]}, {"id": "popularity", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "score", "value": "7.705171E-9"}, {"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, 
"inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "class", "value": "C"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "score", "value": "0"}, {"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "class", "value": "C"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "score", "value": "0.0"}, {"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "class", "value": "C"}]}, {"id": "impulse", "unit": [{"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "score", "value": "0"}, {"dataInfo": {"provenanceaction": {"classid": "measure:bip", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": 
"dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "key": "class", "value": "C"}]}], "coverage": [], "externalReference": [], "publisher": {"value": "Mendeley"}, "context": [], "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemeid": "dnet:languages", "schemename": "dnet:languages"}, "resulttype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:result_typologies", "schemename": "dnet:result_typologies"}, "country": [{"classid": "NL", "classname": "Netherlands", "dataInfo": {"provenanceaction": {"classid": "country:instrepos", "classname": "Propagation of country to result collected from datasources of type institutional repositories", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "propagation", "invisible": false, "trust": "0.85"}, "schemeid": "dnet:countries", "schemename": "dnet:countries"}], "extraInfo": [], "originalId": ["50|datacite____::e04c8cbefb6f0b8378a04c57e6edbd82", "10.17632/v6cgs4jpbk.2", "50|datacite____::1544eab177367edbc4d9d56517d482df", "10.17632/v6cgs4jpbk.3", "50|datacite____::f02e8a3c923d8e120e1cdc5d2dcda3ff", "10.17632/v6cgs4jpbk", "50|datacite____::fce3b034f047504961bc4baab3515295", "10.17632/v6cgs4jpbk.1", "50|DansKnawCris::3b86948c475d0efbde049b72579feb50", "oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:254617", "50|dris___00893::3b86948c475d0efbde049b72579feb50", "oai:easy.dans.knaw.nl:easy-dataset:254617", "50|r384e1237760::673150ea2ae00b606fd8c39897dfa3d7"], "source": [], "dateofacceptance": {"value": "2022-08-16"}, "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"}, "value": "Unprocessed images for: Acinetobacter baumannii defends against oxidative stress through 
a Mn2+-dependent small RNA-mediated repression of type VI secretion system"}]} \ No newline at end of file From f37113a9419114d6481689a592341cef380c4d34 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 16 Dec 2022 15:32:26 +0100 Subject: [PATCH 3/7] [BulkTag] moving xquery to get community configuration in dedicated file --- .../community/QueryInformationSystem.java | 75 +++---------------- .../resources/eu/dnetlib/dhp/bulktag/query.xq | 58 ++++++++++++++ 2 files changed, 68 insertions(+), 65 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java index 678b270a0..5fe3cf81f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java @@ -1,8 +1,10 @@ package eu.dnetlib.dhp.bulktag.community; +import java.io.IOException; import java.util.List; +import org.apache.commons.io.IOUtils; import org.dom4j.DocumentException; import org.xml.sax.SAXException; @@ -13,74 +15,17 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class QueryInformationSystem { - private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " - + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() " - + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept " - + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept " - + " let $communities := 
$x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept " - + " let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text() " - + " let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text() " - + - "let $zenodo := $x//param[./@name='zenodoCommunity']/text() " - + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden' " - + " return " - + " " - + " { $x//CONFIGURATION/context/@id} " - + " " + - "{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }" + - "" - + " " - + " {for $y in tokenize($subj,',') " - + " return " - + " {$y}} " - + " {for $y in tokenize($fos,',') " - + " return " - + " {$y}} " - + " {for $y in tokenize($sdg,',') " - + " return " - + " {$y}} " - + " " - + " " - + " {for $d in $datasources " - + " where $d/param[./@name='enabled']/text()='true' " - + " return " - + " " - + " " - + " {$d//param[./@name='openaireId']/text()} " - + " " - + " " - + " {$d/param[./@name='selcriteria']/text()} " - + " " - + " } " - + " " + - " " + - "{for $zc in $zenodo " + - "return " + - " " + - " " + - "{$zc} " + - " " + - "}" - + " {for $zc in $communities " - + " return " - + " " - + " " - + " {$zc/param[./@name='zenodoid']/text()} " - + " " - + " " - + " {$zc/param[./@name='selcriteria']/text()} " - + " " - + " } " - + " " - + "" - + "{$x//CONFIGURATION/context/param[./@name='advancedConstraint']/text()} " - + "" - + " "; public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) - throws ISLookUpException, DocumentException, SAXException { + throws ISLookUpException, DocumentException, SAXException, IOException { ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - final List res = isLookUp.quickSearchProfile(XQUERY); + final List res = isLookUp + .quickSearchProfile( + IOUtils + .toString( + QueryInformationSystem.class + .getResourceAsStream( + 
"/eu/dnetlib/dhp/bulktag/query.xq"))); final String xmlConf = "" + Joiner.on(" ").join(res) + ""; diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq index e69de29bb..6fbd74c8f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq @@ -0,0 +1,58 @@ +for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') +let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() +let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept +let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept +let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept +let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text() +let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text() +let $zenodo := $x//param[./@name='zenodoCommunity']/text() +where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden' +return + +{ $x//CONFIGURATION/context/@id} + +{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() } + + + {for $y in tokenize($subj,',') + return + {$y}} + {for $y in tokenize($fos,',') + return + {$y}} + {for $y in tokenize($sdg,',') + return + {$y}} + + + {for $d in $datasources + where $d/param[./@name='enabled']/text()='true' + return + + + {$d//param[./@name='openaireId']/text()} + + + {$d/param[./@name='selcriteria']/text()} + + } + + +{for $zc in $zenodo +return + + +{$zc} + +} +{for $zc in $communities +return + + +{$zc/param[./@name='zenodoid']/text()} + + 
+{$zc/param[./@name='selcriteria']/text()} + +} + \ No newline at end of file From 6674cccb9478d9a4d9cef15aa239c3593f8f39f2 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 16 Dec 2022 15:33:20 +0100 Subject: [PATCH 4/7] [BulkTag] description of parameters more comprehensive for those who do not implement it --- .../eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json index a37d7d168..a8be7c32e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json @@ -38,13 +38,13 @@ { "paramName": "test", "paramLongName": "isTest", - "paramDescription": "true if the spark session is managed, false otherwise", + "paramDescription": "Parameter intended for testing purposes only. True if the run is related to a test and so the taggingConf parameter should be loaded", "paramRequired": false }, { "paramName": "tg", "paramLongName": "taggingConf", - "paramDescription": "true if the spark session is managed, false otherwise", + "paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. 
Intended to be removed", "paramRequired": false } From ecd398fe51cefa9a7f55ff4695cf4ce03c4a4138 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 20 Jan 2023 14:23:45 +0100 Subject: [PATCH 5/7] refactoring --- .../src/main/java/eu/dnetlib/dhp/PropagationConstant.java | 6 +++--- .../resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 36361a09e..2139a7a75 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -231,9 +231,9 @@ public class PropagationConstant { if (HdfsSupport.exists(inputPath, spark.sparkContext().hadoopConfiguration())) { return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } else { return spark.emptyDataset(Encoders.bean(clazz)); } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 9c1bbdf72..d8f1946bb 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -219,7 +219,7 @@ - + From b254a0375f415508e5f3541e75f39b965c80b5cd Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 26 Jan 2023 16:51:20 +0100 Subject: [PATCH 6/7] [Affiliation from institutionalrepo] changed the field to check to verify the datasource type. 
Now it is in the field jurisdiction --- .../src/main/java/eu/dnetlib/dhp/PropagationConstant.java | 2 +- .../PrepareResultInstRepoAssociation.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 2139a7a75..89bdf0982 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -44,7 +44,7 @@ public class PropagationConstant { public final static String NULL = "NULL"; - public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; + public static final String INSTITUTIONAL_REPO_TYPE = "institutional"; public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 50ab997b6..1663afb32 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -102,7 +102,7 @@ public class PrepareResultInstRepoAssociation { String query = "SELECT source datasourceId, target organizationId " + "FROM ( SELECT id " + "FROM datasource " - + "WHERE datasourcetype.classid = '" + + "WHERE lower(jurisdiction.classid) = '" + INSTITUTIONAL_REPO_TYPE + "' " + "AND datainfo.deletedbyinference = false " + blacklisted + " ) d " From e82e009b46dff8449c3a62534aecef9ce1b0241a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 31 Jan 2023 10:19:34 +0100 
Subject: [PATCH 7/7] added missing close tag for XML produced by the xquery to get information for the community from the IS --- .../src/main/resources/eu/dnetlib/dhp/bulktag/query.xq | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq index 6fbd74c8f..a9c0d9e3f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/query.xq @@ -55,4 +55,5 @@ return {$zc/param[./@name='selcriteria']/text()} } - \ No newline at end of file + + \ No newline at end of file