diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java index 75d2106..8b92bb2 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscInteroperabilityFramework.java @@ -55,4 +55,13 @@ public class EoscInteroperabilityFramework implements Serializable { this.semanticRelation = semanticRelation; } + public static EoscInteroperabilityFramework newInstance(String code, String label, String url, + String semanticRelation) { + EoscInteroperabilityFramework eif = new EoscInteroperabilityFramework(); + eif.label = label; + eif.code = code; + eif.url = url; + eif.semanticRelation = semanticRelation; + return eif; + } } diff --git a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java index 2cec568..1bbc675 100644 --- a/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java +++ b/dump-schema/src/main/java/eu/dnetlib/dhp/eosc/model/EoscResult.java @@ -18,6 +18,6 @@ public class EoscResult extends GraphResult { } public void setEoscIF(EoscInteroperabilityFramework eoscIF) { - eoscIF = eoscIF; + this.eoscIF = eoscIF; } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index 9d28a1d..8d4035b 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -6,9 +6,15 @@ import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.eosc.model.EoscInteroperabilityFramework; import eu.dnetlib.dhp.eosc.model.EoscResult; +import eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1; import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException; import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException; import eu.dnetlib.dhp.oa.model.*; @@ -28,6 +34,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class ResultMapper implements Serializable { + private static final Logger log = LoggerFactory.getLogger(ResultMapper.class); public static Result map( E in, Map communityMap, String dumpType) @@ -150,6 +157,10 @@ public class ResultMapper implements Serializable { ((GraphResult) out) .setInstance( oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList())); + } else if (Constants.DUMPTYPE.EOSC.getType().equals(dumpType)) { + ((EoscResult) out) + .setInstance( + oInst.get().stream().map(ResultMapper::getGraphInstance).collect(Collectors.toList())); } else { ((CommunityResult) out) .setInstance( @@ -237,12 +248,15 @@ public class ResultMapper implements Serializable { "EOSC IF in the result has cardinality greater than one. Change dump!"); } if (gei.size() == 1) { - EoscInteroperabilityFramework eif = new EoscInteroperabilityFramework(); - eif.setCode(gei.get(0).getCode()); - eif.setLabel(gei.get(0).getLabel()); - eif.setUrl(gei.get(0).getUrl()); - eif.setSemanticRelation(gei.get(0).getSemanticRelation()); - ((EoscResult) out).setEoscIF(eif); + + EoscIfGuidelines ifra = gei.get(0); + ((EoscResult) out) + .setEoscIF( + EoscInteroperabilityFramework + .newInstance( + ifra.getCode(), ifra.getLabel(), ifra.getUrl(), + ifra.getSemanticRelation())); + } } } else if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) { diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java index 75d7e7a..304d891 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/eosc/SelectEoscResultsJobStep1.java @@ -17,6 +17,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.eosc.model.EoscResult; import eu.dnetlib.dhp.oa.graph.dump.Constants; import eu.dnetlib.dhp.oa.graph.dump.ResultMapper; import eu.dnetlib.dhp.oa.graph.dump.Utils; @@ -76,9 +77,9 @@ public class SelectEoscResultsJobStep1 implements Serializable { (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible() && r.getContext().stream().anyMatch(c -> c.getId().equals("eosc"))) .map( - (MapFunction) r -> (GraphResult) ResultMapper - .map(r, null, Constants.DUMPTYPE.COMPLETE.getType()), - Encoders.bean(GraphResult.class)) + (MapFunction) r -> (EoscResult) ResultMapper + .map(r, null, Constants.DUMPTYPE.EOSC.getType()), + Encoders.bean(EoscResult.class)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json index bc3e0cd..a59a5ce 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eosc_select_result_parameters.json @@ -1,10 +1,5 @@ [ - { - "paramName":"cmp", - "paramLongName":"communityMapPath", - "paramDescription": "the path to the serialization of the community map", - "paramRequired": true - }, + { "paramName":"s", "paramLongName":"sourcePath", diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml index 449d659..de85e94 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/eoscdump/oozie_app/workflow.xml @@ -117,7 +117,7 @@ --sourcePath${sourcePath}/publication --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${workingDir}/tar/publication - --communityMapPathnoneed + @@ -142,7 +142,7 @@ --sourcePath${sourcePath}/dataset --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${workingDir}/tar/dataset - --communityMapPathnoneed + @@ -167,7 +167,7 @@ --sourcePath${sourcePath}/otherresearchproduct --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${workingDir}/tar/otherresearchproduct - --communityMapPathnoneed + @@ -192,7 +192,7 @@ --sourcePath${sourcePath}/software --resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${workingDir}/tar/software - --communityMapPathnoneed + diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java index 1c8eca5..34da999 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java @@ -25,7 +25,9 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; +import eu.dnetlib.dhp.eosc.model.EoscResult; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; +import eu.dnetlib.dhp.oa.graph.dump.eosc.SelectEoscResultsJobStep1; import eu.dnetlib.dhp.oa.model.Instance; import eu.dnetlib.dhp.oa.model.OpenAccessRoute; import eu.dnetlib.dhp.oa.model.community.CommunityResult; @@ -881,6 +883,47 @@ public class DumpJobTest { } + @Test + public void testEOSCDump() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json") + .getPath(); + + final String communityMapPath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json") + .getPath(); + + SelectEoscResultsJobStep1 + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", + sourcePath, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/working" + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/working") + .map(item -> OBJECT_MAPPER.readValue(item, EoscResult.class)); + + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(EoscResult.class)); + + Assertions.assertEquals(1, verificationDataset.count()); + + Assertions.assertEquals(1, verificationDataset.filter("type = 'dataset'").count()); + + Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getCode().equals("EOSC::Twitter Data")).count()); + Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getLabel().equals("EOSC::Twitter Data")).count()); + Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getUrl().equals("")).count()); + Assertions.assertEquals(1, tmp.filter(d -> d.getEoscIF().getSemanticRelation().equals("compliesWith")).count()); + + } + @Test public void testArticlePCA() { final String sourcePath = getClass() diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json new file mode 100644 index 0000000..cd17fa7 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/eosc/eosctag.json @@ -0,0 +1 @@ +{"geolocation": [], "dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "resourcetype": {"classid": "UNKNOWN", "classname": "Unknown", "schemeid": "dnet:dataCite_resource", "schemename": "dnet:dataCite_resource"}, "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.21227/mqmt-yq28"}], "contributor": [], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "issued", "classname": "issued", "schemeid": "dnet:dataCite_date", "schemename": "dnet:dataCite_date"}, "value": "2020-11-21"}], "collectedfrom": [{"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}], "id": "50|doi_________::bbf3a8925017a575215fc7be77cab114", "subject": [{"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Machine Learning"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Corona Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "SARS-CoV-2 Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "SARS-CoV-2 Twitter Sentiment"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Coronavirus English Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 English Tweets Dataset"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "Coronavirus Geotagged Tweets"}, {"qualifier": {"classid": "keyword", "classname": "keyword", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "COVID-19 Geotagged Tweets"}, {"dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_classes", "invisible": false, "trust": "0.891"}, "qualifier": {"classid": "ACM", "classname": "ACM Computing Classification System", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "InformationSystems_MISCELLANEOUS"}, {"dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_classes", "invisible": false, "trust": "0.8226"}, "qualifier": {"classid": "ACM", "classname": "ACM Computing Classification System", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies"}, "value": "InformationSystems_INFORMATIONSTORAGEANDRETRIEVAL"}], "lastupdatetimestamp": 1657046634922, "author": [{"surname": "Lamsal", "name": "Rabindra", "pid": [], "rank": 1, "affiliation": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "value": "School of Computer and Systems Sciences, JN"}], "fullname": "Lamsal, Rabindra"}], "instance": [{"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemeid": "dnet:review_levels", "schemename": "dnet:review_levels"}, "hostedby": {"key": "10|re3data_____::3bc31eb6c47d0134a1ac576dc028c3b9", "value": "IEEE DataPort"}, "license": {"value": "https://creativecommons.org/licenses/by/4.0/legalcode"}, "url": ["https://dx.doi.org/10.21227/mqmt-yq28"], "pid": [{"dataInfo": {"invisible": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9", "inferred": false, "deletedbyinference": false}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemeid": "dnet:pid_types", "schemename": "dnet:pid_types"}, "value": "10.21227/mqmt-yq28"}], "dateofacceptance": {"value": "2020-11-21"}, "collectedfrom": {"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254", "value": "Datacite"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemeid": "dnet:access_modes", "schemename": "dnet:access_modes"}, "instancetype": {"classid": "0021", "classname": "Dataset", "schemeid": "dnet:publication_resource", "schemename": "dnet:publication_resource"}}], "dateofcollection": "2020-11-21T04:46:12+0000", "fulltext": [], "dateoftransformation": "2020-11-21T04:46:12+0000", "description": [{"value": "This dataset contains IDs and sentiment scores of the geo-tagged tweets related to the COVID-19 pandemic. The tweets are captured by an on-going project deployed at https://live.rlamsal.com.np. The model monitors the real-time Twitter feed for coronavirus-related tweets using 90+ different keywords and hashtags that are commonly used while referencing the pandemic. Complying with Twitter's content redistribution policy, only the tweet IDs are shared. You can re-construct the dataset by hydrating these IDs. The tweet IDs in this dataset belong to the tweets tweeted providing an exact location.The paper associated with this dataset is available here: Design and analysis of a large-scale COVID-19 tweets dataset"}], "format": [], "measures": [], "coverage": [], "externalReference": [], "publisher": {"value": "IEEE DataPort"}, "context": [{"dataInfo": [{"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_covid19", "invisible": false, "trust": "0.9"}, {"provenanceaction": {"classid": "community:subject", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}, {"provenanceaction": {"classid": "community:datasource", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}], "id": "covid-19"}, {"dataInfo": [{"provenanceaction": {"classid": "community:datasource", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "bulktagging", "invisible": false, "trust": "0.8"}], "id": "eosc"}], "eoscifguidelines": [{"semanticRelation": "compliesWith", "url": "", "code": "EOSC::Twitter Data", "label": "EOSC::Twitter Data"}], "language": {"classid": "und", "classname": "Undetermined", "schemeid": "dnet:languages", "schemename": "dnet:languages"}, "resulttype": {"classid": "dataset", "classname": "dataset", "schemeid": "dnet:result_typologies", "schemename": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.21227/mqmt-yq28"], "source": [], "dateofacceptance": {"value": "2020-11-21"}, "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"}, "value": "Coronavirus (COVID-19) Geo-tagged Tweets Dataset"}]} \ No newline at end of file