From a1cafaf2e36cc8c5de2354e723739b85aa1c95bd Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 16 Nov 2021 15:16:28 +0100 Subject: [PATCH 01/11] added mvn site for dnet-hadoop project --- dhp-build/dhp-code-style/pom.xml | 5 +++++ dhp-build/pom.xml | 11 +++++++++++ dhp-build/src/site/site.xml | 22 ++++++++++++++++++++++ dhp-common/pom.xml | 7 +++++++ dhp-workflows/pom.xml | 7 +++++++ dhp-workflows/src/site/site.xml | 25 +++++++++++++++++++++++++ pom.xml | 5 +++++ src/site/site.xml | 21 +++++++++++++++++++++ 8 files changed, 103 insertions(+) create mode 100644 dhp-build/src/site/site.xml create mode 100644 dhp-workflows/src/site/site.xml create mode 100644 src/site/site.xml diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index 77aa2aedb..7a6a32e0e 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -22,6 +22,10 @@ dnet45-releases https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + DHPSite + file://${dhp.site.stage.path}/site/dhp-build/dhp-code-style + @@ -43,6 +47,7 @@ UTF-8 + /tmp/dhp-site \ No newline at end of file diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index 12b999b9c..fed689a06 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -10,6 +10,9 @@ pom This module is a container for the build tools used in dnet-hadoop + + true + dhp-code-style @@ -17,4 +20,12 @@ dhp-build-properties-maven-plugin + + + + DHPSite + file://${dhp.site.stage.path}/site/dhp-build + + + diff --git a/dhp-build/src/site/site.xml b/dhp-build/src/site/site.xml new file mode 100644 index 000000000..2d9d769a2 --- /dev/null +++ b/dhp-build/src/site/site.xml @@ -0,0 +1,22 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index c057123b1..686b89f6b 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -13,6 +13,13 @@ dhp-common jar + + + DHPSite + 
file://${dhp.site.stage.path}/site/dhp-common + + + This module contains common utilities meant to be used across the dnet-hadoop submodules diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 22ee77619..89ba2bf70 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -15,6 +15,13 @@ This module is the container for the oozie workflow definitions in dnet-hadoop project + + + DHPSite + file://${dhp.site.stage.path}/site/dhp-workflows + + + dhp-workflow-profiles dhp-aggregation diff --git a/dhp-workflows/src/site/site.xml b/dhp-workflows/src/site/site.xml new file mode 100644 index 000000000..6b742db6a --- /dev/null +++ b/dhp-workflows/src/site/site.xml @@ -0,0 +1,25 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index 71c55d1f0..f7e3c6226 100644 --- a/pom.xml +++ b/pom.xml @@ -719,6 +719,10 @@ dnet45-releases https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + DHPSite + file://${dhp.site.stage.path}/site/ + @@ -734,6 +738,7 @@ + /tmp/dhp-site UTF-8 UTF-8 3.6.0 diff --git a/src/site/site.xml b/src/site/site.xml new file mode 100644 index 000000000..634a2c154 --- /dev/null +++ b/src/site/site.xml @@ -0,0 +1,21 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + \ No newline at end of file From 2d67020c590cd4d779e978ca8688c632c0019a57 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 16 Nov 2021 16:01:08 +0100 Subject: [PATCH 02/11] added dhp-enrichment maven site template --- .../src/site/markdown/pubmed.md | 6 ++++- .../dhp-aggregation/src/site/site.xml | 1 - .../dhp-enrichment/src/site/markdown/about.md | 1 + .../dhp-enrichment/src/site/site.xml | 26 +++++++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 dhp-workflows/dhp-enrichment/src/site/markdown/about.md create mode 100644 dhp-workflows/dhp-enrichment/src/site/site.xml diff --git 
a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md index f6327a51b..00e3ed877 100644 --- a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md @@ -4,7 +4,7 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm Collection --------- The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with -the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) +the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) Parsing @@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping |//Author/FullName| author.Forename| Concatenation of forname + lastName if exist | |FOR ALL AUTHOR | author.rank| sequential number starting from 1| +#TODO + +Missing item mapped + diff --git a/dhp-workflows/dhp-aggregation/src/site/site.xml b/dhp-workflows/dhp-aggregation/src/site/site.xml index da5da0f1e..c0a70264d 100644 --- a/dhp-workflows/dhp-aggregation/src/site/site.xml +++ b/dhp-workflows/dhp-aggregation/src/site/site.xml @@ -20,7 +20,6 @@ - diff --git a/dhp-workflows/dhp-enrichment/src/site/markdown/about.md b/dhp-workflows/dhp-enrichment/src/site/markdown/about.md new file mode 100644 index 000000000..c220c63b2 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/site/markdown/about.md @@ -0,0 +1 @@ +#DHP Enrichment \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/site/site.xml b/dhp-workflows/dhp-enrichment/src/site/site.xml new file mode 100644 index 000000000..dad0cd996 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/site/site.xml @@ -0,0 +1,26 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + + + + + \ No newline at end of file From ec8b0219ff034ddeb3d8c878ecf40ec03337e658 Mon Sep 17 00:00:00 2001 From: 
"miriam.baglioni" Date: Tue, 16 Nov 2021 17:41:34 +0100 Subject: [PATCH 03/11] [Documentation] Added first page for Integration via unresolved entities generation --- .../src/site/markdown/index.md | 17 +++++++-- .../src/site/markdown/integration.md | 36 +++++++++++++++++++ .../dhp-aggregation/src/site/site.xml | 3 ++ 3 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/site/markdown/integration.md diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md index c0c756082..240617f91 100644 --- a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md @@ -1,9 +1,20 @@ ##DHP-Aggregation -This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. +This module defines a set of oozie workflows for -Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +1. the **collection** and **transformation** of metadata records. +2. the **integration** of new external information in the result + + +### Collection and Transformation + +The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping of each MDStore. -It defines [mappings](mappings.md) for transformation of different datasource (See mapping section). \ No newline at end of file +It defines [mappings](mappings.md) for transformation of different datasource (See mapping section). + +### Integration of external information in the result + +The workflows create new entities in the OpenAIRE format (OAF) whose aim is to enrich the results already contained in the graph. 
+See integration section for more insight diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md new file mode 100644 index 000000000..ef19cfd24 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md @@ -0,0 +1,36 @@ +DHP Aggregation - Integration method +===================================== + +The integration method can be applied every time new information, which is not aggregated from the repositories +nor computed directly by OpenAIRE, should be added to the results of the graph. + +The information integrated so far is: + +1. Article impact measures + 1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores +2. Result Subjects + 1. Integration of Fields of Science and Technology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in + results subjects. + + +The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id +and the element in the OAF model that should be used to map the information we want to integrate. + +The id is set by using a particular encoding of the given PID + +*unresolved:[pid]:[pidtype]* + +where + +1 *unresolved* is a constant value +2 *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307 +3 *pidtype* is the persistent id type, e.g. doi + +Such entities are matched against those available in the graph using the result.instance.pid values. + +This mechanism can be used to integrate enrichments associated with a given PID. +If a match is found with one of the results already in the graph, that result is enriched with the information +present in the new OAF entity. +All the objects for which a match is not found are discarded. 
+ + diff --git a/dhp-workflows/dhp-aggregation/src/site/site.xml b/dhp-workflows/dhp-aggregation/src/site/site.xml index c0a70264d..75fc5032e 100644 --- a/dhp-workflows/dhp-aggregation/src/site/site.xml +++ b/dhp-workflows/dhp-aggregation/src/site/site.xml @@ -19,6 +19,9 @@ + + + From 4094f2bb9ab5ad87a31b6691b8b853542d6b7685 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 17 Nov 2021 10:04:52 +0100 Subject: [PATCH 04/11] added integration md file --- .../dhp-aggregation/src/site/markdown/integration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md index ef19cfd24..baf232e40 100644 --- a/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md @@ -22,9 +22,9 @@ The id is set by using a particular encoding of the given PID where -1 *unresolved* is a constant value -2 *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307 -3 *pidtype* is the persistent id type, e.g. doi +1. *unresolved* is a constant value +2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307 +3. *pidtype* is the persistent id type, e.g. doi Such entities are matched against those available in the graph using the result.instance.pid values. 
From cded363b5595ae9dd3a9cf5cdbe300a86916b0a0 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 17 Nov 2021 11:06:35 +0100 Subject: [PATCH 05/11] code refactor, created and moved scala code on the correct maven folder under src/main/scala and src/test/scala --- .../scholix/SparkCreateActionset.scala | 0 .../scholix/SparkSaveActionSet.scala | 2 +- .../dhp/collection/CollectionUtils.scala | 0 .../dhp/datacite/AbstractRestClient.scala | 0 .../dhp/datacite/DataciteAPIImporter.scala | 0 .../DataciteToOAFTransformation.scala | 0 .../GenerateDataciteDatasetSpark.scala | 3 +-- .../dnetlib/dhp/datacite/ImportDatacite.scala | 0 .../SparkDownloadUpdateDatacite.scala | 0 .../eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala | 1 + .../bio/SparkTransformBioDatabaseToOAF.scala | 12 +++++------ .../ebi/SparkCreateBaselineDataFrame.scala | 2 +- .../sx/bio/ebi/SparkDownloadEBILinks.scala | 3 +-- .../dhp/sx/bio/ebi/SparkEBILinksToOaf.scala | 5 ++--- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 0 .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 20 ++++++++----------- .../{actionmanager => }/datacite/record.json | 0 .../dhp/datacite/DataciteToOAFTest.scala | 0 .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 0 19 files changed, 21 insertions(+), 27 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala (96%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/collection/CollectionUtils.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/AbstractRestClient.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala (100%) 
rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala (95%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/ImportDatacite.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala (99%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala (73%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala (98%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala (98%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala (93%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{java => scala}/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala (96%) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/{actionmanager => }/datacite/record.json (100%) rename dhp-workflows/dhp-aggregation/src/test/{java => scala}/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala (100%) rename dhp-workflows/dhp-aggregation/src/test/{java => scala}/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala (100%) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala rename to 
dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala similarity index 96% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala index 1df7ea3fb..62d219b57 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.actionmanager.scholix import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, Oaf, Publication, Software, OtherResearchProduct, Relation} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala similarity index 95% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index a63627d1c..3c8caa485 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -3,8 +3,7 @@ package 
eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations -import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH -import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH +import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH} import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} import eu.dnetlib.dhp.schema.oaf.Oaf diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index 70dcc0184..853b24862 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -7,6 +7,7 @@ import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.{compact, parse, render} import collection.JavaConverters._ + object BioDBToOAF { case class EBILinkItem(id: Long, links: String) {} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala similarity index 73% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index 8ae8285e3..fcceacd44 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -1,9 +1,9 @@ package eu.dnetlib.dhp.sx.bio import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Oaf -import BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} @@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF { import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i => 
CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "PDB" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "SCHOLIX" => - spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "CROSSREF_LINKS" => - spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 17d21f19c..660a26a6c 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.oaf.Result -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala index eab6b1dc6..18e39387f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem -import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} import org.apache.commons.io.IOUtils import org.apache.http.client.config.RequestConfig import org.apache.http.client.methods.HttpGet diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index 8da617ca0..12af4824b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -1,11 +1,10 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem -import BioDBToOAF.EBILinkItem -import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -38,7 +37,7 @@ object SparkEBILinksToOaf { ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) - .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null) + .flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null) .write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala similarity index 96% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index ecef32202..d5d40ecfe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -4,10 +4,9 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ -import scala.collection.JavaConverters._ import java.util.regex.Pattern - +import collection.JavaConverters._ /** * */ @@ -22,10 +21,10 @@ object PubMedToOaf { val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") - /** * Cleaning the DOI Applying regex in order to * remove doi starting with URL + * * @param doi input DOI * @return cleaned DOI */ @@ -49,7 +48,7 @@ object PubMedToOaf { * starting from OAF instanceType value * * @param cobjQualifier OAF instance type - * @param vocabularies All dnet vocabularies + * @param vocabularies All dnet vocabularies * @return the correct instance */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { @@ -65,7 +64,7 @@ object PubMedToOaf { } /** - * Mapping the Pubmedjournal info into the OAF Journale + * Mapping the Pubmedjournal info into the OAF Journale * * @param j the pubmedJournal * @return the OAF Journal @@ -91,9 +90,8 @@ object PubMedToOaf { * Find vocabulary term into synonyms and term in the vocabulary * * @param vocabularyName the input vocabulary name - * @param vocabularies all the vocabularies - * @param term 
the term to find - * + * @param vocabularies all the vocabularies + * @param term the term to find * @return the cleaned term value */ def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { @@ -104,10 +102,9 @@ object PubMedToOaf { /** - * Map the Pubmed Article into the OAF instance + * Map the Pubmed Article into the OAF instance * - * - * @param article the pubmed articles + * @param article the pubmed articles * @param vocabularies the vocabularies * @return The OAF instance if the mapping did not fail */ @@ -185,7 +182,6 @@ object PubMedToOaf { //-------------------------------------------------------------------------------------- - // RESULT MAPPING //-------------------------------------------------------------------------------------- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/datacite/record.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/datacite/record.json diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala rename to dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala similarity index 100% rename from 
dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala rename to dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala From 2fd9ceac13d38ee4c2df06ab834de61ff8309972 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 17 Nov 2021 11:35:22 +0100 Subject: [PATCH 06/11] code refactor, created and moved scala code on the correct maven folder under src/main/scala and src/test/scala --- .../doiboost/DoiBoostMappingUtil.scala | 0 .../SparkGenerateDOIBoostActionSet.scala | 0 .../doiboost/SparkGenerateDoiBoost.scala | 0 .../doiboost/crossref/Crossref2Oaf.scala | 9 +++--- .../doiboost/crossref/CrossrefDataset.scala | 25 ++++++++-------- .../doiboost/crossref/CrossrefImporter.java | 0 .../dnetlib/doiboost/crossref/ESClient.java | 0 .../crossref/ExtractCrossrefRecords.java | 0 .../crossref/GenerateCrossrefDataset.scala | 20 +++++-------- .../crossref/SparkMapDumpIntoOAF.scala | 4 +-- .../crossref/UnpackCrtossrefEntries.scala | 9 ++---- .../dnetlib/doiboost/mag/MagDataModel.scala | 2 +- .../mag/SparkImportMagIntoDataset.scala | 7 ++--- .../doiboost/mag/SparkProcessMAG.scala | 20 +++++-------- .../dnetlib/doiboost/orcid/ORCIDToOAF.scala | 7 ++--- .../orcid/SparkConvertORCIDToOAF.scala | 8 ++--- .../doiboost/orcid/SparkPreprocessORCID.scala | 29 +++++++++---------- .../doiboost/uw/SparkMapUnpayWallToOAF.scala | 8 ++--- .../dnetlib/doiboost/uw/UnpayWallToOAF.scala | 3 +- .../doiboost/DoiBoostHostedByMapTest.scala | 2 +- .../dnetlib}/doiboost/NormalizeDoiTest.scala | 2 +- .../crossref/CrossrefMappingTest.scala | 0 .../dnetlib/doiboost/mag/MAGMappingTest.scala | 2 +- .../orcid/MappingORCIDToOAFTest.scala | 3 +- .../doiboost/uw/UnpayWallMappingTest.scala | 4 +-- 25 files changed, 69 insertions(+), 95 deletions(-) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => 
scala}/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala (99%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala (77%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/CrossrefImporter.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/ESClient.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala (73%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala (96%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala (88%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/mag/MagDataModel.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala (98%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala (91%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala (98%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala (84%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala (67%) rename dhp-workflows/dhp-doiboost/src/main/{java => scala}/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala (80%) rename dhp-workflows/dhp-doiboost/src/main/{java => 
scala}/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala (98%) rename dhp-workflows/dhp-doiboost/src/test/{java/eu/dnetlib/dhp => scala/eu/dnetlib}/doiboost/DoiBoostHostedByMapTest.scala (98%) rename dhp-workflows/dhp-doiboost/src/test/{java/eu/dnetlib/dhp => scala/eu/dnetlib}/doiboost/NormalizeDoiTest.scala (96%) rename dhp-workflows/dhp-doiboost/src/test/{java => scala}/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala (100%) rename dhp-workflows/dhp-doiboost/src/test/{java => scala}/eu/dnetlib/doiboost/mag/MAGMappingTest.scala (100%) rename dhp-workflows/dhp-doiboost/src/test/{java => scala}/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala (99%) rename dhp-workflows/dhp-doiboost/src/test/{java => scala}/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala (100%) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala rename to 
dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 1b1c850ba..edca4a180 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -4,20 +4,19 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.utils.DHPUtils -import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _} +import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.apache.commons.lang.StringUtils import org.json4s import org.json4s.DefaultFormats -import org.json4s.JsonAST.{JValue, _} +import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ import org.slf4j.{Logger, LoggerFactory} +import java.util import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex -import java.util - -import eu.dnetlib.doiboost.DoiBoostMappingUtil case class CrossrefDT(doi: String, json:String, timestamp: Long) {} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala similarity index 77% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala rename to 
dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 159b817c7..6a1c701af 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf import org.apache.spark.sql.expressions.Aggregator -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -17,12 +17,12 @@ object CrossrefDataset { val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) - def to_item(input:String):CrossrefDT = { + def to_item(input: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] - val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val ts: Long = (json \ "indexed" \ "timestamp").extract[Long] + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) CrossrefDT(doi, input, ts) } @@ -30,7 +30,6 @@ object CrossrefDataset { def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) parser.parseArgument(args) @@ -54,7 +53,7 @@ object CrossrefDataset { return b - if(a.timestamp >b.timestamp) { + if (a.timestamp > b.timestamp) { return a } b @@ -66,7 +65,7 @@ object CrossrefDataset { if (a == null) return b - if(a.timestamp >b.timestamp) { + if (a.timestamp > b.timestamp) { 
return a } b @@ -79,20 +78,20 @@ object CrossrefDataset { override def finish(reduction: CrossrefDT): CrossrefDT = reduction } - val workingPath:String = parser.get("workingPath") + val workingPath: String = parser.get("workingPath") - val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] + val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] val update = - spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) - .map(i =>CrossrefImporter.decompressBlob(i._2.toString)) - .map(i =>to_item(i))) + spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) + .map(i => CrossrefImporter.decompressBlob(i._2.toString)) + .map(i => to_item(i))) main_ds.union(update).groupByKey(_.doi) .agg(crossrefAggregator.toColumn) - .map(s=>s._2) + .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefImporter.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefImporter.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ESClient.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ESClient.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java 
b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala similarity index 73% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index 526ff7b3a..6d03abc25 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -2,17 +2,12 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.doiboost.DoiBoostMappingUtil -import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item -import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass -import org.apache.hadoop.io.{IntWritable, Text} -import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} import org.json4s import org.json4s.DefaultFormats -import org.json4s.JsonAST.JArray -import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.io.Source @@ -24,11 +19,10 @@ object GenerateCrossrefDataset { implicit val mrEncoder: Encoder[CrossrefDT] 
= Encoders.kryo[CrossrefDT] - def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) - val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] CrossrefDT(doi, meta, timestamp) @@ -51,14 +45,14 @@ object GenerateCrossrefDataset { import spark.implicits._ - val tmp : RDD[String] = sc.textFile(sourcePath,6000) + val tmp: RDD[String] = sc.textFile(sourcePath, 6000) spark.createDataset(tmp) .map(entry => crossrefElement(entry)) .write.mode(SaveMode.Overwrite).save(targetPath) -// .map(meta => crossrefElement(meta)) -// .toDS.as[CrossrefDT] -// .write.mode(SaveMode.Overwrite).save(targetPath) + // .map(meta => crossrefElement(meta)) + // .toDS.as[CrossrefDT] + // .write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala similarity index 96% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index c65916610..fa55b9fb9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -4,10 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset} import org.apache.commons.io.IOUtils - import org.apache.spark.SparkConf - -import 
org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala similarity index 88% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala index 95ecb568b..191c4587e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala @@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.hadoop.io.compress.GzipCodec +import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST.JArray @@ -17,9 +17,7 @@ object UnpackCrtossrefEntries { val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) - - - def extractDump(input:String):List[String] = { + def extractDump(input: String): List[String] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -30,7 +28,6 @@ object UnpackCrtossrefEntries { } - def main(args: Array[String]): Unit = { val conf = new SparkConf val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) @@ -45,7 +42,7 @@ object UnpackCrtossrefEntries { .getOrCreate() val sc: SparkContext = 
spark.sparkContext - sc.wholeTextFiles(sourcePath,6000).flatMap(d =>extractDump(d._2)) + sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2)) .saveAsTextFile(targetPath, classOf[GzipCodec]) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala index fd9629024..0a6fa00f0 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import scala.collection.JavaConverters._ import scala.collection.mutable diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index a68d0bb2d..d25a4893f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ 
b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.types._ +import org.apache.spark.sql.{SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkImportMagIntoDataset { @@ -24,13 +24,13 @@ object SparkImportMagIntoDataset { "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), + "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", 
"Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")), + "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", 
"IndexedAbstract:string")), "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")), "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), @@ -75,7 +75,6 @@ object SparkImportMagIntoDataset { .master(parser.get("master")).getOrCreate() - stream.foreach { case (k, v) => val s: StructType = getSchema(k) val df = spark.read diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala similarity index 91% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 016279787..932725446 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -5,19 +5,16 @@ import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ +import org.apache.spark.sql.functions.{col, collect_list, struct} import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - import scala.collection.JavaConverters._ - object SparkProcessMAG { - def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={ + def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { d.where(col("Doi").isNotNull) .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING) - .reduceGroups((p1:MagPapers,p2:MagPapers) => 
ConversionUtil.choiceLatestMagArtitcle(p1,p2)) + .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2)) .map(_._2)(Encoders.product[MagPapers]) .map(mp => { new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), @@ -98,13 +95,13 @@ object SparkProcessMAG { var magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingPath/merge_step_2").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] val conference = spark.read.load(s"$sourcePath/ConferenceInstances") - .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate",$"EndDate" ) + .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate") val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci"))) - .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate",$"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] + .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left") @@ -122,7 +119,7 @@ object SparkProcessMAG { magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left") .map(item => ConversionUtil.updatePubsWithDescription(item) - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") logger.info("Phase 7) Enrich Publication with FieldOfStudy") @@ -148,11 +145,10 @@ object SparkProcessMAG { spark.read.load(s"$workingPath/mag_publication").as[Publication] .filter(p => p.getId == null) .groupByKey(p => p.getId) - .reduceGroups((a:Publication, b:Publication) => 
ConversionUtil.mergePublication(a,b)) + .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 1cd3f7028..11031f9ca 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -4,17 +4,16 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} -import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} import org.apache.commons.lang.StringUtils -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ +import org.slf4j.{Logger, LoggerFactory} + +import scala.collection.JavaConverters._ case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala similarity index 84% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala rename to 
dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index fa4a93e00..1b189e296 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -11,10 +11,10 @@ object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def run(spark:SparkSession, workingPath:String, targetPath:String) :Unit = { + def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] import spark.implicits._ - val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] + val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] logger.info("Converting ORCID to OAF") dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath) @@ -35,8 +35,8 @@ object SparkConvertORCIDToOAF { val workingPath = parser.get("workingPath") val targetPath = parser.get("targetPath") - run(spark,workingPath, targetPath) + run(spark, workingPath, targetPath) } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala similarity index 67% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala index 31f331912..153be5dd1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala @@ -1,48 
+1,45 @@ package eu.dnetlib.doiboost.orcid -import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.schema.oaf.Publication -import eu.dnetlib.dhp.schema.orcid.OrcidDOI import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.functions.{col, collect_list} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkPreprocessORCID { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def fixORCIDItem(item :ORCIDItem):ORCIDItem = { - ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) + def fixORCIDItem(item: ORCIDItem): ORCIDItem = { + ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) } - def run(spark:SparkSession,sourcePath:String,workingPath:String):Unit = { + def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] - val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s)) + val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s)) spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author") - val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null) + val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => 
ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null) spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works") - val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] + val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] - val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] + val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] works.joinWith(authors, authors("oid").equalTo(works("oid"))) - .map(i =>{ + .map(i => { val doi = i._1.doi val author = i._2 - (doi, author) - }).groupBy(col("_1").alias("doi")) + (doi, author) + }).groupBy(col("_1").alias("doi")) .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] .map(s => fixORCIDItem(s)) .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") @@ -67,4 +64,4 @@ object SparkPreprocessORCID { } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala similarity index 80% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala index 4530926f1..70290018d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala @@ -1,16 +1,14 @@ package eu.dnetlib.doiboost.uw import eu.dnetlib.dhp.application.ArgumentApplicationParser - import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import 
org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - object SparkMapUnpayWallToOAF { def main(args: Array[String]): Unit = { @@ -32,11 +30,11 @@ object SparkMapUnpayWallToOAF { val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") - val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath") + val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath") logger.info("Converting UnpayWall to OAF") - val d:Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication] + val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication] d.write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala index c8324cde1..bf5694965 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala @@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ -import 
eu.dnetlib.doiboost.DoiBoostMappingUtil._ -import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala index 4912648be..049ac37f4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.doiboost +package eu.dnetlib.doiboost import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset} import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType} diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala similarity index 96% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala index a9a841ee9..bdf845f19 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.doiboost +package eu.dnetlib.doiboost import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.junit.jupiter.api.Test diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala similarity index 100% 
rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala index 46d4ec08d..7403e103e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala @@ -3,9 +3,9 @@ package eu.dnetlib.doiboost.mag import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, SparkSession} import org.codehaus.jackson.map.ObjectMapper +import org.json4s.DefaultFormats import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test -import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} import java.sql.Timestamp diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala index b484dc087..a5ce6296c 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -10,9 +10,8 @@ import org.junit.jupiter.api.io.TempDir import org.slf4j.{Logger, 
LoggerFactory} import java.nio.file.Path -import scala.io.Source - import scala.collection.JavaConversions._ +import scala.io.Source class MappingORCIDToOAFTest { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala index fa696fffc..012ed3da0 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala @@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.uw import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute +import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test +import org.slf4j.{Logger, LoggerFactory} import scala.io.Source -import org.junit.jupiter.api.Assertions._ -import org.slf4j.{Logger, LoggerFactory} class UnpayWallMappingTest { From 1f5ee116ed7d897831fcb74818f6c5a5cc9ee76b Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 17 Nov 2021 12:23:52 +0100 Subject: [PATCH 07/11] code refactor, created and moved scala code on the correct maven folder under src/main/scala and src/test/scala fixed test --- .../sx/graphimport/SparkDataciteToOAF.scala | 31 ---- .../oa/graph/hostedbymap/Aggregators.scala | 2 +- .../SparkApplyHostedByMapToDatasource.scala | 13 +- .../SparkApplyHostedByMapToResult.scala | 18 +-- .../SparkPrepareHostedByInfoToApply.scala | 35 ++--- .../hostedbymap/SparkProduceHostedByMap.scala | 109 +++++++------ .../raw/CopyHdfsOafSparkApplication.scala | 10 +- .../resolution/SparkResolveEntities.scala | 25 ++- 
.../resolution/SparkResolveRelation.scala | 2 +- .../graph/SparkConvertDatasetToJsonRDD.scala | 10 +- .../sx/graph/SparkConvertObjectToJson.scala | 10 +- .../sx/graph/SparkConvertRDDtoDataset.scala | 27 ++-- .../dhp/sx/graph/SparkCreateInputGraph.scala | 27 ++-- .../dhp/sx/graph/SparkCreateScholix.scala | 28 ++-- .../sx/graph/SparkCreateSummaryObject.scala | 12 +- .../dhp/sx/graph/pangaea/PangaeaUtils.scala | 1 + .../SparkGeneratePanagaeaDataset.scala | 18 +-- .../dhp/sx/graph/scholix/ScholixUtils.scala | 143 +++++++++--------- .../dnetlib/dhp/oa/graph/resolution/dataset | 6 +- .../oa/graph/hostedbymap/DownloadCsvTest.java | 0 .../dhp/oa/graph/hostedbymap/TestApply.scala | 0 .../oa/graph/hostedbymap/TestPrepare.scala | 4 - .../oa/graph/hostedbymap/TestPreprocess.scala | 5 +- .../resolution/ResolveEntitiesTest.scala | 1 + .../sx/graph/scholix/ScholixGraphTest.scala | 0 .../dhp/sx/pangaea/PangaeaTransformTest.scala | 1 - 26 files changed, 235 insertions(+), 303 deletions(-) delete mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala (81%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala (85%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala (74%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala (61%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala (88%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => 
scala}/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala (79%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala (99%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala (69%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala (83%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala (62%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala (76%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala (76%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala (68%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala (99%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala (83%) rename dhp-workflows/dhp-graph-mapper/src/main/{java => scala}/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala (61%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java (100%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala (96%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala (98%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala (99%) rename 
dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/test/{java => scala}/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala (95%) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala deleted file mode 100644 index 9e905d806..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala +++ /dev/null @@ -1,31 +0,0 @@ -package eu.dnetlib.dhp.oa.sx.graphimport - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import org.apache.commons.io.IOUtils -import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession - -object SparkDataciteToOAF { - - - def main(args: Array[String]): Unit = { - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))) - parser.parseArgument(args) - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - import spark.implicits._ - - - val sc = spark.sparkContext - - val inputPath = parser.get("inputPath") - - - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala index ce383292c..ad4e1c96e 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo -import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala similarity index 81% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala index 1b18ba3ae..38af3eee4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala @@ -2,13 +2,12 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication} +import eu.dnetlib.dhp.schema.oaf.Datasource import 
org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} @@ -52,18 +51,18 @@ object SparkApplyHostedByMapToDatasource { val mapper = new ObjectMapper() - val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") + val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") .map(r => mapper.readValue(r, classOf[Datasource])) - val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath) + val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath) .map(ei => mapper.readValue(ei, classOf[EntityInfo]))) - applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) + applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) spark.read.textFile(outputPath) .write .mode(SaveMode.Overwrite) - .option("compression","gzip") + .option("compression", "gzip") .text(graphPath + "/datasource") } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala similarity index 85% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index 0e047d016..d360da2e9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -5,16 +5,13 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils -import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication} +import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} - import scala.collection.JavaConverters._ - - object SparkApplyHostedByMapToResult { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { @@ -39,6 +36,7 @@ object SparkApplyHostedByMapToResult { p })(Encoders.bean(classOf[Publication])) } + def main(args: Array[String]): Unit = { @@ -67,18 +65,18 @@ object SparkApplyHostedByMapToResult { implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) val mapper = new ObjectMapper() - val pubs : Dataset[Publication] = spark.read.textFile(graphPath + "/publication") + val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication") .map(r => mapper.readValue(r, classOf[Publication])) - val pinfo : Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) - .map(ei => mapper.readValue(ei, classOf[EntityInfo])) + val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) + .map(ei => mapper.readValue(ei, classOf[EntityInfo])) - applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) + applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) spark.read.textFile(outputPath) .write 
.mode(SaveMode.Overwrite) - .option("compression","gzip") + .option("compression", "gzip") .text(graphPath + "/publication") } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala similarity index 74% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala index b7a7d352f..87e203e4b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala @@ -3,61 +3,58 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo - import eu.dnetlib.dhp.schema.oaf.{Journal, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} - - object SparkPrepareHostedByInfoToApply { implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = { - var lst:List[EntityInfo] = List() + def getList(id: String, j: Journal, name: String): List[EntityInfo] = { + var lst: List[EntityInfo] = List() - if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){ + if 
(j.getIssnLinking != null && !j.getIssnLinking.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst } - if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){ + if (j.getIssnOnline != null && !j.getIssnOnline.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst } - if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){ + if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")) { lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst } lst } - def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = { + def prepareResultInfo(spark: SparkSession, publicationPath: String): Dataset[EntityInfo] = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) val mapper = new ObjectMapper() - val dd : Dataset[Publication] = spark.read.textFile(publicationPath) + val dd: Dataset[Publication] = spark.read.textFile(publicationPath) .map(r => mapper.readValue(r, classOf[Publication])) - dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, "")) + dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, "")) } - def toEntityInfo(input:String): EntityInfo = { + def toEntityInfo(input: String): EntityInfo = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]] + val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]] toEntityItem(c.keys.head, c.values.head) } - def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = { + def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = { EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess) @@ -67,7 +64,7 @@ object SparkPrepareHostedByInfoToApply { Aggregators.resultToSingleId(res.joinWith(hbm, 
res.col("journalId").equalTo(hbm.col("journalId")), "left") .map(t2 => { val res: EntityInfo = t2._1 - if(t2._2 != null ){ + if (t2._2 != null) { val ds = t2._2 res.setHostedById(ds.getId) res.setOpenAccess(ds.getOpenAccess) @@ -107,10 +104,10 @@ object SparkPrepareHostedByInfoToApply { //STEP1: read the hostedbymap and transform it in EntityInfo - val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) + val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) - //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) - val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") + //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) + val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") //STEP3: left join resultInfo with hostedByInfo on journal_id. 
Reduction of all the results with the same id in just //one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala index 1ee1d5d1a..6dfe35623 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala @@ -1,41 +1,39 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel} import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.commons.io.IOUtils +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} -import com.fasterxml.jackson.databind.ObjectMapper -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileSystem -import org.apache.hadoop.fs.Path + import java.io.PrintWriter -import org.apache.hadoop.io.compress.GzipCodec - - object SparkProduceHostedByMap { implicit val tupleForJoinEncoder: 
Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) - def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)) : HostedByItemType = { + def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = { val openaire: HostedByInfo = input._1._1 val doaj: HostedByInfo = input._1._2 val gold: HostedByInfo = input._2 val isOpenAccess: Boolean = doaj == null && gold == null openaire.journal_id match { - case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) - case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) - case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) + case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) + case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) + case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) // catch the default with a variable so you can print it - case whoa => null + case whoa => null } } @@ -44,7 +42,7 @@ object SparkProduceHostedByMap { implicit val formats = org.json4s.DefaultFormats - val map: Map [String, HostedByItemType] = Map (input._1 -> input._2 ) + val map: Map[String, HostedByItemType] = Map(input._1 -> input._2) Serialization.write(map) @@ -52,34 +50,33 @@ object SparkProduceHostedByMap { } - - def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = { - if(issn != null){ - if(eissn != null){ - if(issnl != null){ - HostedByItemType(id, officialname, issn, eissn, issnl , oa) - }else{ - HostedByItemType(id, officialname, issn, eissn, "" , oa) + def 
getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { + if (issn != null) { + if (eissn != null) { + if (issnl != null) { + HostedByItemType(id, officialname, issn, eissn, issnl, oa) + } else { + HostedByItemType(id, officialname, issn, eissn, "", oa) } - }else{ - if(issnl != null){ - HostedByItemType(id, officialname, issn, "", issnl , oa) - }else{ - HostedByItemType(id, officialname, issn, "", "" , oa) + } else { + if (issnl != null) { + HostedByItemType(id, officialname, issn, "", issnl, oa) + } else { + HostedByItemType(id, officialname, issn, "", "", oa) } } - }else{ - if(eissn != null){ - if(issnl != null){ - HostedByItemType(id, officialname, "", eissn, issnl , oa) - }else{ - HostedByItemType(id, officialname, "", eissn, "" , oa) + } else { + if (eissn != null) { + if (issnl != null) { + HostedByItemType(id, officialname, "", eissn, issnl, oa) + } else { + HostedByItemType(id, officialname, "", eissn, "", oa) } - }else{ - if(issnl != null){ - HostedByItemType(id, officialname, "", "", issnl , oa) - }else{ - HostedByItemType("", "", "", "", "" , oa) + } else { + if (issnl != null) { + HostedByItemType(id, officialname, "", "", issnl, oa) + } else { + HostedByItemType("", "", "", "", "", oa) } } } @@ -90,10 +87,10 @@ object SparkProduceHostedByMap { return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false) } - HostedByItemType("","","","","",false) + HostedByItemType("", "", "", "", "", false) } - def oaHostedByDataset(spark:SparkSession, datasourcePath : String) : Dataset[HostedByItemType] = { + def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ @@ -102,10 +99,10 @@ object SparkProduceHostedByMap { implicit var encoderD = Encoders.kryo[Datasource] - val dd : Dataset[Datasource] = 
spark.read.textFile(datasourcePath) + val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[Datasource])) - dd.map{ddt => oaToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } @@ -115,17 +112,17 @@ object SparkProduceHostedByMap { } - def goldHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { + def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel] val mapper = new ObjectMapper() - val dd : Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[UnibiGoldModel])) - dd.map{ddt => goldToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } @@ -134,41 +131,40 @@ object SparkProduceHostedByMap { return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true) } - def doajHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { + def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel] val mapper = new ObjectMapper() - val dd : Dataset[DOAJModel] = spark.read.textFile(datasourcePath) + val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[DOAJModel])) - dd.map{ddt => doajToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) + dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) } def toList(input: HostedByItemType): List[(String, HostedByItemType)] = 
{ - var lst : List[(String, HostedByItemType)] = List() - if(!input.issn.equals("")){ + var lst: List[(String, HostedByItemType)] = List() + if (!input.issn.equals("")) { lst = (input.issn, input) :: lst } - if(!input.eissn.equals("")){ + if (!input.eissn.equals("")) { lst = (input.eissn, input) :: lst } - if(!input.lissn.equals("")){ + if (!input.lissn.equals("")) { lst = (input.lissn, input) :: lst } lst } - - def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = { + def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { val conf = new Configuration() conf.set("fs.defaultFS", hdfsNameNode) - val fs= FileSystem.get(conf) + val fs = FileSystem.get(conf) val output = fs.create(new Path(outputPath)) val writer = new PrintWriter(output) try { @@ -182,7 +178,6 @@ object SparkProduceHostedByMap { } - def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) @@ -213,7 +208,7 @@ object SparkProduceHostedByMap { .union(doajHostedByDataset(spark, workingDirPath + "/doaj.json")) .flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|")) .map(hbi => toHostedByMap(hbi))(Encoders.STRING) - .rdd.saveAsTextFile(outputPath , classOf[GzipCodec]) + .rdd.saveAsTextFile(outputPath, classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala similarity index 88% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index c7ad1890d..0179cc266 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -4,20 +4,14 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.ModelSupport -import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.io.IOUtils -import org.apache.commons.lang3.StringUtils -import org.apache.http.client.methods.HttpGet -import org.apache.http.impl.client.HttpClients import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory -import scala.collection.JavaConverters._ import scala.io.Source - +import scala.collection.JavaConverters._ object CopyHdfsOafSparkApplication { def main(args: Array[String]): Unit = { @@ -59,7 +53,7 @@ object CopyHdfsOafSparkApplication { if (validPaths.nonEmpty) { val oaf = spark.read.load(validPaths: _*).as[Oaf] val mapper = new ObjectMapper() - val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList + val l = ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList l.foreach( e => oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala similarity index 79% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala index 316b8afed..6b4a501d6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala 
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.EntityType import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils @@ -14,7 +13,7 @@ import org.slf4j.{Logger, LoggerFactory} object SparkResolveEntities { val mapper = new ObjectMapper() - val entities = List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct) + val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct) def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) @@ -51,10 +50,10 @@ object SparkResolveEntities { fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e")) } -} + } -def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { + def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ @@ -71,22 +70,22 @@ def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: St } - def deserializeObject(input:String, entity:EntityType ) :Result = { + def deserializeObject(input: String, entity: EntityType): Result = { - entity match { - case EntityType.publication => mapper.readValue(input, classOf[Publication]) - case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) - case EntityType.software=> mapper.readValue(input, classOf[Software]) - case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct]) - } + entity match 
{ + case EntityType.publication => mapper.readValue(input, classOf[Publication]) + case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) + case EntityType.software => mapper.readValue(input, classOf[Software]) + case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct]) + } } - def generateResolvedEntities(spark:SparkSession, workingPath: String, graphBasePath:String) = { + def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ - val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] + val re: Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] entities.foreach { e => diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala index cd517dd5e..c7f9b2d0e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport -import eu.dnetlib.dhp.schema.oaf.{Relation, Result} +import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileSystem, 
Path} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala similarity index 69% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala index 3ee0c7dd6..9d16cf907 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.Result import org.apache.commons.io.IOUtils import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf @@ -29,13 +29,13 @@ object SparkConvertDatasetToJsonRDD { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val resultObject = List("publication","dataset","software", "otherResearchProduct") + val resultObject = List("publication", "dataset", "software", "otherResearchProduct") val mapper = new ObjectMapper() - implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - resultObject.foreach{item => - spark.read.load(s"$sourcePath/$item").as[Result].map(r=> mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) + resultObject.foreach { item => + spark.read.load(s"$sourcePath/$item").as[Result].map(r => 
mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala index 846ac37af..cc1b97fd6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import org.apache.commons.io.IOUtils +import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} -import org.apache.hadoop.io.compress._ object SparkConvertObjectToJson { @@ -32,8 +32,8 @@ object SparkConvertObjectToJson { log.info(s"objectType -> $objectType") - implicit val scholixEncoder :Encoder[Scholix]= Encoders.kryo[Scholix] - implicit val summaryEncoder :Encoder[ScholixSummary]= Encoders.kryo[ScholixSummary] + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] val mapper = new ObjectMapper @@ -42,11 +42,11 @@ object SparkConvertObjectToJson { case "scholix" => log.info("Serialize Scholix") val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] - d.map(s => 
mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) case "summary" => log.info("Serialize Summary") val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala similarity index 62% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 4b82fe645..2eb5e3a35 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -2,11 +2,12 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + object SparkConvertRDDtoDataset { def main(args: Array[String]): Unit = { @@ 
-31,39 +32,39 @@ object SparkConvertRDDtoDataset { val entityPath = s"$t/entities" val relPath = s"$t/relation" val mapper = new ObjectMapper() - implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) - implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) - implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) - implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) + implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) + implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) + implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) log.info("Converting dataset") - val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) + val rddDataset = spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset") log.info("Converting publication") - val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) + val rddPublication = spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication") log.info("Converting software") - val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, 
classOf[Software])) + val rddSoftware = spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])) spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software") log.info("Converting otherresearchproduct") - val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) + val rddOtherResearchProduct = spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct") log.info("Converting Relation") - val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin") + val relationSemanticFilter = List("cites", "iscitedby", "merges", "ismergedin") - val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation") + val rddRelation = spark.sparkContext.textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) - .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index 350b00c5e..b6f678967 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -1,14 +1,12 @@ package eu.dnetlib.dhp.sx.graph import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} - - object SparkCreateInputGraph { def main(args: Array[String]): Unit = { @@ -33,7 +31,7 @@ object SparkCreateInputGraph { ) - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) @@ -41,16 +39,13 @@ object SparkCreateInputGraph { implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - - - val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val oafDs:Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] + val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] log.info("Extract Publication") @@ -70,27 +65,27 @@ object SparkCreateInputGraph { resultObject.foreach { r => log.info(s"Make ${r._1} unique") - makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/preprocess/${r._1}",spark, r._2) + makeDatasetUnique(s"$targetPath/extracted/${r._1}", 
s"$targetPath/preprocess/${r._1}", spark, r._2) } } - def extractEntities[T <: Oaf ](oafDs:Dataset[Oaf], targetPath:String, clazz:Class[T], log:Logger) :Unit = { + def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = { - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) log.info(s"Extract ${clazz.getSimpleName}") oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath) } - def makeDatasetUnique[T <: Result ](sourcePath:String, targetPath:String, spark:SparkSession, clazz:Class[T]) :Unit = { + def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = { import spark.implicits._ - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) - val ds:Dataset[T] = spark.read.load(sourcePath).as[T] + val ds: Dataset[T] = spark.read.load(sourcePath).as[T] - ds.groupByKey(_.getId).reduceGroups{(x,y) => + ds.groupByKey(_.getId).reduceGroups { (x, y) => x.mergeFrom(y) x }.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala index e4fcd2782..9930c57af 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities import 
org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.count -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkCreateScholix { @@ -42,7 +42,7 @@ object SparkCreateScholix { val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation] - .filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder)) val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary] @@ -51,54 +51,54 @@ object SparkCreateScholix { relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Relation), (String, ScholixSummary)) => - if (input._1!= null && input._2!= null) { + if (input._1 != null && input._2 != null) { val rel: Relation = input._1._2 val source: ScholixSummary = input._2._2 (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) } - else null + else null }(Encoders.tuple(Encoders.STRING, scholixEncoder)) - .filter(r => r!= null) + .filter(r => r != null) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source") val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Scholix), (String, ScholixSummary)) => - if (input._2== null) { + if (input._2 == null) { null } else { val s: Scholix = input._1._2 val target: ScholixSummary = input._2._2 ScholixUtils.generateCompleteScholix(s, target) } - 
}.filter(s => s!= null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") + }.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix] - .map(s=> (s.getIdentifier,s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) + .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) .groupByKey(_._1) .agg(ScholixUtils.scholixAggregator.toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix") - val scholix_final:Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] + val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] - val stats:Dataset[(String,String,Long)]= scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String,String,Long)] + val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)] stats - .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 )) + .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0)) .groupByKey(_.id) - .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset+b.relatedDataset, a.relatedPublication+b.relatedPublication)) + .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities") - val relatedEntitiesDS:Dataset[RelatedEntities] = 
spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication>0 || r.relatedDataset > 0) + val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) - relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map{i => + relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i => val re = i._1 val sum = i._2._2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala similarity index 68% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala index 0970375f5..4274cae5a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} object SparkCreateSummaryObject { @@ -28,15 +28,15 @@ object SparkCreateSummaryObject { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result] - implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] + implicit val 
oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r=>r.getDataInfo== null || r.getDataInfo.getDeletedbyinference== false) + val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) - ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath) + ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala index 193512474..c70397d04 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala @@ -5,6 +5,7 @@ import org.apache.spark.sql.{Encoder, Encoders} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse + import java.util.regex.Pattern import scala.language.postfixOps import scala.xml.{Elem, Node, XML} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala similarity index 
83% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala index 79c75d6df..f1a4553ea 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala @@ -2,13 +2,12 @@ package eu.dnetlib.dhp.sx.graph.pangaea import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.{Logger, LoggerFactory} -import scala.collection.JavaConverters._ import scala.io.Source - +import scala.collection.JavaConverters._ object SparkGeneratePanagaeaDataset { @@ -28,17 +27,17 @@ object SparkGeneratePanagaeaDataset { parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}")) logger.info("Converting sequential file into Dataset") - val sc:SparkContext = spark.sparkContext + val sc: SparkContext = spark.sparkContext - val workingPath:String = parser.get("workingPath") + val workingPath: String = parser.get("workingPath") implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel] - val inputRDD:RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) + val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) spark.createDataset(inputRDD).as[PangaeaDataModel] - .map(s => (s.identifier,s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) - .groupByKey(_._1)(Encoders.STRING) + .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) + 
.groupByKey(_._1)(Encoders.STRING) .agg(PangaeaUtils.getDatasetAggregator().toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") @@ -46,7 +45,4 @@ object SparkGeneratePanagaeaDataset { } - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index 93c554e04..7b1ddbb8f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.sx.graph.scholix - import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.sx.scholix._ import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} @@ -11,22 +10,23 @@ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse -import scala.collection.JavaConverters._ import scala.io.Source -import scala.language.postfixOps +import scala.collection.JavaConverters._ + object ScholixUtils { val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" - val DATE_RELATION_KEY:String = "RelationDate" - case class RelationVocabulary(original:String, inverse:String){} + val DATE_RELATION_KEY: String = "RelationDate" - case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){} + case class RelationVocabulary(original: String, inverse: String) {} - val relations:Map[String, RelationVocabulary] = { - val input 
=Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString + case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} + + val relations: Map[String, RelationVocabulary] = { + val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -35,12 +35,12 @@ object ScholixUtils { } - def extractRelationDate(relation: Relation):String = { + def extractRelationDate(relation: Relation): String = { - if (relation.getProperties== null || !relation.getProperties.isEmpty) + if (relation.getProperties == null || !relation.getProperties.isEmpty) null else { - val date =relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) + val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) if (date.isDefined) date.get else @@ -48,9 +48,9 @@ object ScholixUtils { } } - def extractRelationDate(summary: ScholixSummary):String = { + def extractRelationDate(summary: ScholixSummary): String = { - if(summary.getDate== null || summary.getDate.isEmpty) + if (summary.getDate == null || summary.getDate.isEmpty) null else { summary.getDate.get(0) @@ -59,15 +59,14 @@ object ScholixUtils { } - def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = { + def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) } - - val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable { + val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), 
RelatedEntities, RelatedEntities] with Serializable { override def zero: RelatedEntities = null override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { @@ -78,17 +77,16 @@ object ScholixUtils { if (b == null) RelatedEntities(a._1, relatedDataset, relatedPublication) else - RelatedEntities(a._1,b.relatedDataset+ relatedDataset, b.relatedPublication+ relatedPublication ) + RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication) } override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { - if (b1!= null && b2!= null) - RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication) + if (b1 != null && b2 != null) + RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication) + else if (b1 != null) + b1 else - if (b1!= null) - b1 - else b2 } @@ -104,12 +102,12 @@ object ScholixUtils { override def zero: Scholix = null - def scholix_complete(s:Scholix):Boolean ={ - if (s== null || s.getIdentifier==null) { + def scholix_complete(s: Scholix): Boolean = { + if (s == null || s.getIdentifier == null) { false } else if (s.getSource == null || s.getTarget == null) { - false - } + false + } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) false else @@ -121,7 +119,7 @@ object ScholixUtils { } override def merge(b1: Scholix, b2: Scholix): Scholix = { - if (scholix_complete(b1)) b1 else b2 + if (scholix_complete(b1)) b1 else b2 } override def finish(reduction: Scholix): Scholix = reduction @@ -132,7 +130,7 @@ object ScholixUtils { } - def createInverseScholixRelation(scholix: Scholix):Scholix = { + def createInverseScholixRelation(scholix: Scholix): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) s.setPublisher(scholix.getPublisher) @@ -144,34 +142,33 @@ object ScholixUtils { s - } - def extractCollectedFrom(summary:ScholixSummary): 
List[ScholixEntityId] = { - if (summary.getDatasources!= null && !summary.getDatasources.isEmpty) { - val l: List[ScholixEntityId] = summary.getDatasources.asScala.map{ + def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = { + if (summary.getDatasources != null && !summary.getDatasources.isEmpty) { + val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava) }(collection.breakOut) - l + l } else List() } - def extractCollectedFrom(relation: Relation) : List[ScholixEntityId] = { + def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => - new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA,null)).asJava) + new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava) }(collection breakOut) l } else List() } - def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = { + def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) s.setPublisher(scholix.getPublisher) @@ -192,29 +189,28 @@ object ScholixUtils { r.setObjectType(summaryObject.getTypology.toString) r.setObjectSubType(summaryObject.getSubType) - if (summaryObject.getTitle!= null && !summaryObject.getTitle.isEmpty) - r.setTitle(summaryObject.getTitle.get(0)) + if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty) + r.setTitle(summaryObject.getTitle.get(0)) - if (summaryObject.getAuthor!= null && !summaryObject.getAuthor.isEmpty){ - val l:List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a,null)).toList + if (summaryObject.getAuthor != 
null && !summaryObject.getAuthor.isEmpty) { + val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList if (l.nonEmpty) r.setCreator(l.asJava) } - if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty) + if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) r.setPublicationDate(summaryObject.getDate.get(0)) - if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty) - { - val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList + if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) { + val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList if (plist.nonEmpty) r.setPublisher(plist.asJava) } - if (summaryObject.getDatasources!= null && !summaryObject.getDatasources.isEmpty) { + if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) { - val l:List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( + val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava) , "collected", "complete" @@ -228,12 +224,9 @@ object ScholixUtils { } + def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { - - - def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = { - - if (relation== null || source== null) + if (relation == null || source == null) return null val s = new Scholix @@ -253,9 +246,9 @@ object ScholixUtils { s.setPublicationDate(d) - if (source.getPublisher!= null && !source.getPublisher.isEmpty) { + if (source.getPublisher != null && !source.getPublisher.isEmpty) { val l: List[ScholixEntityId] = source.getPublisher.asScala - .map{ + .map { p => new 
ScholixEntityId(p, null) }(collection.breakOut) @@ -265,7 +258,7 @@ object ScholixUtils { } val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) - if (semanticRelation== null) + if (semanticRelation == null) return null s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)) s.setSource(generateScholixResourceFromSummary(source)) @@ -274,8 +267,8 @@ object ScholixUtils { } - def findURLForPID(pidValue:List[StructuredProperty], urls:List[String]):List[(StructuredProperty, String)] = { - pidValue.map{ + def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = { + pidValue.map { p => val pv = p.getValue @@ -285,67 +278,67 @@ object ScholixUtils { } - def extractTypedIdentifierFromInstance(r:Result):List[ScholixIdentifier] = { + def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { if (r.getInstance() == null || r.getInstance().isEmpty) return List() - r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty) - .filter(i => i.getPid!= null && i.getUrl != null) + r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty) + .filter(i => i.getPid != null && i.getUrl != null) .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList } - def resultToSummary(r:Result):ScholixSummary = { + def resultToSummary(r: Result): ScholixSummary = { val s = new ScholixSummary s.setId(r.getId) if (r.getPid == null || r.getPid.isEmpty) return null - val persistentIdentifiers:List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) + val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - if (r.isInstanceOf[Publication] ) + if 
(r.isInstanceOf[Publication]) s.setTypology(Typology.publication) else s.setTypology(Typology.dataset) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) - if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) { - val titles:List[String] =r.getTitle.asScala.map(t => t.getValue)(collection breakOut) + if (r.getTitle != null && r.getTitle.asScala.nonEmpty) { + val titles: List[String] = r.getTitle.asScala.map(t => t.getValue)(collection breakOut) if (titles.nonEmpty) s.setTitle(titles.asJava) else - return null + return null } - if(r.getAuthor!= null && !r.getAuthor.isEmpty) { - val authors:List[String] = r.getAuthor.asScala.map(a=> a.getFullname)(collection breakOut) + if (r.getAuthor != null && !r.getAuthor.isEmpty) { + val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname)(collection breakOut) if (authors nonEmpty) s.setAuthor(authors.asJava) } if (r.getInstance() != null) { - val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) + val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) if (dt.nonEmpty) s.setDate(dt.distinct.asJava) } - if (r.getDescription!= null && !r.getDescription.isEmpty) { - val d = r.getDescription.asScala.find(f => f!= null && f.getValue!=null) + if (r.getDescription != null && !r.getDescription.isEmpty) { + val d = r.getDescription.asScala.find(f => f != null && f.getValue != null) if (d.isDefined) s.setDescription(d.get.getValue) } - if (r.getSubject!= null && !r.getSubject.isEmpty) { - val subjects:List[SchemeValue] =r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut) + if (r.getSubject != null && !r.getSubject.isEmpty) { + val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection 
breakOut) if (subjects.nonEmpty) s.setSubject(subjects.asJava) } - if (r.getPublisher!= null) + if (r.getPublisher != null) s.setPublisher(List(r.getPublisher.getValue).asJava) - if (r.getCollectedfrom!= null && !r.getCollectedfrom.isEmpty) { - val cf:List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) + if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { + val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) if (cf.nonEmpty) s.setDatasources(cf.distinct.asJava) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset index 05c875148..2c73183e2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset @@ -1,3 +1,3 @@ -{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","s
chemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} -{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologi
es","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} -{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","sche
mename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - 
geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages 
D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} \ No newline at end of file +{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classificatio
n_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} +{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classif
ication_typologies","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} +{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_t
ypologies","schemename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - 
geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages 
D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala similarity index 96% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala index a3a753a8a..7abce547f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala @@ -3,13 +3,9 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo -import eu.dnetlib.dhp.schema.oaf.{Datasource, OpenAccessRoute, Publication} -import javax.management.openmbean.OpenMBeanAttributeInfo import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} -import org.json4s import org.json4s.DefaultFormats -import eu.dnetlib.dhp.schema.common.ModelConstants import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala 
b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala index 5b00e9b6f..0922f2e19 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala @@ -4,10 +4,9 @@ import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats -import org.junit.jupiter.api.Assertions.{assertNotNull, assertTrue} -import org.junit.jupiter.api.Test -import org.junit.jupiter.api.Assertions._ import org.json4s.jackson.Serialization.write +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.Test class TestPreprocess extends java.io.Serializable{ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala index 9a142d3c0..f1bd841d1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -159,6 +159,7 @@ class ResolveEntitiesTest extends Serializable { val datDS:Dataset[Result] = 
spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset)) + val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala similarity index 95% rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala index b90827e81..0d89cca85 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.sx.pangaea import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils import org.junit.jupiter.api.Test -import java.util.TimeZone import java.text.SimpleDateFormat import java.util.Date import scala.io.Source From 9c82d670b87219fa28d21b582b8147af957de5ea Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 17 Nov 2021 12:31:02 +0100 Subject: [PATCH 08/11] make class public in order to create javadoc --- .../eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java | 2 +- 
.../oa/matchers/simple/EnrichMissingPublicationDateTest.java | 2 +- .../eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java | 2 +- .../test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java | 2 +- .../eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java (100%) diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java index 45bfc785f..52e9917bb 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; @ExtendWith(MockitoExtension.class) -class UpdateMatcherTest { +public class UpdateMatcherTest { UpdateMatcher matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java index 550ded9f4..5af81a31a 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java @@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test; import eu.dnetlib.broker.objects.OaBrokerMainEntity; -class EnrichMissingPublicationDateTest { +public class EnrichMissingPublicationDateTest { final 
EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java index b532aa9f7..d93390e4a 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java @@ -8,7 +8,7 @@ import java.util.Arrays; import org.junit.jupiter.api.Test; -class SubscriptionUtilsTest { +public class SubscriptionUtilsTest { @Test void testVerifyListSimilar() { diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java index a8bc03e31..117bdeef4 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; -class TrustUtilsTest { +public class TrustUtilsTest { private static final double THRESHOLD = 0.95; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java From bd9a43cefdd54b51a4cc542a3b8cd87696d59003 Mon Sep 17 00:00:00 2001 From: 
Claudio Atzori Date: Fri, 19 Nov 2021 09:20:43 +0100 Subject: [PATCH 09/11] Revert to 4094f2bb9ab5ad87a31b6691b8b853542d6b7685 --- .../oaf/utils/GraphCleaningFunctions.java | 4 +- .../scholix/SparkCreateActionset.scala | 0 .../scholix/SparkSaveActionSet.scala | 2 +- .../dhp/collection/CollectionUtils.scala | 0 .../dhp/datacite/AbstractRestClient.scala | 0 .../dhp/datacite/DataciteAPIImporter.scala | 0 .../DataciteToOAFTransformation.scala | 0 .../GenerateDataciteDatasetSpark.scala | 3 +- .../dnetlib/dhp/datacite/ImportDatacite.scala | 0 .../SparkDownloadUpdateDatacite.scala | 0 .../eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala | 1 - .../bio/SparkTransformBioDatabaseToOAF.scala | 12 +- .../ebi/SparkCreateBaselineDataFrame.scala | 2 +- .../sx/bio/ebi/SparkDownloadEBILinks.scala | 3 +- .../dhp/sx/bio/ebi/SparkEBILinksToOaf.scala | 5 +- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 0 .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 20 ++- .../dhp/datacite/DataciteToOAFTest.scala | 0 .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 0 .../{ => actionmanager}/datacite/record.json | 0 .../broker/oa/matchers/UpdateMatcherTest.java | 2 +- .../EnrichMissingPublicationDateTest.java | 2 +- .../broker/oa/util/SubscriptionUtilsTest.java | 2 +- .../dhp/broker/oa/util/TrustUtilsTest.java | 2 +- .../doiboost/DoiBoostMappingUtil.scala | 0 .../SparkGenerateDOIBoostActionSet.scala | 0 .../doiboost/SparkGenerateDoiBoost.scala | 0 .../doiboost/crossref/Crossref2Oaf.scala | 9 +- .../doiboost/crossref/CrossrefDataset.scala | 25 +-- .../doiboost/crossref/CrossrefImporter.java | 0 .../dnetlib/doiboost/crossref/ESClient.java | 0 .../crossref/ExtractCrossrefRecords.java | 0 .../crossref/GenerateCrossrefDataset.scala | 20 ++- .../crossref/SparkMapDumpIntoOAF.scala | 4 +- .../crossref/UnpackCrtossrefEntries.scala | 9 +- .../dnetlib/doiboost/mag/MagDataModel.scala | 2 +- .../mag/SparkImportMagIntoDataset.scala | 7 +- .../doiboost/mag/SparkProcessMAG.scala | 20 ++- .../dnetlib/doiboost/orcid/ORCIDToOAF.scala | 7 
+- .../orcid/SparkConvertORCIDToOAF.scala | 8 +- .../doiboost/orcid/SparkPreprocessORCID.scala | 29 ++-- .../doiboost/uw/SparkMapUnpayWallToOAF.scala | 8 +- .../dnetlib/doiboost/uw/UnpayWallToOAF.scala | 3 +- .../doiboost/DoiBoostHostedByMapTest.scala | 2 +- .../dhp}/doiboost/NormalizeDoiTest.scala | 2 +- .../crossref/CrossrefMappingTest.scala | 0 .../dnetlib/doiboost/mag/MAGMappingTest.scala | 2 +- .../orcid/MappingORCIDToOAFTest.scala | 3 +- .../doiboost/uw/UnpayWallMappingTest.scala | 4 +- .../oa/graph/hostedbymap/Aggregators.scala | 2 +- .../SparkApplyHostedByMapToDatasource.scala | 13 +- .../SparkApplyHostedByMapToResult.scala | 18 ++- .../SparkPrepareHostedByInfoToApply.scala | 35 +++-- .../hostedbymap/SparkProduceHostedByMap.scala | 109 ++++++------- .../raw/CopyHdfsOafSparkApplication.scala | 10 +- .../resolution/SparkResolveEntities.scala | 25 +-- .../resolution/SparkResolveRelation.scala | 2 +- .../sx/graphimport/SparkDataciteToOAF.scala | 31 ++++ .../graph/SparkConvertDatasetToJsonRDD.scala | 10 +- .../sx/graph/SparkConvertObjectToJson.scala | 10 +- .../sx/graph/SparkConvertRDDtoDataset.scala | 27 ++-- .../dhp/sx/graph/SparkCreateInputGraph.scala | 27 ++-- .../dhp/sx/graph/SparkCreateScholix.scala | 28 ++-- .../sx/graph/SparkCreateSummaryObject.scala | 12 +- .../dhp/sx/graph/pangaea/PangaeaUtils.scala | 1 - .../SparkGeneratePanagaeaDataset.scala | 18 ++- .../dhp/sx/graph/scholix/ScholixUtils.scala | 143 +++++++++--------- .../dhp/oa/graph/hostedbymap/TestApply.scala | 0 .../oa/graph/hostedbymap/TestPrepare.scala | 4 + .../oa/graph/hostedbymap/TestPreprocess.scala | 5 +- .../resolution/ResolveEntitiesTest.scala | 1 - .../sx/graph/scholix/ScholixGraphTest.scala | 0 .../dhp/sx/pangaea/PangaeaTransformTest.scala | 1 + .../dnetlib/dhp/oa/graph/resolution/dataset | 6 +- 74 files changed, 431 insertions(+), 331 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala (100%) rename 
dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala (96%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/collection/CollectionUtils.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/AbstractRestClient.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala (95%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/ImportDatacite.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala (99%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala (73%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala (98%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala (98%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala (93%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala (100%) rename dhp-workflows/dhp-aggregation/src/main/{scala => java}/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala (96%) rename dhp-workflows/dhp-aggregation/src/test/{scala => java}/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala (100%) rename dhp-workflows/dhp-aggregation/src/test/{scala => 
java}/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala (100%) rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/{ => actionmanager}/datacite/record.json (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala (99%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala (77%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/CrossrefImporter.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/ESClient.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala (73%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala (96%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala (88%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/mag/MagDataModel.scala (100%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala (98%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala (91%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala (98%) rename dhp-workflows/dhp-doiboost/src/main/{scala => 
java}/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala (84%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala (67%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala (80%) rename dhp-workflows/dhp-doiboost/src/main/{scala => java}/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala (98%) rename dhp-workflows/dhp-doiboost/src/test/{scala/eu/dnetlib => java/eu/dnetlib/dhp}/doiboost/DoiBoostHostedByMapTest.scala (98%) rename dhp-workflows/dhp-doiboost/src/test/{scala/eu/dnetlib => java/eu/dnetlib/dhp}/doiboost/NormalizeDoiTest.scala (96%) rename dhp-workflows/dhp-doiboost/src/test/{scala => java}/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala (100%) rename dhp-workflows/dhp-doiboost/src/test/{scala => java}/eu/dnetlib/doiboost/mag/MAGMappingTest.scala (100%) rename dhp-workflows/dhp-doiboost/src/test/{scala => java}/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala (99%) rename dhp-workflows/dhp-doiboost/src/test/{scala => java}/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala (81%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala (85%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala (74%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala (61%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala (88%) rename 
dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala (79%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala (99%) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala (69%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala (83%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala (62%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala (76%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala (76%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala (68%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala (99%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala (83%) rename dhp-workflows/dhp-graph-mapper/src/main/{scala => java}/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala (61%) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala (96%) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala (98%) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala (99%) rename 
dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala (100%) rename dhp-workflows/dhp-graph-mapper/src/test/{scala => java}/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala (95%) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 43413b311..d8b1cded8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -27,8 +27,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; - public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d"; - public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; + public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; public static T fixVocabularyNames(T value) { if (value instanceof Datasource) { diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala similarity index 96% rename from 
dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala index 62d219b57..1df7ea3fb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.actionmanager.scholix import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, Oaf, Publication, Software, OtherResearchProduct, Relation} +import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala diff --git 
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala similarity index 95% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index 3c8caa485..a63627d1c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -3,7 +3,8 @@ package eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations -import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH} +import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH +import 
eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} import eu.dnetlib.dhp.schema.oaf.Oaf diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index 853b24862..70dcc0184 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -7,7 +7,6 @@ import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.{compact, parse, render} import collection.JavaConverters._ - object BioDBToOAF { case class EBILinkItem(id: Long, links: String) {} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala similarity index 73% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index fcceacd44..8ae8285e3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -1,9 +1,9 @@ package eu.dnetlib.dhp.sx.bio import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.schema.oaf.Oaf -import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved +import BioDBToOAF.ScholixResolved +import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} @@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF { import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "PDB" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => 
BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "SCHOLIX" => - spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "CROSSREF_LINKS" => - spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 660a26a6c..17d21f19c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.oaf.Result -import eu.dnetlib.dhp.sx.bio.pubmed._ +import 
eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala index 18e39387f..eab6b1dc6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala @@ -1,8 +1,9 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem +import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal import org.apache.commons.io.IOUtils import org.apache.http.client.config.RequestConfig import org.apache.http.client.methods.HttpGet diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index 12af4824b..8da617ca0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -1,10 +1,11 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem +import BioDBToOAF.EBILinkItem +import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -37,7 +38,7 @@ object SparkEBILinksToOaf { ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) - .flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null) + .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null) .write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala similarity index 96% rename from dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index d5d40ecfe..ecef32202 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -4,9 +4,10 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ +import scala.collection.JavaConverters._ import java.util.regex.Pattern -import collection.JavaConverters._ + /** * */ @@ -21,10 +22,10 @@ object PubMedToOaf { val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + /** * Cleaning the DOI Applying regex in order to * remove doi starting with URL - * * @param doi input DOI * @return cleaned DOI */ @@ -48,7 +49,7 @@ object PubMedToOaf { * starting from OAF instanceType value * * @param cobjQualifier OAF instance type - * @param vocabularies All dnet vocabularies + * @param vocabularies All dnet vocabularies * @return the correct instance */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { @@ -64,7 +65,7 @@ object PubMedToOaf { } /** - * Mapping the Pubmedjournal info into the OAF Journale + * Mapping the Pubmedjournal info into the OAF Journale * * @param j the pubmedJournal * @return the OAF Journal @@ -90,8 +91,9 @@ object PubMedToOaf { * Find vocabulary term into synonyms and term in the vocabulary * * @param vocabularyName the input vocabulary name - * @param vocabularies all the vocabularies - * @param term the term to find + * @param vocabularies all the vocabularies + * @param term the term to find + * * @return the cleaned term value */ def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { @@ -102,9 +104,10 @@ object PubMedToOaf { /** - * Map the Pubmed Article into the OAF instance + * Map the Pubmed Article into the OAF instance * - * @param article the pubmed articles + * + * @param article the 
pubmed articles * @param vocabularies the vocabularies * @return The OAF instance if the mapping did not fail */ @@ -182,6 +185,7 @@ object PubMedToOaf { //-------------------------------------------------------------------------------------- + // RESULT MAPPING //-------------------------------------------------------------------------------------- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/datacite/record.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/datacite/record.json rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record.json diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java index 52e9917bb..45bfc785f 100644 --- 
a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; @ExtendWith(MockitoExtension.class) -public class UpdateMatcherTest { +class UpdateMatcherTest { UpdateMatcher matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java index 5af81a31a..550ded9f4 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java @@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test; import eu.dnetlib.broker.objects.OaBrokerMainEntity; -public class EnrichMissingPublicationDateTest { +class EnrichMissingPublicationDateTest { final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java index d93390e4a..b532aa9f7 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java @@ -8,7 +8,7 @@ import java.util.Arrays; import org.junit.jupiter.api.Test; -public class SubscriptionUtilsTest { +class SubscriptionUtilsTest { @Test void 
testVerifyListSimilar() { diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java index 117bdeef4..a8bc03e31 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; -public class TrustUtilsTest { +class TrustUtilsTest { private static final double THRESHOLD = 0.95; diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala rename to 
dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index edca4a180..1b1c850ba 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -4,19 +4,20 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.utils.DHPUtils -import eu.dnetlib.doiboost.DoiBoostMappingUtil -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ +import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _} import org.apache.commons.lang.StringUtils import org.json4s import org.json4s.DefaultFormats -import org.json4s.JsonAST._ +import org.json4s.JsonAST.{JValue, _} import org.json4s.jackson.JsonMethods._ import org.slf4j.{Logger, LoggerFactory} -import java.util import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex +import java.util + +import eu.dnetlib.doiboost.DoiBoostMappingUtil case class CrossrefDT(doi: String, json:String, timestamp: Long) {} diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala similarity index 77% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala rename to 
dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 6a1c701af..159b817c7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf import org.apache.spark.sql.expressions.Aggregator -import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -17,12 +17,12 @@ object CrossrefDataset { val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) - def to_item(input: String): CrossrefDT = { + def to_item(input:String):CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val ts: Long = (json \ "indexed" \ "timestamp").extract[Long] - val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) CrossrefDT(doi, input, ts) } @@ -30,6 +30,7 @@ object CrossrefDataset { def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) parser.parseArgument(args) @@ -53,7 +54,7 @@ object CrossrefDataset { return b - if (a.timestamp > b.timestamp) { + if(a.timestamp >b.timestamp) { return a } b @@ -65,7 +66,7 @@ object CrossrefDataset { if (a == null) return b - if (a.timestamp > b.timestamp) { + if(a.timestamp >b.timestamp) { 
return a } b @@ -78,20 +79,20 @@ object CrossrefDataset { override def finish(reduction: CrossrefDT): CrossrefDT = reduction } - val workingPath: String = parser.get("workingPath") + val workingPath:String = parser.get("workingPath") - val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] + val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] val update = - spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) - .map(i => CrossrefImporter.decompressBlob(i._2.toString)) - .map(i => to_item(i))) + spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) + .map(i =>CrossrefImporter.decompressBlob(i._2.toString)) + .map(i =>to_item(i))) main_ds.union(update).groupByKey(_.doi) .agg(crossrefAggregator.toColumn) - .map(s => s._2) + .map(s=>s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefImporter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/CrossrefImporter.java rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ESClient.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ESClient.java rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ExtractCrossrefRecords.java diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala similarity index 73% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala index 6d03abc25..526ff7b3a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/GenerateCrossrefDataset.scala @@ -2,12 +2,17 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.doiboost.DoiBoostMappingUtil +import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item +import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass +import org.apache.hadoop.io.{IntWritable, Text} +import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats -import org.json4s.jackson.JsonMethods.parse +import org.json4s.JsonAST.JArray +import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.slf4j.{Logger, LoggerFactory} import scala.io.Source @@ -19,10 +24,11 @@ object GenerateCrossrefDataset { implicit val mrEncoder: 
Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT] + def crossrefElement(meta: String): CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(meta) - val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) + val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String]) val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long] CrossrefDT(doi, meta, timestamp) @@ -45,14 +51,14 @@ object GenerateCrossrefDataset { import spark.implicits._ - val tmp: RDD[String] = sc.textFile(sourcePath, 6000) + val tmp : RDD[String] = sc.textFile(sourcePath,6000) spark.createDataset(tmp) .map(entry => crossrefElement(entry)) .write.mode(SaveMode.Overwrite).save(targetPath) - // .map(meta => crossrefElement(meta)) - // .toDS.as[CrossrefDT] - // .write.mode(SaveMode.Overwrite).save(targetPath) +// .map(meta => crossrefElement(meta)) +// .toDS.as[CrossrefDT] +// .write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala similarity index 96% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index fa55b9fb9..c65916610 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -4,8 +4,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset} import org.apache.commons.io.IOUtils + import org.apache.spark.SparkConf -import 
org.apache.spark.sql._ + +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala similarity index 88% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala index 191c4587e..95ecb568b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/UnpackCrtossrefEntries.scala @@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.hadoop.io.compress.GzipCodec -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST.JArray @@ -17,7 +17,9 @@ object UnpackCrtossrefEntries { val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) - def extractDump(input: String): List[String] = { + + + def extractDump(input:String):List[String] = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -28,6 +30,7 @@ object UnpackCrtossrefEntries { } + def main(args: Array[String]): Unit = { val conf = new SparkConf val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) @@ -42,7 +45,7 @@ object UnpackCrtossrefEntries { .getOrCreate() val sc: SparkContext = 
spark.sparkContext - sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2)) + sc.wholeTextFiles(sourcePath,6000).flatMap(d =>extractDump(d._2)) .saveAsTextFile(targetPath, classOf[GzipCodec]) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 0a6fa00f0..fd9629024 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.doiboost.DoiBoostMappingUtil -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse +import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import scala.collection.JavaConverters._ import scala.collection.mutable diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala index d25a4893f..a68d0bb2d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkImportMagIntoDataset.scala @@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.types._ import org.apache.spark.sql.{SaveMode, SparkSession} +import org.apache.spark.sql.types._ import org.slf4j.{Logger, LoggerFactory} object SparkImportMagIntoDataset { @@ -24,13 +24,13 @@ object SparkImportMagIntoDataset { "Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")), "Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), + "ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", 
"OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")), "ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), "EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")), "FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")), "FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")), "FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), - "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")), + "Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")), "PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", 
"IndexedAbstract:string")), "PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")), "PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")), @@ -75,6 +75,7 @@ object SparkImportMagIntoDataset { .master(parser.get("master")).getOrCreate() + stream.foreach { case (k, v) => val s: StructType = getSchema(k) val df = spark.read diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala similarity index 91% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 932725446..016279787 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -5,16 +5,19 @@ import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql.functions.{col, collect_list, struct} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions._ import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} + import scala.collection.JavaConverters._ + object SparkProcessMAG { - def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { + def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={ d.where(col("Doi").isNotNull) .groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING) - .reduceGroups((p1: MagPapers, p2: MagPapers) => 
ConversionUtil.choiceLatestMagArtitcle(p1, p2)) + .reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2)) .map(_._2)(Encoders.product[MagPapers]) .map(mp => { new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), @@ -95,13 +98,13 @@ object SparkProcessMAG { var magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingPath/merge_step_2").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] + .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] val conference = spark.read.load(s"$sourcePath/ConferenceInstances") - .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate") + .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate",$"EndDate" ) val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci"))) - .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] + .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate",$"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance] magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left") @@ -119,7 +122,7 @@ object SparkProcessMAG { magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left") .map(item => ConversionUtil.updatePubsWithDescription(item) - ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") + ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4") logger.info("Phase 7) Enrich Publication with FieldOfStudy") @@ -145,10 +148,11 @@ object SparkProcessMAG { spark.read.load(s"$workingPath/mag_publication").as[Publication] .filter(p => p.getId == null) .groupByKey(p => p.getId) - .reduceGroups((a: Publication, b: Publication) => 
ConversionUtil.mergePublication(a, b)) + .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 11031f9ca..1cd3f7028 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -4,16 +4,17 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} +import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} import org.apache.commons.lang.StringUtils +import org.slf4j.{Logger, LoggerFactory} + +import scala.collection.JavaConverters._ import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ import org.json4s.jackson.JsonMethods._ -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){} diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala similarity index 84% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala rename to 
dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala index 1b189e296..fa4a93e00 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkConvertORCIDToOAF.scala @@ -11,10 +11,10 @@ object SparkConvertORCIDToOAF { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = { + def run(spark:SparkSession, workingPath:String, targetPath:String) :Unit = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] import spark.implicits._ - val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] + val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem] logger.info("Converting ORCID to OAF") dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath) @@ -35,8 +35,8 @@ object SparkConvertORCIDToOAF { val workingPath = parser.get("workingPath") val targetPath = parser.get("targetPath") - run(spark, workingPath, targetPath) + run(spark,workingPath, targetPath) } -} +} \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala similarity index 67% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala index 153be5dd1..31f331912 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SparkPreprocessORCID.scala @@ -1,45 
+1,48 @@ package eu.dnetlib.doiboost.orcid +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.oa.merge.AuthorMerger import eu.dnetlib.dhp.schema.oaf.Publication +import eu.dnetlib.dhp.schema.orcid.OrcidDOI import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions.{col, collect_list} -import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkPreprocessORCID { val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass) - def fixORCIDItem(item: ORCIDItem): ORCIDItem = { - ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) + def fixORCIDItem(item :ORCIDItem):ORCIDItem = { + ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList) } - def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = { + def run(spark:SparkSession,sourcePath:String,workingPath:String):Unit = { import spark.implicits._ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication] - val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s)) + val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s)) spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author") - val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null) + val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => 
ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null) spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works") - val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] + val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor] - val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] + val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork] works.joinWith(authors, authors("oid").equalTo(works("oid"))) - .map(i => { + .map(i =>{ val doi = i._1.doi val author = i._2 - (doi, author) - }).groupBy(col("_1").alias("doi")) + (doi, author) + }).groupBy(col("_1").alias("doi")) .agg(collect_list(col("_2")).alias("authors")).as[ORCIDItem] .map(s => fixORCIDItem(s)) .write.mode(SaveMode.Overwrite).save(s"$workingPath/orcidworksWithAuthor") @@ -64,4 +67,4 @@ object SparkPreprocessORCID { } -} +} \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala similarity index 80% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala index 70290018d..4530926f1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/SparkMapUnpayWallToOAF.scala @@ -1,14 +1,16 @@ package eu.dnetlib.doiboost.uw import eu.dnetlib.dhp.application.ArgumentApplicationParser + import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import 
org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + object SparkMapUnpayWallToOAF { def main(args: Array[String]): Unit = { @@ -30,11 +32,11 @@ object SparkMapUnpayWallToOAF { val sourcePath = parser.get("sourcePath") val targetPath = parser.get("targetPath") - val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath") + val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath") logger.info("Converting UnpayWall to OAF") - val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication] + val d:Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication] d.write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala index bf5694965..c8324cde1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/uw/UnpayWallToOAF.scala @@ -4,13 +4,14 @@ import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil -import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ +import 
eu.dnetlib.doiboost.DoiBoostMappingUtil._ +import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala similarity index 98% rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala index 049ac37f4..4912648be 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/DoiBoostHostedByMapTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/DoiBoostHostedByMapTest.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.doiboost +package eu.dnetlib.dhp.doiboost import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset} import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType} diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala similarity index 96% rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala index bdf845f19..a9a841ee9 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/NormalizeDoiTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/NormalizeDoiTest.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.doiboost +package eu.dnetlib.dhp.doiboost import eu.dnetlib.doiboost.DoiBoostMappingUtil import org.junit.jupiter.api.Test diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala similarity index 100% 
rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala index 7403e103e..46d4ec08d 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/mag/MAGMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/mag/MAGMappingTest.scala @@ -3,9 +3,9 @@ package eu.dnetlib.doiboost.mag import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, SparkSession} import org.codehaus.jackson.map.ObjectMapper -import org.json4s.DefaultFormats import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test +import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} import java.sql.Timestamp diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala similarity index 99% rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala index a5ce6296c..b484dc087 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -10,9 +10,10 @@ import org.junit.jupiter.api.io.TempDir import org.slf4j.{Logger, 
LoggerFactory} import java.nio.file.Path -import scala.collection.JavaConversions._ import scala.io.Source +import scala.collection.JavaConversions._ + class MappingORCIDToOAFTest { val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val mapper = new ObjectMapper() diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala similarity index 100% rename from dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala rename to dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala index 012ed3da0..fa696fffc 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/uw/UnpayWallMappingTest.scala @@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.uw import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute -import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test -import org.slf4j.{Logger, LoggerFactory} import scala.io.Source +import org.junit.jupiter.api.Assertions._ +import org.slf4j.{Logger, LoggerFactory} class UnpayWallMappingTest { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala index ad4e1c96e..ce383292c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/Aggregators.scala @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo -import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn} +import org.apache.spark.sql.expressions.Aggregator case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala similarity index 81% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala index 38af3eee4..1b18ba3ae 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToDatasource.scala @@ -2,12 +2,13 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.Datasource +import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, 
Encoders, SaveMode, SparkSession} import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} @@ -51,18 +52,18 @@ object SparkApplyHostedByMapToDatasource { val mapper = new ObjectMapper() - val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") + val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource") .map(r => mapper.readValue(r, classOf[Datasource])) - val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath) + val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath) .map(ei => mapper.readValue(ei, classOf[EntityInfo]))) - applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) + applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) spark.read.textFile(outputPath) .write .mode(SaveMode.Overwrite) - .option("compression", "gzip") + .option("compression","gzip") .text(graphPath + "/datasource") } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala similarity index 85% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index d360da2e9..0e047d016 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -5,13 +5,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import 
eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils -import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication} +import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} + import scala.collection.JavaConverters._ + + object SparkApplyHostedByMapToResult { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { @@ -36,7 +39,6 @@ object SparkApplyHostedByMapToResult { p })(Encoders.bean(classOf[Publication])) } - def main(args: Array[String]): Unit = { @@ -65,18 +67,18 @@ object SparkApplyHostedByMapToResult { implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) val mapper = new ObjectMapper() - val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication") + val pubs : Dataset[Publication] = spark.read.textFile(graphPath + "/publication") .map(r => mapper.readValue(r, classOf[Publication])) - val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) - .map(ei => mapper.readValue(ei, classOf[EntityInfo])) + val pinfo : Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath) + .map(ei => mapper.readValue(ei, classOf[EntityInfo])) - applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath) + applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath) spark.read.textFile(outputPath) .write .mode(SaveMode.Overwrite) - .option("compression", "gzip") + .option("compression","gzip") .text(graphPath + "/publication") } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala similarity index 74% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala index 87e203e4b..b7a7d352f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkPrepareHostedByInfoToApply.scala @@ -3,58 +3,61 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo + import eu.dnetlib.dhp.schema.oaf.{Journal, Publication} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} + + object SparkPrepareHostedByInfoToApply { implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) - def getList(id: String, j: Journal, name: String): List[EntityInfo] = { - var lst: List[EntityInfo] = List() + def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = { + var lst:List[EntityInfo] = List() - if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) { + if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){ lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst } - if (j.getIssnOnline != 
null && !j.getIssnOnline.equals("")) { + if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){ lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst } - if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")) { + if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){ lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst } lst } - def prepareResultInfo(spark: SparkSession, publicationPath: String): Dataset[EntityInfo] = { + def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = { implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication]) val mapper = new ObjectMapper() - val dd: Dataset[Publication] = spark.read.textFile(publicationPath) + val dd : Dataset[Publication] = spark.read.textFile(publicationPath) .map(r => mapper.readValue(r, classOf[Publication])) - dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, "")) + dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, "")) } - def toEntityInfo(input: String): EntityInfo = { + def toEntityInfo(input:String): EntityInfo = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]] + val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]] toEntityItem(c.keys.head, c.values.head) } - def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = { + def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = { EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess) @@ -64,7 +67,7 @@ object SparkPrepareHostedByInfoToApply { Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left") .map(t2 => { val res: EntityInfo = t2._1 - if (t2._2 != null) { + if(t2._2 != null ){ val ds = t2._2 
res.setHostedById(ds.getId) res.setOpenAccess(ds.getOpenAccess) @@ -104,10 +107,10 @@ object SparkPrepareHostedByInfoToApply { //STEP1: read the hostedbymap and transform it in EntityInfo - val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) + val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo) - //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) - val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") + //STEP2: create association (publication, issn), (publication, eissn), (publication, lissn) + val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication") //STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just //one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala index 6dfe35623..1ee1d5d1a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkProduceHostedByMap.scala @@ -1,39 +1,41 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap -import com.fasterxml.jackson.databind.ObjectMapper import 
eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel} import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.commons.io.IOUtils -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.json4s.DefaultFormats import org.slf4j.{Logger, LoggerFactory} - +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.FileSystem +import org.apache.hadoop.fs.Path import java.io.PrintWriter +import org.apache.hadoop.io.compress.GzipCodec + + object SparkProduceHostedByMap { implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]) - def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = { + def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)) : HostedByItemType = { val openaire: HostedByInfo = input._1._1 val doaj: HostedByInfo = input._1._2 val gold: HostedByInfo = input._2 val isOpenAccess: Boolean = doaj == null && gold == null openaire.journal_id match { - case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) - case Constants.EISSN => HostedByItemType(openaire.id, openaire.officialname, "", openaire.journal_id, "", isOpenAccess) - case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) + case Constants.ISSN => HostedByItemType(openaire.id, openaire.officialname, openaire.journal_id, "", "", isOpenAccess) + case Constants.EISSN => HostedByItemType(openaire.id, 
openaire.officialname, "", openaire.journal_id, "", isOpenAccess) + case Constants.ISSNL => HostedByItemType(openaire.id, openaire.officialname, "", "", openaire.journal_id, isOpenAccess) // catch the default with a variable so you can print it - case whoa => null + case whoa => null } } @@ -42,7 +44,7 @@ object SparkProduceHostedByMap { implicit val formats = org.json4s.DefaultFormats - val map: Map[String, HostedByItemType] = Map(input._1 -> input._2) + val map: Map [String, HostedByItemType] = Map (input._1 -> input._2 ) Serialization.write(map) @@ -50,33 +52,34 @@ object SparkProduceHostedByMap { } - def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { - if (issn != null) { - if (eissn != null) { - if (issnl != null) { - HostedByItemType(id, officialname, issn, eissn, issnl, oa) - } else { - HostedByItemType(id, officialname, issn, eissn, "", oa) + + def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = { + if(issn != null){ + if(eissn != null){ + if(issnl != null){ + HostedByItemType(id, officialname, issn, eissn, issnl , oa) + }else{ + HostedByItemType(id, officialname, issn, eissn, "" , oa) } - } else { - if (issnl != null) { - HostedByItemType(id, officialname, issn, "", issnl, oa) - } else { - HostedByItemType(id, officialname, issn, "", "", oa) + }else{ + if(issnl != null){ + HostedByItemType(id, officialname, issn, "", issnl , oa) + }else{ + HostedByItemType(id, officialname, issn, "", "" , oa) } } - } else { - if (eissn != null) { - if (issnl != null) { - HostedByItemType(id, officialname, "", eissn, issnl, oa) - } else { - HostedByItemType(id, officialname, "", eissn, "", oa) + }else{ + if(eissn != null){ + if(issnl != null){ + HostedByItemType(id, officialname, "", eissn, issnl , oa) + }else{ + HostedByItemType(id, officialname, "", eissn, "" , oa) } - } else { - if (issnl != null) { - 
HostedByItemType(id, officialname, "", "", issnl, oa) - } else { - HostedByItemType("", "", "", "", "", oa) + }else{ + if(issnl != null){ + HostedByItemType(id, officialname, "", "", issnl , oa) + }else{ + HostedByItemType("", "", "", "", "" , oa) } } } @@ -87,10 +90,10 @@ object SparkProduceHostedByMap { return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false) } - HostedByItemType("", "", "", "", "", false) + HostedByItemType("","","","","",false) } - def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { + def oaHostedByDataset(spark:SparkSession, datasourcePath : String) : Dataset[HostedByItemType] = { import spark.implicits._ @@ -99,10 +102,10 @@ object SparkProduceHostedByMap { implicit var encoderD = Encoders.kryo[Datasource] - val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath) + val dd : Dataset[Datasource] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[Datasource])) - dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) + dd.map{ddt => oaToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) } @@ -112,17 +115,17 @@ object SparkProduceHostedByMap { } - def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { + def goldHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel] val mapper = new ObjectMapper() - val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) + val dd : Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[UnibiGoldModel])) - dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) + dd.map{ddt => goldToHostedbyItemType(ddt)}.filter(hb => 
!(hb.id.equals(""))) } @@ -131,40 +134,41 @@ object SparkProduceHostedByMap { return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true) } - def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { + def doajHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = { import spark.implicits._ implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel] val mapper = new ObjectMapper() - val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath) + val dd : Dataset[DOAJModel] = spark.read.textFile(datasourcePath) .map(r => mapper.readValue(r, classOf[DOAJModel])) - dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals(""))) + dd.map{ddt => doajToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals(""))) } def toList(input: HostedByItemType): List[(String, HostedByItemType)] = { - var lst: List[(String, HostedByItemType)] = List() - if (!input.issn.equals("")) { + var lst : List[(String, HostedByItemType)] = List() + if(!input.issn.equals("")){ lst = (input.issn, input) :: lst } - if (!input.eissn.equals("")) { + if(!input.eissn.equals("")){ lst = (input.eissn, input) :: lst } - if (!input.lissn.equals("")) { + if(!input.lissn.equals("")){ lst = (input.lissn, input) :: lst } lst } - def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { + + def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = { val conf = new Configuration() conf.set("fs.defaultFS", hdfsNameNode) - val fs = FileSystem.get(conf) + val fs= FileSystem.get(conf) val output = fs.create(new Path(outputPath)) val writer = new PrintWriter(output) try { @@ -178,6 +182,7 @@ object SparkProduceHostedByMap { } + def main(args: Array[String]): Unit = { val logger: Logger = LoggerFactory.getLogger(getClass) @@ -208,7 +213,7 @@ object SparkProduceHostedByMap { .union(doajHostedByDataset(spark, 
workingDirPath + "/doaj.json")) .flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|")) .map(hbi => toHostedByMap(hbi))(Encoders.STRING) - .rdd.saveAsTextFile(outputPath, classOf[GzipCodec]) + .rdd.saveAsTextFile(outputPath , classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala similarity index 88% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala index 0179cc266..c7ad1890d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -4,14 +4,20 @@ import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.ModelSupport +import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.io.IOUtils +import org.apache.commons.lang3.StringUtils +import org.apache.http.client.methods.HttpGet +import org.apache.http.impl.client.HttpClients import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory -import scala.io.Source import scala.collection.JavaConverters._ +import scala.io.Source + object CopyHdfsOafSparkApplication { def main(args: Array[String]): Unit = { @@ -53,7 +59,7 @@ object CopyHdfsOafSparkApplication { if (validPaths.nonEmpty) { val oaf = spark.read.load(validPaths: _*).as[Oaf] val 
mapper = new ObjectMapper() - val l = ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList + val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList l.foreach( e => oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala similarity index 79% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala index 6b4a501d6..316b8afed 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.schema.common.EntityType import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils @@ -13,7 +14,7 @@ import org.slf4j.{Logger, LoggerFactory} object SparkResolveEntities { val mapper = new ObjectMapper() - val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct) + val entities = List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct) def main(args: Array[String]): Unit = { val log: Logger = LoggerFactory.getLogger(getClass) @@ -50,10 +51,10 @@ object SparkResolveEntities { fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e")) } 
- } +} - def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { +def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ @@ -70,22 +71,22 @@ object SparkResolveEntities { } - def deserializeObject(input: String, entity: EntityType): Result = { + def deserializeObject(input:String, entity:EntityType ) :Result = { - entity match { - case EntityType.publication => mapper.readValue(input, classOf[Publication]) - case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) - case EntityType.software => mapper.readValue(input, classOf[Software]) - case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct]) - } + entity match { + case EntityType.publication => mapper.readValue(input, classOf[Publication]) + case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) + case EntityType.software=> mapper.readValue(input, classOf[Software]) + case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct]) + } } - def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String) = { + def generateResolvedEntities(spark:SparkSession, workingPath: String, graphBasePath:String) = { implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) import spark.implicits._ - val re: Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] + val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] entities.foreach { e => diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala similarity index 99% rename from 
dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala index c7f9b2d0e..cd517dd5e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.HdfsSupport -import eu.dnetlib.dhp.schema.oaf.Relation +import eu.dnetlib.dhp.schema.oaf.{Relation, Result} import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.fs.{FileSystem, Path} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala new file mode 100644 index 000000000..9e905d806 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/sx/graphimport/SparkDataciteToOAF.scala @@ -0,0 +1,31 @@ +package eu.dnetlib.dhp.oa.sx.graphimport + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +object SparkDataciteToOAF { + + + def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + import 
spark.implicits._ + + + val sc = spark.sparkContext + + val inputPath = parser.get("inputPath") + + + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala similarity index 69% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala index 9d16cf907..3ee0c7dd6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertDatasetToJsonRDD.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Result +import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf @@ -29,13 +29,13 @@ object SparkConvertDatasetToJsonRDD { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val resultObject = List("publication", "dataset", "software", "otherResearchProduct") + val resultObject = List("publication","dataset","software", "otherResearchProduct") val mapper = new ObjectMapper() - implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - resultObject.foreach { item => - spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) + 
resultObject.foreach{item => + spark.read.load(s"$sourcePath/$item").as[Result].map(r=> mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala index cc1b97fd6..846ac37af 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala @@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import org.apache.commons.io.IOUtils -import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} +import org.apache.hadoop.io.compress._ object SparkConvertObjectToJson { @@ -32,8 +32,8 @@ object SparkConvertObjectToJson { log.info(s"objectType -> $objectType") - implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] - implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + implicit val scholixEncoder :Encoder[Scholix]= Encoders.kryo[Scholix] + implicit val summaryEncoder :Encoder[ScholixSummary]= Encoders.kryo[ScholixSummary] val mapper = new ObjectMapper @@ -42,11 +42,11 @@ object SparkConvertObjectToJson { case "scholix" => log.info("Serialize Scholix") val d: Dataset[Scholix] = 
spark.read.load(sourcePath).as[Scholix] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(6000).saveAsTextFile(targetPath, classOf[GzipCodec]) case "summary" => log.info("Serialize Summary") val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] - d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) + d.map(s => mapper.writeValueAsString(s))(Encoders.STRING).rdd.repartition(1000).saveAsTextFile(targetPath, classOf[GzipCodec]) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala similarity index 62% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index 2eb5e3a35..4b82fe645 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -2,12 +2,11 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} - object 
SparkConvertRDDtoDataset { def main(args: Array[String]): Unit = { @@ -32,39 +31,39 @@ object SparkConvertRDDtoDataset { val entityPath = s"$t/entities" val relPath = s"$t/relation" val mapper = new ObjectMapper() - implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) - implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) - implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) - implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) + implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) + implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct]) + implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) log.info("Converting dataset") - val rddDataset = spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) + val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset])) spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset") log.info("Converting publication") - val rddPublication = spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) + val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication])) spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication") log.info("Converting software") - val rddSoftware = 
spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])) + val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software])) spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software") log.info("Converting otherresearchproduct") - val rddOtherResearchProduct = spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) + val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct])) spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct") log.info("Converting Relation") - val relationSemanticFilter = List("cites", "iscitedby", "merges", "ismergedin") + val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin") - val rddRelation = spark.sparkContext.textFile(s"$sourcePath/relation") + val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation") .map(s => mapper.readValue(s, classOf[Relation])) - .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) + .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala rename to 
dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index b6f678967..350b00c5e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -1,12 +1,14 @@ package eu.dnetlib.dhp.sx.graph import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} +import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} + + object SparkCreateInputGraph { def main(args: Array[String]): Unit = { @@ -31,7 +33,7 @@ object SparkCreateInputGraph { ) - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software]) @@ -39,13 +41,16 @@ object SparkCreateInputGraph { implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + + + val sourcePath = parser.get("sourcePath") log.info(s"sourcePath -> $sourcePath") val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] + val oafDs:Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf] log.info("Extract Publication") @@ -65,27 +70,27 @@ object SparkCreateInputGraph { resultObject.foreach { r => log.info(s"Make ${r._1} unique") - 
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2) + makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/preprocess/${r._1}",spark, r._2) } } - def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = { + def extractEntities[T <: Oaf ](oafDs:Dataset[Oaf], targetPath:String, clazz:Class[T], log:Logger) :Unit = { - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) log.info(s"Extract ${clazz.getSimpleName}") oafDs.filter(o => o.isInstanceOf[T]).map(p => p.asInstanceOf[T]).write.mode(SaveMode.Overwrite).save(targetPath) } - def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = { + def makeDatasetUnique[T <: Result ](sourcePath:String, targetPath:String, spark:SparkSession, clazz:Class[T]) :Unit = { import spark.implicits._ - implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) + implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz) - val ds: Dataset[T] = spark.read.load(sourcePath).as[T] + val ds:Dataset[T] = spark.read.load(sourcePath).as[T] - ds.groupByKey(_.getId).reduceGroups { (x, y) => + ds.groupByKey(_.getId).reduceGroups{(x,y) => x.mergeFrom(y) x }.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala index 9930c57af..e4fcd2782 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala @@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.functions.count -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkCreateScholix { @@ -42,7 +42,7 @@ object SparkCreateScholix { val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation] - .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder)) val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary] @@ -51,54 +51,54 @@ object SparkCreateScholix { relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Relation), (String, ScholixSummary)) => - if (input._1 != null && input._2 != null) { + if (input._1!= null && input._2!= null) { val rel: Relation = input._1._2 val source: ScholixSummary = input._2._2 (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) } - else null + else null }(Encoders.tuple(Encoders.STRING, scholixEncoder)) - .filter(r => r != null) + .filter(r => r!= null) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source") val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Scholix), (String, ScholixSummary)) => - 
if (input._2 == null) { + if (input._2== null) { null } else { val s: Scholix = input._1._2 val target: ScholixSummary = input._2._2 ScholixUtils.generateCompleteScholix(s, target) } - }.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") + }.filter(s => s!= null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse") val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix] scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix] - .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) + .map(s=> (s.getIdentifier,s))(Encoders.tuple(Encoders.STRING, scholixEncoder)) .groupByKey(_._1) .agg(ScholixUtils.scholixAggregator.toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix") - val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] + val scholix_final:Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix] - val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)] + val stats:Dataset[(String,String,Long)]= scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String,String,Long)] stats - .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0)) + .map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 )) .groupByKey(_.id) - .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication)) + .reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset+b.relatedDataset, 
a.relatedPublication+b.relatedPublication)) .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities") - val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) + val relatedEntitiesDS:Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication>0 || r.relatedDataset > 0) - relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i => + relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map{i => val re = i._1 val sum = i._2._2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala similarity index 68% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala index 4274cae5a..0970375f5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.sql._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} object SparkCreateSummaryObject { @@ -28,15 +28,15 @@ object SparkCreateSummaryObject { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val resultEncoder: 
Encoder[Result] = Encoders.kryo[Result] - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result] + implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] - val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) + val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r=>r.getDataInfo== null || r.getDataInfo.getDeletedbyinference== false) - ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath) + ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath) } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala index c70397d04..193512474 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala @@ -5,7 +5,6 @@ import org.apache.spark.sql.{Encoder, Encoders} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse - import java.util.regex.Pattern import scala.language.postfixOps import scala.xml.{Elem, Node, XML} diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala similarity index 83% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala index f1a4553ea..79c75d6df 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala @@ -2,12 +2,13 @@ package eu.dnetlib.dhp.sx.graph.pangaea import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} -import scala.io.Source import scala.collection.JavaConverters._ +import scala.io.Source + object SparkGeneratePanagaeaDataset { @@ -27,17 +28,17 @@ object SparkGeneratePanagaeaDataset { parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}")) logger.info("Converting sequential file into Dataset") - val sc: SparkContext = spark.sparkContext + val sc:SparkContext = spark.sparkContext - val workingPath: String = parser.get("workingPath") + val workingPath:String = parser.get("workingPath") implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel] - val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) + val inputRDD:RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s)) 
spark.createDataset(inputRDD).as[PangaeaDataModel] - .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) - .groupByKey(_._1)(Encoders.STRING) + .map(s => (s.identifier,s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders)) + .groupByKey(_._1)(Encoders.STRING) .agg(PangaeaUtils.getDatasetAggregator().toColumn) .map(s => s._2) .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") @@ -45,4 +46,7 @@ object SparkGeneratePanagaeaDataset { } + + + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala similarity index 61% rename from dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala rename to dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index 7b1ddbb8f..93c554e04 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.sx.graph.scholix + import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.sx.scholix._ import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} @@ -10,23 +11,22 @@ import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse -import scala.io.Source import scala.collection.JavaConverters._ - +import scala.io.Source +import scala.language.postfixOps object ScholixUtils { val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" - val DATE_RELATION_KEY: String = "RelationDate" + val DATE_RELATION_KEY:String = "RelationDate" + case class RelationVocabulary(original:String, inverse:String){} - case class RelationVocabulary(original: String, inverse: String) {} 
+ case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){} - case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} - - val relations: Map[String, RelationVocabulary] = { - val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString + val relations:Map[String, RelationVocabulary] = { + val input =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) @@ -35,12 +35,12 @@ object ScholixUtils { } - def extractRelationDate(relation: Relation): String = { + def extractRelationDate(relation: Relation):String = { - if (relation.getProperties == null || !relation.getProperties.isEmpty) + if (relation.getProperties== null || !relation.getProperties.isEmpty) null else { - val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) + val date =relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue) if (date.isDefined) date.get else @@ -48,9 +48,9 @@ object ScholixUtils { } } - def extractRelationDate(summary: ScholixSummary): String = { + def extractRelationDate(summary: ScholixSummary):String = { - if (summary.getDate == null || summary.getDate.isEmpty) + if(summary.getDate== null || summary.getDate.isEmpty) null else { summary.getDate.get(0) @@ -59,14 +59,15 @@ object ScholixUtils { } - def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { + def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = { new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) } - val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with 
Serializable { + + val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable { override def zero: RelatedEntities = null override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { @@ -77,16 +78,17 @@ object ScholixUtils { if (b == null) RelatedEntities(a._1, relatedDataset, relatedPublication) else - RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication) + RelatedEntities(a._1,b.relatedDataset+ relatedDataset, b.relatedPublication+ relatedPublication ) } override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { - if (b1 != null && b2 != null) - RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication) + if (b1!= null && b2!= null) + RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication) - else if (b1 != null) - b1 else + if (b1!= null) + b1 + else b2 } @@ -102,12 +104,12 @@ object ScholixUtils { override def zero: Scholix = null - def scholix_complete(s: Scholix): Boolean = { - if (s == null || s.getIdentifier == null) { + def scholix_complete(s:Scholix):Boolean ={ + if (s== null || s.getIdentifier==null) { false } else if (s.getSource == null || s.getTarget == null) { - false - } + false + } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) false else @@ -119,7 +121,7 @@ object ScholixUtils { } override def merge(b1: Scholix, b2: Scholix): Scholix = { - if (scholix_complete(b1)) b1 else b2 + if (scholix_complete(b1)) b1 else b2 } override def finish(reduction: Scholix): Scholix = reduction @@ -130,7 +132,7 @@ object ScholixUtils { } - def createInverseScholixRelation(scholix: Scholix): Scholix = { + def createInverseScholixRelation(scholix: Scholix):Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) 
s.setPublisher(scholix.getPublisher) @@ -142,33 +144,34 @@ object ScholixUtils { s + } - def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = { - if (summary.getDatasources != null && !summary.getDatasources.isEmpty) { - val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { + def extractCollectedFrom(summary:ScholixSummary): List[ScholixEntityId] = { + if (summary.getDatasources!= null && !summary.getDatasources.isEmpty) { + val l: List[ScholixEntityId] = summary.getDatasources.asScala.map{ d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava) }(collection.breakOut) - l + l } else List() } - def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { + def extractCollectedFrom(relation: Relation) : List[ScholixEntityId] = { if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => - new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava) + new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA,null)).asJava) }(collection breakOut) l } else List() } - def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { + def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = { val s = new Scholix s.setPublicationDate(scholix.getPublicationDate) s.setPublisher(scholix.getPublisher) @@ -189,28 +192,29 @@ object ScholixUtils { r.setObjectType(summaryObject.getTypology.toString) r.setObjectSubType(summaryObject.getSubType) - if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty) - r.setTitle(summaryObject.getTitle.get(0)) + if (summaryObject.getTitle!= null && !summaryObject.getTitle.isEmpty) + r.setTitle(summaryObject.getTitle.get(0)) - if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) { - 
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList + if (summaryObject.getAuthor!= null && !summaryObject.getAuthor.isEmpty){ + val l:List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a,null)).toList if (l.nonEmpty) r.setCreator(l.asJava) } - if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) + if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty) r.setPublicationDate(summaryObject.getDate.get(0)) - if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) { - val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList + if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty) + { + val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList if (plist.nonEmpty) r.setPublisher(plist.asJava) } - if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) { + if (summaryObject.getDatasources!= null && !summaryObject.getDatasources.isEmpty) { - val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( + val l:List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom( new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava) , "collected", "complete" @@ -224,9 +228,12 @@ object ScholixUtils { } - def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { - if (relation == null || source == null) + + + def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = { + + if (relation== null || source== null) return null val s = new Scholix @@ -246,9 +253,9 @@ object ScholixUtils { s.setPublicationDate(d) - if (source.getPublisher != null && !source.getPublisher.isEmpty) { + if 
(source.getPublisher!= null && !source.getPublisher.isEmpty) { val l: List[ScholixEntityId] = source.getPublisher.asScala - .map { + .map{ p => new ScholixEntityId(p, null) }(collection.breakOut) @@ -258,7 +265,7 @@ object ScholixUtils { } val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) - if (semanticRelation == null) + if (semanticRelation== null) return null s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)) s.setSource(generateScholixResourceFromSummary(source)) @@ -267,8 +274,8 @@ object ScholixUtils { } - def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = { - pidValue.map { + def findURLForPID(pidValue:List[StructuredProperty], urls:List[String]):List[(StructuredProperty, String)] = { + pidValue.map{ p => val pv = p.getValue @@ -278,67 +285,67 @@ object ScholixUtils { } - def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { + def extractTypedIdentifierFromInstance(r:Result):List[ScholixIdentifier] = { if (r.getInstance() == null || r.getInstance().isEmpty) return List() - r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty) - .filter(i => i.getPid != null && i.getUrl != null) + r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty) + .filter(i => i.getPid!= null && i.getUrl != null) .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList } - def resultToSummary(r: Result): ScholixSummary = { + def resultToSummary(r:Result):ScholixSummary = { val s = new ScholixSummary s.setId(r.getId) if (r.getPid == null || r.getPid.isEmpty) return null - val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) + val persistentIdentifiers:List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) if 
(persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - if (r.isInstanceOf[Publication]) + if (r.isInstanceOf[Publication] ) s.setTypology(Typology.publication) else s.setTypology(Typology.dataset) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) - if (r.getTitle != null && r.getTitle.asScala.nonEmpty) { - val titles: List[String] = r.getTitle.asScala.map(t => t.getValue)(collection breakOut) + if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) { + val titles:List[String] =r.getTitle.asScala.map(t => t.getValue)(collection breakOut) if (titles.nonEmpty) s.setTitle(titles.asJava) else - return null + return null } - if (r.getAuthor != null && !r.getAuthor.isEmpty) { - val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname)(collection breakOut) + if(r.getAuthor!= null && !r.getAuthor.isEmpty) { + val authors:List[String] = r.getAuthor.asScala.map(a=> a.getFullname)(collection breakOut) if (authors nonEmpty) s.setAuthor(authors.asJava) } if (r.getInstance() != null) { - val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) + val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut) if (dt.nonEmpty) s.setDate(dt.distinct.asJava) } - if (r.getDescription != null && !r.getDescription.isEmpty) { - val d = r.getDescription.asScala.find(f => f != null && f.getValue != null) + if (r.getDescription!= null && !r.getDescription.isEmpty) { + val d = r.getDescription.asScala.find(f => f!= null && f.getValue!=null) if (d.isDefined) s.setDescription(d.get.getValue) } - if (r.getSubject != null && !r.getSubject.isEmpty) { - val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut) + if (r.getSubject!= null && !r.getSubject.isEmpty) { + 
val subjects:List[SchemeValue] =r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut) if (subjects.nonEmpty) s.setSubject(subjects.asJava) } - if (r.getPublisher != null) + if (r.getPublisher!= null) s.setPublisher(List(r.getPublisher.getValue).asJava) - if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { - val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) + if (r.getCollectedfrom!= null && !r.getCollectedfrom.isEmpty) { + val cf:List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut) if (cf.nonEmpty) s.setDatasources(cf.distinct.asJava) } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestApply.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala similarity index 96% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala index 7abce547f..a3a753a8a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPrepare.scala @@ -3,9 +3,13 @@ package 
eu.dnetlib.dhp.oa.graph.hostedbymap import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo +import eu.dnetlib.dhp.schema.oaf.{Datasource, OpenAccessRoute, Publication} +import javax.management.openmbean.OpenMBeanAttributeInfo import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.json4s import org.json4s.DefaultFormats +import eu.dnetlib.dhp.schema.common.ModelConstants import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala index 0922f2e19..5b00e9b6f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestPreprocess.scala @@ -4,9 +4,10 @@ import eu.dnetlib.dhp.schema.oaf.Datasource import org.apache.spark.SparkConf import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats -import org.json4s.jackson.Serialization.write -import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.Assertions.{assertNotNull, assertTrue} import org.junit.jupiter.api.Test +import org.junit.jupiter.api.Assertions._ +import org.json4s.jackson.Serialization.write class TestPreprocess extends java.io.Serializable{ diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala index f1bd841d1..9a142d3c0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -159,7 +159,6 @@ class ResolveEntitiesTest extends Serializable { val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset)) - val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala rename to dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala similarity index 95% rename from dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala rename to 
dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala index 0d89cca85..b90827e81 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.sx.pangaea import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils import org.junit.jupiter.api.Test +import java.util.TimeZone import java.text.SimpleDateFormat import java.util.Date import scala.io.Source diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset index 2c73183e2..05c875148 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset @@ -1,3 +1,3 @@ -{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classificatio
n_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} -{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classif
ication_typologies","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} -{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_t
ypologies","schemename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - 
geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages 
D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} \ No newline at end of file +{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","s
chemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} +{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologi
es","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} +{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","sche
mename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - 
geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages 
D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} \ No newline at end of file From 0c0d561bc47c588d0049fcab1dc8e400efe1a7b7 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Nov 2021 09:54:22 +0100 Subject: [PATCH 10/11] added public class into tests to create correct javadoc --- .../eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java | 2 +- .../oa/matchers/simple/EnrichMissingPublicationDateTest.java | 2 +- .../eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java | 2 +- .../test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java | 2 +- 4 files changed, 4 
insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java index 45bfc785f..52e9917bb 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcherTest.java @@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; @ExtendWith(MockitoExtension.class) -class UpdateMatcherTest { +public class UpdateMatcherTest { UpdateMatcher matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java index 550ded9f4..5af81a31a 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDateTest.java @@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test; import eu.dnetlib.broker.objects.OaBrokerMainEntity; -class EnrichMissingPublicationDateTest { +public class EnrichMissingPublicationDateTest { final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate(); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java index b532aa9f7..d93390e4a 100644 --- 
a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/SubscriptionUtilsTest.java @@ -8,7 +8,7 @@ import java.util.Arrays; import org.junit.jupiter.api.Test; -class SubscriptionUtilsTest { +public class SubscriptionUtilsTest { @Test void testVerifyListSimilar() { diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java index a8bc03e31..117bdeef4 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor; import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerTypedValue; -class TrustUtilsTest { +public class TrustUtilsTest { private static final double THRESHOLD = 0.95; From fc03c99805c540d03b82ea269d4ff6b4f3eec8cd Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Nov 2021 10:46:33 +0100 Subject: [PATCH 11/11] fixed javadocs url after deploying site --- dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md index 00e3ed877..c1813394b 100644 --- a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md @@ -9,8 +9,8 @@ the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_description Parsing ------- -The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates -an intermediate mapping of PubMed Article defined 
[here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html) +The class responsible for parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates +an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html) Mapping