diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index b32039e32..a6a57ce7b 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -25,6 +25,11 @@ com.github.sisyphsu dateparser + + me.xuender + unidecode + + org.apache.spark spark-core_2.11 diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index e5181b111..1d002ed7e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.*; import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; import com.google.common.collect.Sets; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import me.xuender.unidecode.Unidecode; public class GraphCleaningFunctions extends CleaningFunctions { @@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .filter( - sp -> sp - .getValue() - .toLowerCase() - .replaceAll(TITLE_FILTER_REGEX, "") - .length() > TITLE_FILTER_RESIDUAL_LENGTH) + sp -> { + final String title = sp + .getValue() + .toLowerCase(); + final String residual = Unidecode + .decode(title) + .replaceAll(TITLE_FILTER_REGEX, ""); + return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; + }) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index eefa1e9a3..8d519a93f 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; import java.util.HashSet; import java.util.List; -import java.util.Locale; -import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import me.xuender.unidecode.Unidecode; public class OafMapperUtilsTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + @Test + public void testUnidecode() { + + assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ")); + assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛")); + assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼")); + assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい")); + assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի")); + assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики")); + assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ")); + assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης")); + assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية")); + assertEquals("abc def ghi", Unidecode.decode("abc def ghi")); + } + @Test public void testDateValidation() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala index 823187afe..92a870e37 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -64,26 +64,24 @@ abstract class AbstractRestClient extends Iterator[String]{ .setSocketTimeout(timeout * 1000).build() val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build() var tries = 4 - try { - while (tries > 0) { + while (tries > 0) { println(s"requesting ${r.getURI}") - val response = client.execute(r) - println(s"get response with status${response.getStatusLine.getStatusCode}") - if (response.getStatusLine.getStatusCode > 400) { - tries -= 1 + try { + val response = client.execute(r) + println(s"get response with status${response.getStatusLine.getStatusCode}") + if (response.getStatusLine.getStatusCode > 400) { + tries -= 1 + } + else + return IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + println(s"Error on requesting ${r.getURI}") + e.printStackTrace() + tries-=1 } - else - return IOUtils.toString(response.getEntity.getContent) } "" - } catch { - case e: Throwable => - throw new RuntimeException("Error on executing request ", e) - } finally try client.close() - catch { - case e: IOException => - throw new RuntimeException("Unable to close client ", e) - } - } + } getBufferData() } \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala new file mode 100644 index 000000000..b78f411ee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkCreateActionset.scala @@ -0,0 +1,73 @@ +package eu.dnetlib.dhp.actionmanager.scholix + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object SparkCreateActionset { + + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString) + parser.parseArgument(args) + + + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + + val sourcePath = parser.get("sourcePath") + log.info(s"sourcePath -> $sourcePath") + + val targetPath = parser.get("targetPath") + log.info(s"targetPath -> $targetPath") + + val workingDirFolder = parser.get("workingDirFolder") + log.info(s"workingDirFolder -> $workingDirFolder") + + implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result] + implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation] + + import spark.implicits._ + + val relation = spark.read.load(s"$sourcePath/relation").as[Relation] + + relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation") + + + val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String] + + log.info("extract source and target Identifier involved in relations") + + + log.info("save relation filtered") + + relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) + .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf") + + log.info("saving entities") + + val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) + + + entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]) + entities + .joinWith(idRelation, entities("_1").equalTo(idRelation("value"))) + .map(p => p._1._2) + .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") + + + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala similarity index 86% rename from dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala index d1d0b8424..1df7ea3fb 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkSaveActionSet.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/scholix/SparkSaveActionSet.scala @@ -1,9 +1,9 @@ -package eu.dnetlib.dhp.sx.provision +package eu.dnetlib.dhp.actionmanager.scholix import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.action.AtomicAction -import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset} +import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapred.SequenceFileOutputFormat @@ -73,13 +73,13 @@ object SparkSaveActionSet { val targetPath = parser.get("targetPath") log.info(s"targetPath -> $targetPath") - implicit val oafEncoders:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) + implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING) spark.read.load(sourcePath).as[Oaf] - .map(o =>toActionSet(o)) - .filter(o => o!= null) - .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + .map(o => toActionSet(o)) + .filter(o => o != null) + .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) } diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/generate_actionset.json diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml similarity index 92% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml index ef86a1772..2d97b5163 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml @@ -14,7 +14,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -25,7 +25,7 @@ yarn-cluster cluster Create Action Set - eu.dnetlib.dhp.sx.provision.SparkCreateActionset + eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -42,7 +42,7 @@ --workingDirFolder${workingDirFolder} --masteryarn-cluster - + @@ -52,7 +52,7 @@ yarn-cluster cluster Save Action Set - eu.dnetlib.dhp.sx.provision.SparkSaveActionSet + eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json similarity index 100% rename from dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java index 21786687e..bcbcf755f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsReferencedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat @Override protected boolean filterByType(final String relType) { - return relType.equals("isReferencedBy"); + return relType.equals(ModelConstants.IS_REFERENCED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java index 0f3739434..4125974ce 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsRelatedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase @Override protected boolean filterByType(final String relType) { - return relType.equals("isRelatedTo"); + return relType.equals(ModelConstants.IS_RELATED_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java index cde227fee..480daf666 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedBy"); + return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java index 750165ff5..97b1eb8bd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetIsSupplementedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedTo"); + return relType.equals(ModelConstants.IS_SUPPLEMENT_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java index b1c0afe16..0978486a3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/EnrichMissingDatasetReferences.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset { @@ -11,7 +12,7 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset @Override protected boolean filterByType(final String relType) { - return relType.equals("references"); + return relType.equals(ModelConstants.REFERENCES); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java index eebb5c1a6..ff9155c9d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsReferencedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication { @@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin @Override protected boolean filterByType(final String relType) { - return relType.equals("isReferencedBy"); + return relType.equals(ModelConstants.IS_REFERENCED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java index a8aa550d4..1051559c9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsRelatedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu @Override protected boolean filterByType(final String relType) { - return relType.equals("isRelatedTo"); + return relType.equals(ModelConstants.IS_RELATED_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java index 762ac942e..d97f46f09 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedBy.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication { @@ -11,6 +12,6 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedBy"); + return relType.equals(ModelConstants.IS_SUPPLEMENTED_BY); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java index fc7196a01..b33b340e3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationIsSupplementedTo.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss @Override protected boolean filterByType(final String relType) { - return relType.equals("isSupplementedTo"); + return relType.equals(ModelConstants.IS_SUPPLEMENT_TO); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java index da1994454..fe0f96b6e 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/EnrichMissingPublicationReferences.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications; import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.common.ModelConstants; public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication { @@ -11,7 +12,7 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub @Override protected boolean filterByType(final String relType) { - return relType.equals("references"); + return relType.equals(ModelConstants.REFERENCES); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java index c7be633a9..7c4ca1d22 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ClusterUtils.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class ClusterUtils { @@ -52,15 +53,15 @@ public class ClusterUtils { } public static boolean isDedupRoot(final String id) { - return id.contains("dedup_wf_"); + return id.contains("dedup"); } public static final boolean isValidResultResultClass(final String s) { - return s.equals("isReferencedBy") - || s.equals("isRelatedTo") - || s.equals("references") - || s.equals("isSupplementedBy") - || s.equals("isSupplementedTo"); + return s.equals(ModelConstants.IS_REFERENCED_BY) + || s.equals(ModelConstants.IS_RELATED_TO) + || s.equals(ModelConstants.REFERENCES) + || s.equals(ModelConstants.IS_SUPPLEMENTED_BY) + || s.equals(ModelConstants.IS_SUPPLEMENT_TO); } public static T incrementAccumulator(final T o, final LongAccumulator acc) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 502cb370f..909e0f077 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -179,20 +179,6 @@ object DoiBoostMappingUtil { } - //val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - - - - // val pub_date = LocalDate.parse(date, formatter) - -// if (((now.toEpochDay - pub_date.toEpochDay)/365.0) > 1){ -// val oaq : AccessRight = getOpenAccessQualifier() -// oaq.setOpenAccessRoute(OpenAccessRoute.hybrid) -// return oaq -// } -// else{ -// return getEmbargoedAccessQualifier() -// } } return getClosedAccessQualifier() @@ -202,16 +188,17 @@ object DoiBoostMappingUtil { def getOpenAccessQualifier():AccessRight = { - OafMapperUtils.accessRight("OPEN","Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + + OafMapperUtils.accessRight(ModelConstants.ACCESS_RIGHT_OPEN,"Open Access", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) } def getRestrictedQualifier():AccessRight = { - OafMapperUtils.accessRight("RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + OafMapperUtils.accessRight( "RESTRICTED","Restricted",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) } def getUnknownQualifier():AccessRight = { - OafMapperUtils.accessRight("UNKNOWN","not available",ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) + OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) } @@ -251,8 +238,7 @@ object DoiBoostMappingUtil { i.setAccessright(getOpenAccessQualifier()) i.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold) } -// val ar = getOpenAccessQualifier() -// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) + } else { hb = ModelConstants.UNKNOWN_REPOSITORY @@ -261,17 +247,7 @@ object DoiBoostMappingUtil { }) publication.setBestaccessright(OafMapperUtils.createBestAccessRights(publication.getInstance())) -// val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid) -// if (ar.nonEmpty) { -// if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){ -// val ar = getOpenAccessQualifier() -// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) -// } -// else { -// val ar = getRestrictedQualifier() -// publication.setBestaccessright(OafMapperUtils.qualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) -// } -// } + publication } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java new file mode 100644 index 000000000..c2bcf69f0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java @@ -0,0 +1,115 @@ +package eu.dnetlib.dhp.oa.graph.raw; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.FileNotFoundException; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +public class PatchRelationsApplication { + + private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Optional.ofNullable( + PatchRelationsApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json")) + .orElseThrow(FileNotFoundException::new) + )); + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String graphBasePath = parser.get("graphBasePath"); + log.info("graphBasePath: {}", graphBasePath); + + final String workingDir = parser.get("workingDir"); + log.info("workingDir: {}", workingDir); + + final String idMappingPath = parser.get("idMappingPath"); + log.info("idMappingPath: {}", idMappingPath); + + final SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath)); + } + + /** + * Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the + * mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location. + * + * @param spark the SparkSession + * @param graphBasePath base graph path providing the set of relations to patch + * @param workingDir intermediate storage location + * @param idMappingPath dataset providing the old -> new identifier mapping + */ + private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir, final String idMappingPath) { + + final String relationPath = graphBasePath + "/relation"; + + final Dataset rels = Utils.readPath(spark, relationPath, Relation.class); + final Dataset idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class); + + rels + .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left") + .map((MapFunction, Relation>) t -> { + final Relation r = t._1(); + Optional.ofNullable(t._2()) + .map(RelationIdMapping::getNewId) + .ifPresent(r::setSource); + return r; + }, Encoders.bean(Relation.class)) + .joinWith(idMapping, rels.col("target").equalTo(idMapping.col("oldId")), "left") + .map((MapFunction, Relation>) t -> { + final Relation r = t._1(); + Optional.ofNullable(t._2()) + .map(RelationIdMapping::getNewId) + .ifPresent(r::setTarget); + return r; + }, Encoders.bean(Relation.class)) + .map( + (MapFunction) OBJECT_MAPPER::writeValueAsString, + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(workingDir); + + spark.read().textFile(workingDir) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(relationPath); + } + + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java new file mode 100644 index 000000000..f251da8c3 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/RelationIdMapping.java @@ -0,0 +1,24 @@ +package eu.dnetlib.dhp.oa.graph.raw.common; + +public class RelationIdMapping { + + private String oldId; + + private String newId; + + public String getOldId() { + return oldId; + } + + public void setOldId(final String oldId) { + this.oldId = oldId; + } + + public String getNewId() { + return newId; + } + + public void setNewId(final String newId) { + this.newId = newId; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json new file mode 100644 index 000000000..178c2d69b --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "g", + "paramLongName": "graphBasePath", + "paramDescription": "base graph path providing the set of relations to patch", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingDir", + "paramDescription": "intermediate storage location", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "idMappingPath", + "paramDescription": "dataset providing the old -> new identifier mapping", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 7f1ecb39f..321ca4090 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -100,6 +100,16 @@ a blacklist of nsprefixes (comma separeted) + + shouldPatchRelations + false + activates the relation patching phase, driven by the content in ${idMappingPath} + + + idMappingPath + + path pointing to the relations identifiers mapping dataset + sparkDriverMemory memory for driver process @@ -551,7 +561,6 @@ - yarn @@ -760,7 +769,42 @@ - + + + + + + ${(shouldPatchRelations eq "true") and + (fs:exists(concat(concat(wf:conf('nameNode'),'/'),wf:conf('idMappingPath'))) eq "true")} + + + + + + + + yarn + cluster + PatchRelations + eu.dnetlib.dhp.oa.graph.raw.PatchRelationsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --graphBasePath${graphOutputPath} + --workingDir${workingDir}/patch_relations + --idMappingPath${idMappingPath} + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 63f18a803..2fbdb3f42 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,6 +1,7 @@ - package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup; +import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.lenient; @@ -38,8 +39,8 @@ public class MappersTest { public void setUp() throws Exception { lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); lenient() - .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) - .thenReturn(synonyms()); + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); vocs = VocabularyGroup.loadVocsFromIS(isLookUpService); } @@ -74,18 +75,18 @@ public class MappersTest { assertTrue(p.getAuthor().size() > 0); final Optional author = p - .getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); final StructuredProperty pid = author - .get() - .getPid() - .stream() - .findFirst() - .get(); + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); @@ -104,12 +105,12 @@ public class MappersTest { assertNotNull(p.getInstance()); assertTrue(p.getInstance().size() > 0); p - .getInstance() - .stream() - .forEach(i -> { - assertNotNull(i.getAccessright()); - assertEquals("OPEN", i.getAccessright().getClassid()); - }); + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("OPEN", i.getAccessright().getClassid()); + }); assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getInstance().get(0).getPid()); assertTrue(p.getInstance().get(0).getPid().isEmpty()); @@ -170,18 +171,18 @@ public class MappersTest { assertTrue(p.getAuthor().size() > 0); final Optional author = p - .getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); final StructuredProperty pid = author - .get() - .getPid() - .stream() - .findFirst() - .get(); + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-6651-1178", pid.getValue()); assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); @@ -199,12 +200,12 @@ public class MappersTest { assertNotNull(p.getInstance()); assertTrue(p.getInstance().size() > 0); p - .getInstance() - .stream() - .forEach(i -> { - assertNotNull(i.getAccessright()); - assertEquals("OPEN", i.getAccessright().getClassid()); - }); + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("OPEN", i.getAccessright().getClassid()); + }); assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); assertNotNull(p.getInstance().get(0).getPid()); assertTrue(p.getInstance().get(0).getPid().size() == 2); @@ -256,17 +257,17 @@ public class MappersTest { assertTrue(d.getAuthor().size() > 0); final Optional author = d - .getAuthor() - .stream() - .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) - .findFirst(); + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); assertTrue(author.isPresent()); final StructuredProperty pid = author - .get() - .getPid() - .stream() - .findFirst() - .get(); + .get() + .getPid() + .stream() + .findFirst() + .get(); assertEquals("0000-0001-9074-1619", pid.getValue()); assertEquals(ModelConstants.ORCID_PENDING, pid.getQualifier().getClassid()); assertEquals(ModelConstants.ORCID_CLASSNAME, pid.getQualifier().getClassname()); @@ -278,10 +279,10 @@ public class MappersTest { assertEquals(1, author.get().getAffiliation().size()); final Optional> opAff = author - .get() - .getAffiliation() - .stream() - .findFirst(); + .get() + .getAffiliation() + .stream() + .findFirst(); assertTrue(opAff.isPresent()); final Field affiliation = opAff.get(); assertEquals("ISTI-CNR", affiliation.getValue()); @@ -294,12 +295,12 @@ public class MappersTest { assertNotNull(d.getInstance()); assertTrue(d.getInstance().size() > 0); d - .getInstance() - .stream() - .forEach(i -> { - assertNotNull(i.getAccessright()); - assertEquals("OPEN", i.getAccessright().getClassid()); - }); + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("OPEN", i.getAccessright().getClassid()); + }); assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertNotNull(d.getInstance().get(0).getPid()); assertTrue(d.getInstance().get(0).getPid().isEmpty()); @@ -343,14 +344,13 @@ public class MappersTest { assertTrue(p.getOriginalId().stream().anyMatch(oid -> oid.equals("oai:pub.uni-bielefeld.de:2949739"))); // assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0)); - assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(p.getAuthor().size() > 0); final Optional author = p - .getAuthor() - .stream() - .findFirst(); + .getAuthor() + .stream() + .findFirst(); assertTrue(author.isPresent()); assertEquals("Potwarka, Luke R.", author.get().getFullname()); @@ -366,12 +366,12 @@ public class MappersTest { assertNotNull(p.getInstance()); assertTrue(p.getInstance().size() > 0); p - .getInstance() - .stream() - .forEach(i -> { - assertNotNull(i.getAccessright()); - assertEquals("OPEN", i.getAccessright().getClassid()); - }); + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("OPEN", i.getAccessright().getClassid()); + }); assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); } @@ -412,11 +412,11 @@ public class MappersTest { assertNotNull(d.getTitle()); assertEquals(1, d.getTitle().size()); assertEquals( - "Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia", - d - .getTitle() - .get(0) - .getValue()); + "Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia", + d + .getTitle() + .get(0) + .getValue()); assertNotNull(d.getDescription()); assertEquals(1, d.getDescription().size()); @@ -559,6 +559,31 @@ public class MappersTest { assertNotNull(d.getInstance().get(0).getUrl()); } + @Test + void testEnermaps() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("enermaps.xml")); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Dataset); + + final Dataset d = (Dataset) list.get(0); + + assertValidId(d.getId()); + assertValidId(d.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertEquals(1, d.getAuthor().size()); + assertEquals(1, d.getInstance().size()); + assertNotNull(d.getInstance().get(0).getUrl()); + assertNotNull(d.getContext()); + assertTrue(StringUtils.isNotBlank(d.getContext().get(0).getId())); + assertEquals("enermaps::selection::tgs00004", d.getContext().get(0).getId()); + } + @Test void testClaimFromCrossref() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_claim_crossref.xml")); @@ -615,7 +640,7 @@ public class MappersTest { assertValidId(p.getInstance().get(0).getCollectedfrom().getKey()); assertValidId(p.getInstance().get(0).getHostedby().getKey()); assertEquals( - "http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue()); + "http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue()); assertEquals(1, p.getInstance().size()); assertNotNull(p.getInstance().get(0).getAlternateIdentifier()); @@ -640,6 +665,30 @@ public class MappersTest { System.out.println(p.getTitle().get(0).getValue()); } + @Test + void testJairo() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml")); + final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + + assertNotNull(p.getTitle()); + assertFalse(p.getTitle().isEmpty()); + assertTrue(p.getTitle().size() == 1); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + final Publication p_cleaned = cleanup(fixVocabularyNames(p)); + + assertNotNull(p_cleaned.getTitle()); + assertFalse(p_cleaned.getTitle().isEmpty()); + } + @Test void testOdfFromHdfs() throws IOException { final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml")); @@ -661,9 +710,9 @@ public class MappersTest { assertTrue(p.getAuthor().size() > 0); final Optional author = p - .getAuthor() - .stream() - .findFirst(); + .getAuthor() + .stream() + .findFirst(); assertTrue(author.isPresent()); assertEquals("Museum Sønderjylland", author.get().getFullname()); @@ -677,12 +726,12 @@ public class MappersTest { assertNotNull(p.getInstance()); assertTrue(p.getInstance().size() > 0); p - .getInstance() - .stream() - .forEach(i -> { - assertNotNull(i.getAccessright()); - assertEquals("UNKNOWN", i.getAccessright().getClassid()); - }); + .getInstance() + .stream() + .forEach(i -> { + assertNotNull(i.getAccessright()); + assertEquals("UNKNOWN", i.getAccessright().getClassid()); + }); assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); } @@ -697,14 +746,14 @@ public class MappersTest { private List vocs() throws IOException { return IOUtils - .readLines( - GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); + .readLines( + GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); } private List synonyms() throws IOException { return IOUtils - .readLines( - GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); + .readLines( + GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml new file mode 100644 index 000000000..362b40c85 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/enermaps.xml @@ -0,0 +1,72 @@ + + + + enermaps____::04149ee428d07360314c2cb3ba95d41e + tgs00004 + 2021-07-20T18:43:12.096+02:00 + enermaps____ + + + + https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004 + + + Statistical Office of the European Union (Eurostat) + + + + + Regional GDP + + + Statistical Office of the European Union (Eurostat) + 2020 + + 2020-10-07 + + + + OPEN + Creative Commons Attribution 4.0 International + + + GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy. + + 0021 + 2020-10-07 + OPEN + Creative Commons Attribution 4.0 International + + + + + + + + + https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite + + + + + + + false + false + 0.9 + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml new file mode 100644 index 000000000..9ec696256 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_jairo.xml @@ -0,0 +1,70 @@ + + +
+ jairo_______::000012e58ed836576ef2a0d38b0f726f + oai:irdb.nii.ac.jp:01221:0000010198 + + + + + + 2021-05-10T11:31:09.424Z + 2021-06-03T01:45:42.536Z + jairo_______ +
+ + 多項式GCDを用いた復号法に関する研究 + 上原, 剛 + 甲斐, 博 + 野田, 松太郎 + application/pdf + http://hdl.handle.net/2433/25934 + jpn + 京都大学数理解析研究所 + 410 + Departmental Bulletin Paper + 0014 + 2004-10-01 + + openaire____::554c7c2873 + OPEN + + + 2433/25934 + AN00061013 + http://hdl.handle.net/2433/25934 + http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf + 数理解析研究所講究録 + + + + + https%3A%2F%2Firdb.nii.ac.jp%2Foai + oai:irdb.nii.ac.jp:01221:0000010198 + 2021-04-13T13:36:29Z + + + http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request + oai:repository.kulib.kyoto-u.ac.jp:2433/25934 + 2012-07-12T14:15:41Z + http://irdb.nii.ac.jp/oai + + + + + false + false + 0.9 + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index c279436d7..e402d0600 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -9,6 +9,41 @@ dhp-graph-provision + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + -Xmax-classfile-name + 200 + + ${scala.version} + + + + + + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 7d53d3554..b3f785492 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -10,6 +10,7 @@ import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -81,6 +82,7 @@ public class PrepareRelationsJob { Set relationFilter = Optional .ofNullable(parser.get("relationFilter")) + .map(String::toLowerCase) .map(s -> Sets.newHashSet(Splitter.on(",").split(s))) .orElse(new HashSet<>()); log.info("relationFilter: {}", relationFilter); @@ -130,7 +132,7 @@ public class PrepareRelationsJob { JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) .filter(rel -> rel.getDataInfo().getDeletedbyinference() == false) - .filter(rel -> relationFilter.contains(rel.getRelClass()) == false); + .filter(rel -> relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())) == false); JavaRDD pruned = pruneRels( pruneRels( diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java index f96a64a27..ffeb0995d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/DropAndCreateESIndex.java @@ -71,6 +71,9 @@ public class DropAndCreateESIndex { log.info(STATUS_CODE_TEXT, response.getStatusLine()); } + log.info("Sleeping 60 seconds to avoid to lost the creation of index request"); + Thread.sleep(60000); + try (CloseableHttpClient client = HttpClients.createDefault()) { final String summaryConf = IOUtils diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala deleted file mode 100644 index 6f0cdcf8a..000000000 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/sx/provision/SparkCreateActionset.scala +++ /dev/null @@ -1,90 +0,0 @@ -package eu.dnetlib.dhp.sx.provision - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} -import org.apache.spark.{SparkConf, sql} -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.io.Source - -object SparkCreateActionset { - - def main(args: Array[String]): Unit = { - val log: Logger = LoggerFactory.getLogger(getClass) - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString) - parser.parseArgument(args) - - - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val sourcePath = parser.get("sourcePath") - log.info(s"sourcePath -> $sourcePath") - - val targetPath = parser.get("targetPath") - log.info(s"targetPath -> $targetPath") - - val workingDirFolder = parser.get("workingDirFolder") - log.info(s"workingDirFolder -> $workingDirFolder") - - implicit val oafEncoders:Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val resultEncoders:Encoder[Result] = Encoders.kryo[Result] - implicit val relationEncoders:Encoder[Relation] = Encoders.kryo[Relation] - - import spark.implicits._ - - val relation = spark.read.load(s"$sourcePath/relation").as[Relation] - - relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .flatMap(r => List(r.getSource,r.getTarget)).distinct().write.save(s"$workingDirFolder/id_relation") - - - val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String] - - log.info("extract source and target Identifier involved in relations") - - - log.info("save relation filtered") - - relation.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge")) - .write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving publication") - - val publication:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/publication").as[Result].map(p => (p.getId, p)) - - publication - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving dataset") - val dataset:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/dataset").as[Result].map(p => (p.getId, p)) - dataset - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving software") - val software:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/software").as[Result].map(p => (p.getId, p)) - software - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - - log.info("saving Other Research product") - val orp:Dataset[(String, Result)] = spark.read.load(s"$sourcePath/otherresearchproduct").as[Result].map(p => (p.getId, p)) - orp - .joinWith(idRelation, publication("_1").equalTo(idRelation("value"))) - .map(p => p._1._2) - .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") - } - -} diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 6631cb4da..221049f90 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -21,8 +21,10 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; +import eu.dnetlib.dhp.oa.provision.utils.ContextDef; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -131,4 +133,32 @@ public class XmlRecordFactoryTest { System.out.println(doc.asXML()); assertEquals("", doc.valueOf("//rel/validated")); } + + @Test + public void testEnermapsRecord() throws IOException, DocumentException { + + String contextmap = "" + + + "" + + "" + + ""; + + ContextMapper contextMapper = ContextMapper.fromXml(contextmap); + XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation, + otherDsTypeId); + + Dataset d = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class); + + JoinedEntity je = new JoinedEntity<>(d); + + String xml = xmlRecordFactory.build(je); + + assertNotNull(xml); + + Document doc = new SAXReader().read(new StringReader(xml)); + assertNotNull(doc); + System.out.println(doc.asXML()); + assertEquals("enermaps::selection::tgs00004", doc.valueOf("//concept/@id")); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json new file mode 100644 index 000000000..dcd4c2ee1 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/enermaps.json @@ -0,0 +1 @@ +{"collectedfrom":[{"key":"10|enermaps____::d77d5e503ad1439f585ac494268b351b","value":"Enermaps","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626800904248,"id":"50|enermaps____::04149ee428d07360314c2cb3ba95d41e","originalId":["50|enermaps____::04149ee428d07360314c2cb3ba95d41e","tgs00004"],"pid":[],"dateofcollection":"2021-07-20T18:43:12.096+02:00","dateoftransformation":"","extraInfo":[],"oaiprovenance":{"originDescription":{"harvestDate":"2021-07-20T18:43:12.096+02:00","altered":true,"baseURL":"https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite","identifier":"","datestamp":"","metadataNamespace":""}},"measures":null,"author":[{"fullname":"Statistical Office of the European Union (Eurostat)","name":"","surname":"","rank":1,"pid":[],"affiliation":[]}],"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:languages","schemename":"dnet:languages"},"country":[],"subject":[],"title":[{"value":"\n Regional GDP\n ","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"relevantdate":[{"value":"2020-10-07","qualifier":{"classid":"Issued","classname":"Issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"description":[{"value":"GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy.","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"dateofacceptance":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"publisher":{"value":"Statistical Office of the European Union (Eurostat)","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"embargoenddate":null,"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"coverage":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[{"id":"enermaps::selection::tgs00004","dataInfo":[{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}]}],"externalReference":[],"instance":[{"license":{"value":"Creative Commons Attribution 4.0 International","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes","openAccessRoute":null},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"hostedby":{"key":"10|openaire____::55045bd2a65019fd8e6741a755395c8c","value":"Unknown Repository","dataInfo":null},"url":["https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004"],"distributionlocation":null,"collectedfrom":{"key":"10|enermaps____::d77d5e503ad1439f585ac494268b351b","value":"Enermaps","dataInfo":null},"pid":[],"alternateIdentifier":[],"dateofacceptance":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"processingchargeamount":null,"processingchargecurrency":null,"refereed":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"}}],"storagedate":{"value":"2020-10-07","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":"","provenanceaction":{"classid":"sysimport:crosswalk","classname":"sysimport:crosswalk","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},"device":null,"size":null,"version":null,"lastmetadataupdate":null,"metadataversionnumber":null,"geolocation":[]} diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh index d5aa207d1..fb944f4ff 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/indicators.sh @@ -13,7 +13,7 @@ echo "Getting file from " $SCRIPT_PATH hdfs dfs -copyToLocal $SCRIPT_PATH echo "Creating indicators" -impala-shell -d ${TARGET} -q "invalidate metadata" +impala-shell -q "invalidate metadata" impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f - cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f - echo "Indicators created" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index d79396b3b..e4e81175c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -57,12 +57,14 @@ UNION ALL SELECT * FROM ${stats_db_name}.software_sources UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; --- --- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_sources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_sources COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + + +create table ${stats_db_name}.result_orcid as +select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid +from ( + SELECT substr(res.id, 4) as id, auth_pid.value as orcid + FROM ${openaire_db_name}.result res + LATERAL VIEW explode(author) a as auth + LATERAL VIEW explode(auth.pid) ap as auth_pid + LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type + WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql index 8f364d747..8e66e05c0 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql @@ -33,13 +33,4 @@ select * from ${stats_db_name}.dataset_refereed union all select * from ${stats_db_name}.software_refereed union all -select * from ${stats_db_name}.otherresearchproduct_refereed; --- --- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_refereed COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_refereed COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +select * from ${stats_db_name}.otherresearchproduct_refereed; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql index 8998cb9fc..f1ebf0d87 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_7-createIndicatorsTables.sql @@ -39,4 +39,198 @@ from publication p join result_instance ri on ri.id = p.id join datasource on datasource.id = ri.hostedby where datasource.id like '%doajarticles%') tmp -on p.id= tmp.id; \ No newline at end of file +on p.id= tmp.id; + +create table indi_project_pubs_count stored as parquet as +select pr.id id, count(p.id) total_pubs from project_results pr +join publication p on p.id=pr.result +group by pr.id; + +create table indi_project_datasets_count stored as parquet as +select pr.id id, count(d.id) total_datasets from project_results pr +join dataset d on d.id=pr.result +group by pr.id; + +create table indi_project_software_count stored as parquet as +select pr.id id, count(s.id) total_software from project_results pr +join software s on s.id=pr.result +group by pr.id; + +create table indi_project_otherresearch_count stored as parquet as +select pr.id id, count(o.id) total_other from project_results pr +join otherresearchproduct o on o.id=pr.result +group by pr.id; + +create table indi_pub_avg_year_country_oa stored as parquet as +select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + from + (SELECT year, country, SUM(CASE + WHEN bestlicence='Open Access' THEN 1 + ELSE 0 + END) AS OpenAccess, SUM(CASE + WHEN bestlicence<>'Open Access' THEN 1 + ELSE 0 + END) AS NonOpenAccess + FROM publication p + join result_organization ro on p.id=ro.id + join organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp; + +create table indi_dataset_avg_year_country_oa stored as parquet as +select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + from + (SELECT year, country, SUM(CASE + WHEN bestlicence='Open Access' THEN 1 + ELSE 0 + END) AS OpenAccess, SUM(CASE + WHEN bestlicence<>'Open Access' THEN 1 + ELSE 0 + END) AS NonOpenAccess + FROM dataset d + join result_organization ro on d.id=ro.id + join organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp; + +create table indi_software_avg_year_country_oa stored as parquet as +select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + from + (SELECT year, country, SUM(CASE + WHEN bestlicence='Open Access' THEN 1 + ELSE 0 + END) AS OpenAccess, SUM(CASE + WHEN bestlicence<>'Open Access' THEN 1 + ELSE 0 + END) AS NonOpenAccess + FROM software s + join result_organization ro on s.id=ro.id + join SOURCER.organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp; + + +create table indi_other_avg_year_country_oa stored as parquet as +select year, country, round(OpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageOA, +round(NonOpenAccess/(OpenAccess+NonOpenAccess)*100,3) as averageNonOA + from + (SELECT year, country, SUM(CASE + WHEN bestlicence='Open Access' THEN 1 + ELSE 0 + END) AS OpenAccess, SUM(CASE + WHEN bestlicence<>'Open Access' THEN 1 + ELSE 0 + END) AS NonOpenAccess + FROM otherresearchproduct orp + join result_organization ro on orp.id=ro.id + join organization o on o.id=ro.organization + where cast(year as int)>=2003 and cast(year as int)<=2021 + group by year, country) tmp; + +create table indi_pub_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from publication_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join publication p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofpubs +from total; + +create table indi_dataset_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from dataset_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join dataset p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofdataset +from total; + +create table indi_software_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from software_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join software p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofsoftware +from total; + +create table indi_other_avg_year_context_oa stored as parquet as +with total as +(select count(distinct pc.id) no_of_pubs, year, c.name name, sum(count(distinct pc.id)) over(PARTITION by year) as total from otherresearchproduct_concepts pc +join context c on pc.concept like concat('%',c.id,'%') +join otherresearchproduct p on p.id=pc.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by c.name, year ) +select year, name, round(no_of_pubs/total*100,3) averageofother +from total; + +create table indi_other_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from otherresearchproduct_datasources pd +join datasource d on datasource=d.id +join otherresearchproduct p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfOtherresearchproduct +from total; + +create table indi_software_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from software_datasources pd +join datasource d on datasource=d.id +join software p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfSoftware +from total; + +create table indi_dataset_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from dataset_datasources pd +join datasource d on datasource=d.id +join dataset p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfDatasets +from total; + +create table indi_pub_avg_year_content_oa stored as parquet as +with total as +(select count(distinct pd.id) no_of_pubs, year, d.type type, sum(count(distinct pd.id)) over(PARTITION by year) as total +from publication_datasources pd +join datasource d on datasource=d.id +join publication p on p.id=pd.id +where cast(year as int)>=2003 and cast(year as int)<=2021 +group by d.type, year) +select year, type, round(no_of_pubs/total*100,3) averageOfPubs +from total; + +create table indi_pub_has_cc_licence stored as parquet as +select distinct p.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license +from publication p +left outer join (select p.id, license.type as lic from publication p +join publication_licenses as license on license.id = p.id +where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp +on p.id= tmp.id; + +create table indi_pub_has_cc_licence_url stored as parquet as +select distinct p.id, (case when lic_host='' or lic_host is null then 0 else 1 end) as has_cc_license_url +from publication p +left outer join (select p.id, lower(parse_url(license.type, "HOST")) as lic_host +from publication p +join publication_licenses as license on license.id = p.id +WHERE lower(parse_url(license.type, 'HOST')) = 'creativecommons.org') tmp +on p.id= tmp.id; + +create table indi_pub_has_abstract stored as parquet as +select distinct publication.id, coalesce(abstract, 1) has_abstract +from publication; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 75b24b189..bb0d0ac6c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -90,27 +90,8 @@ FROM ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.publication_citations AS -SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" - and p.datainfo.deletedbyinference = false; - --- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_topics COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.publication_citations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + and p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index 540cc03a5..953eaad6a 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -41,7 +41,7 @@ FROM ${openaire_db_name}.dataset d WHERE d.datainfo.deletedbyinference = FALSE; CREATE TABLE ${stats_db_name}.dataset_citations AS -SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result +SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -95,21 +95,4 @@ CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; --- --- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.dataset_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql index 54345e074..0210dc8cb 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql @@ -41,7 +41,7 @@ from ${openaire_db_name}.software s where s.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.software_citations AS -SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" @@ -95,21 +95,4 @@ CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; --- --- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.software_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql index 36ad5d92a..f7b302186 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql @@ -41,7 +41,7 @@ WHERE o.datainfo.deletedbyinference = FALSE; -- Otherresearchproduct_citations CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS -SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT +SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" and o.datainfo.deletedbyinference = false; @@ -86,21 +86,4 @@ where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject -where p.datainfo.deletedbyinference = false; - --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_classifications COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_concepts COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_datasources COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_pids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.otherresearchproduct_topics COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +where p.datainfo.deletedbyinference = false; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 5d81e97bb..378e0f17b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -13,11 +13,17 @@ WHERE r.reltype = 'projectOrganization' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.project_results AS -SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result +SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultProject' and r.datainfo.deletedbyinference = false; +create table ${stats_db_name}.project_classification as +select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3 +from ${openaire_db_name}.project p + lateral view explode(p.h2020classification) classifs as class +where p.datainfo.deletedbyinference=false and class.h2020programme is not null; + CREATE TABLE ${stats_db_name}.project_tmp ( id STRING, diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index ae540b9b2..b3cbc9b41 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -130,12 +130,7 @@ WHERE r.reltype = 'resultOrganization' and r.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.result_projects AS -select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend +select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id = pr.result - JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; - --- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_organization COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.result_projects COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file + JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index de0fedd7e..76d31eb5e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -17,7 +17,9 @@ CREATE TABLE ${stats_db_name}.datasource_tmp `latitude` STRING, `longitude` STRING, `websiteurl` STRING, - `compatibility` STRING + `compatibility` STRING, + issn_printed STRING, + issn_online STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true'); -- Insert statement that takes into account the piwik_id of the openAIRE graph @@ -32,7 +34,9 @@ SELECT substr(d1.id, 4) AS id, d1.latitude.value AS latitude, d1.longitude.value AS longitude, d1.websiteurl.value AS websiteurl, - d1.openairecompatibility.classid AS compatibility + d1.openairecompatibility.classid AS compatibility, + d1.journal.issnprinted AS issn_printed, + d1.journal.issnonline AS issn_online FROM ${openaire_db_name}.datasource d1 LEFT OUTER JOIN (SELECT id, split(originalidd, '\\:')[1] as piwik_id @@ -51,7 +55,7 @@ CREATE TABLE ${stats_db_name}.dual INSERT INTO ${stats_db_name}.dual VALUES ('X'); INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`, - `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`) + `piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`) SELECT 'other', 'Other', 'Repository', @@ -62,7 +66,9 @@ SELECT 'other', NULL, NULL, NULL, - 'unknown' + 'unknown', + null, + null FROM ${stats_db_name}.dual WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository'); DROP TABLE ${stats_db_name}.dual; @@ -97,13 +103,4 @@ where d.datainfo.deletedbyinference = false; CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS SELECT datasource AS id, id AS result -FROM ${stats_db_name}.result_datasources; - --- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_tmp COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_languages COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_oids COMPUTE STATISTICS FOR COLUMNS; --- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS; --- ANALYZE TABLE ${stats_db_name}.datasource_organizations COMPUTE STATISTICS FOR COLUMNS; \ No newline at end of file +FROM ${stats_db_name}.result_datasources; \ No newline at end of file diff --git a/pom.xml b/pom.xml index 6e4526e41..fc4a8a21b 100644 --- a/pom.xml +++ b/pom.xml @@ -205,6 +205,11 @@ dateparser 1.0.7 + + me.xuender + unidecode + 0.0.7 + com.google.guava