diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index b04d62dd2..73efeabb4 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp 1.2.4-SNAPSHOT - ../ + ../pom.xml dhp-schemas diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java new file mode 100644 index 000000000..11bce26c8 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/orcid/OrcidDOI.java @@ -0,0 +1,24 @@ +package eu.dnetlib.dhp.schema.orcid; + +import java.util.List; + +public class OrcidDOI { + private String doi; + private List authors; + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public List getAuthors() { + return authors; + } + + public void setAuthors(List authors) { + this.authors = authors; + } +} diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index d1f6d8613..5ba01357e 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -200,7 +200,7 @@ case object Crossref2Oaf { a.setSurname(family) a.setFullname(s"$given $family") if (StringUtils.isNotBlank(orcid)) - a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava) + a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava) a } @@ -248,7 +248,7 @@ case object Crossref2Oaf { def snsfRule(award:String): String = { - var tmp1 = StringUtils.substringAfter(award,"_") + val tmp1 = StringUtils.substringAfter(award,"_") val tmp2 = StringUtils.substringBefore(tmp1,"/") logger.debug(s"From $award to $tmp2") tmp2 @@ -294,7 +294,7 @@ case object Crossref2Oaf { } def getProjectId (nsPrefix:String, targetId:String):String = { - "40|$nsPrefix::$targetId" + s"40|$nsPrefix::$targetId" } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala index 996ba5585..4a39a2987 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefDataset.scala @@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.commons.io.IOUtils +import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} @@ -12,21 +13,23 @@ import org.slf4j.{Logger, LoggerFactory} object CrossrefDataset { + val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) - def extractTimestamp(input:String): Long = { + + def to_item(input:String):CrossrefDT = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - - (json\"indexed"\"timestamp").extractOrElse[Long](0) + val ts:Long = (json \ "indexed" \ "timestamp").extract[Long] + val doi:String = (json \ "DOI").extract[String] + CrossrefDT(doi, input, ts) } - def main(args: Array[String]): Unit = { - val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) + val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) parser.parseArgument(args) @@ -49,9 +52,8 @@ object CrossrefDataset { if (a == null) return b - val tb = extractTimestamp(b.json) - val ta = extractTimestamp(a.json) - if(ta >tb) { + + if(a.timestamp >b.timestamp) { return a } b @@ -63,9 +65,7 @@ object CrossrefDataset { if (a == null) return b - val tb = extractTimestamp(b.json) - val ta = extractTimestamp(a.json) - if(ta >tb) { + if(a.timestamp >b.timestamp) { return a } b @@ -78,15 +78,21 @@ object CrossrefDataset { override def finish(reduction: CrossrefDT): CrossrefDT = reduction } - val sourcePath:String = parser.get("sourcePath") - val targetPath:String = parser.get("targetPath") + val workingPath:String = parser.get("workingPath") - val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT] - ds.groupByKey(_.doi) + val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT] + + + val update = + spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text]) + .map(i =>CrossrefImporter.decompressBlob(i._2.toString)) + .map(i =>to_item(i))) + + main_ds.union(update).groupByKey(_.doi) .agg(crossrefAggregator.toColumn) .map(s=>s._2) - .write.mode(SaveMode.Overwrite).save(targetPath) + .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated") } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index 08319058c..0272cb1a6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -34,85 +34,21 @@ object SparkMapDumpIntoOAF { implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation] implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset] - val sc = spark.sparkContext val targetPath = parser.get("targetPath") import spark.implicits._ - spark.read.load(parser.get("sourcePath")).as[CrossrefDT] .flatMap(k => Crossref2Oaf.convert(k.json)) .filter(o => o != null) .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject") - val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf] - ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.save(s"$targetPath/publication") + ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication") - ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.save(s"$targetPath/relation") + ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation") - ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.save(s"$targetPath/dataset") - - - -// -// -// -// sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text]) -// .map(k => k._2.toString).map(CrossrefImporter.decompressBlob) -// .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject") -// -// val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null) -// -// val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication]) -// .map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) => -// var r = if (p1 == null) p2 else p1 -// if (p1 != null && p2 != null) { -// if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) { -// if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp) -// r = p2 -// else -// r = p1 -// } else { -// r = if (p1.getLastupdatetimestamp == null) p2 else p1 -// } -// } -// r -// }.map(_._2) -// -// val pubs:Dataset[Publication] = spark.createDataset(distinctPubs) -// pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication") -// -// -// val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset]) -// .map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) => -// var r = if (p1 == null) p2 else p1 -// if (p1 != null && p2 != null) { -// if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) { -// if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp) -// r = p2 -// else -// r = p1 -// } else { -// r = if (p1.getLastupdatetimestamp == null) p2 else p1 -// } -// } -// r -// }.map(_._2) -// -// spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset") -// -// -// -// val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation]) -// .map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r)) -// .reduceByKey { case (p1: Relation, p2: Relation) => -// if (p1 == null) p2 else p1 -// }.map(_._2) -// -// val rels: Dataset[Relation] = spark.createDataset(distinctRels) -// -// rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations") + ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset") } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml index a9cc9ea3c..63c2e9ef2 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml @@ -16,88 +16,86 @@ sparkExecutorCores number of cores used by single executor - - - - + + timestamp + Timestamp for incremental Harvesting + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.crossref.CrossrefImporter + -t${workingPath}/input/crossref/index_update + -n${nameNode} + -ts${timestamp} + + + + + + + + yarn-cluster + cluster + ExtractCrossrefToOAF + eu.dnetlib.doiboost.crossref.CrossrefDataset + dhp-doiboost-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + ${sparkExtraOPT} + + --workingPath/data/doiboost/input/crossref + --masteryarn-cluster + + + + + + + + + + + + + - - - - - - - - - - - - - - - - + yarn-cluster cluster - ExtractCrossrefToOAF + ConvertCrossrefToOAF eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF dhp-doiboost-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 ${sparkExtraOPT} --sourcePath${workingPath}/input/crossref/crossref_ds - --targetPath${workingPath}/input/crossref + --targetPath${workingPath}/process/ --masteryarn-cluster - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json index 312bd0751..23c0fdabc 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json @@ -1,6 +1,5 @@ [ - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true}, {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 2a6fd3a1d..fbb20f143 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -190,15 +190,6 @@ public class CleaningFunctions { } } - final Set collectedFrom = Optional - .ofNullable(r.getCollectedfrom()) - .map( - c -> c - .stream() - .map(KeyValue::getKey) - .collect(Collectors.toCollection(HashSet::new))) - .orElse(new HashSet<>()); - for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); diff --git a/dhp-workflows/dhp-stats-promote/pom.xml b/dhp-workflows/dhp-stats-promote/pom.xml new file mode 100644 index 000000000..c64c2f58e --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/pom.xml @@ -0,0 +1,32 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.2.4-SNAPSHOT + + 4.0.0 + dhp-stats-promote + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + false + + + + + diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml new file mode 100644 index 000000000..9331d4ac5 --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -0,0 +1,34 @@ + + + jobTracker + ${jobTracker} + + + nameNode + ${nameNode} + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + oozie.wf.workflow.notification.url + {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + + stats_tool_api_url + ${stats_tool_api_url} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh new file mode 100644 index 000000000..70112dc7b --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh @@ -0,0 +1,18 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +echo "Getting file from " $3 +hdfs dfs -copyToLocal $3 + +echo "Running impala shell make the new database visible" +impala-shell -q "INVALIDATE METADATA;" + +echo "Running impala shell to compute new table stats" +impala-shell -d $1 -f $2 +echo "Impala shell finished" +rm $2 diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/promoteCache.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/promoteCache.sh new file mode 100644 index 000000000..2d28377fb --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/promoteCache.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +curl --request GET $1/cache/promoteCache + diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql new file mode 100644 index 000000000..34e48a18a --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql @@ -0,0 +1,8 @@ +------------------------------------------------------ +------------------------------------------------------ +-- Impala table statistics - Needed to make the tables +-- visible for impala +------------------------------------------------------ +------------------------------------------------------ + +INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql new file mode 100644 index 000000000..48f8d58fd --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql @@ -0,0 +1,207 @@ +------------------------------------------------------ +------------------------------------------------------ +-- Shadow schema table exchange +------------------------------------------------------ +------------------------------------------------------ + +-- Dropping old views +DROP VIEW IF EXISTS ${stats_db_production_name}.category; +DROP VIEW IF EXISTS ${stats_db_production_name}.concept; +DROP VIEW IF EXISTS ${stats_db_production_name}.context; +DROP VIEW IF EXISTS ${stats_db_production_name}.country; +DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp; +DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results; +DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.funder; +DROP VIEW IF EXISTS ${stats_db_production_name}.fundref; +DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country; +DROP VIEW IF EXISTS ${stats_db_production_name}.organization; +DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects; +DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics; +DROP VIEW IF EXISTS ${stats_db_production_name}.project; +DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations; +DROP VIEW IF EXISTS ${stats_db_production_name}.project_results; +DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount; +DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics; +DROP VIEW IF EXISTS ${stats_db_production_name}.result; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics; +DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture; +DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap; +DROP VIEW IF EXISTS ${stats_db_production_name}.software; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources; +DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics; + + +-- Creating the shadow database, in case it doesn't exist +CREATE database IF NOT EXISTS ${stats_db_production_name}; + +-- Creating new views +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; +CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml new file mode 100644 index 000000000..d744f18da --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -0,0 +1,87 @@ + + + + stats_db_name + the target stats database name + + + stats_db_production_name + the name of the production schema + + + stats_tool_api_url + The url of the API of the stats tool. Is used to trigger the cache promote. + + + hive_metastore_uris + hive server metastore URIs + + + hive_jdbc_url + hive server jdbc url + + + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. + + + + + ${jobTracker} + ${nameNode} + + + hive.metastore.uris + ${hive_metastore_uris} + + + hive.txn.timeout + ${hive_timeout} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${hive_jdbc_url} + + stats_db_name=${stats_db_name} + stats_db_production_name=${stats_db_production_name} + + + + + + + + ${jobTracker} + ${nameNode} + impala-shell.sh + ${stats_db_production_name} + computeProductionStats.sql + ${wf:appPath()}/scripts/computeProductionStats.sql + impala-shell.sh + + + + + + + + ${jobTracker} + ${nameNode} + promoteCache.sh + ${stats_tool_api_url} + promoteCache.sh + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml index 2cd53a37b..9331d4ac5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/config-default.xml @@ -27,4 +27,8 @@ oozie.wf.workflow.notification.url {serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status + + stats_tool_api_url + ${stats_tool_api_url} + \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql index 21a944164..b4745535d 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step6.sql @@ -11,7 +11,7 @@ CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids -- Project_organizations Table DROP TABLE IF EXISTS ${stats_db_name}.project_organizations; -CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization'; +CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization' and r.datainfo.deletedbyinference=false; -- Project_results Table DROP TABLE IF EXISTS ${stats_db_name}.project_results; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index 7acabf1dd..36a4a8a49 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -25,7 +25,7 @@ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * FROM ${stats_db_ CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * FROM ${stats_db_name}.publication_topics UNION ALL SELECT * FROM ${stats_db_name}.software_topics UNION ALL SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; DROP TABLE IF EXISTS ${stats_db_name}.result_organization; -CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization'; +CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization' and r.datainfo.deletedbyinference=false; DROP TABLE IF EXISTS ${stats_db_name}.result_projects; CREATE TABLE ${stats_db_name}.result_projects AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id=pr.result JOIN ${stats_db_name}.project_tmp p ON p.id=pr.id; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 4e13b3dd8..197047c8b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -47,7 +47,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids; CREATE TABLE ${stats_db_name}.datasource_oids AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids; DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations; -CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization'; +CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization' and r.datainfo.deletedbyinference=false; -- datasource sources: -- where the datasource info have been collected from. diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh new file mode 100644 index 000000000..36e74a556 --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +curl --request GET $1/cache/updateCache + diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index d6cc14e25..dcd034166 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -17,6 +17,10 @@ stats_db_shadow_name the name of the shadow schema + + stats_tool_api_url + The url of the API of the stats tool. Is used to trigger the cache update. + hive_metastore_uris hive server metastore URIs @@ -255,7 +259,7 @@ ${hive_jdbc_url} - + stats_db_name=${stats_db_name} stats_db_shadow_name=${stats_db_shadow_name} @@ -283,10 +287,22 @@ ${nameNode} impala-shell.sh ${stats_db_shadow_name} - step19.sql - ${wf:appPath()}/scripts/step19.sql + computeProductionStats.sql + ${wf:appPath()}/scripts/computeProductionStats.sql impala-shell.sh + + + + + + + ${jobTracker} + ${nameNode} + updateCache.sh + ${stats_tool_api_url} + updateCache.sh + diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index f1167b184..190c9847e 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -7,7 +7,7 @@ eu.dnetlib.dhp dhp 1.2.4-SNAPSHOT - ../ + ../pom.xml dhp-workflows