Compare commits

..

46 Commits

Author SHA1 Message Date
Michele Artini 6e380993d6 added filter for typenorm field 2024-03-12 10:09:38 +01:00
Michele Artini f9416ab2aa minor 2024-03-12 09:20:37 +01:00
Michele Artini 3e56b88a5f base transformation rule test 2024-03-11 14:48:37 +01:00
Michele Artini dfb05ebedb unitTest 2024-03-11 11:44:04 +01:00
Michele Artini 341c3f798e xslt 2024-03-11 10:50:17 +01:00
Michele Artini 4145065481 mapping for ODF 2024-03-08 15:31:08 +01:00
Michele Artini 5e9102d404 xslt 2024-03-08 14:41:25 +01:00
Michele Artini dc9d642e66 xslt 2024-03-08 12:37:54 +01:00
Michele Artini 932173287a added a base vocabulary 2024-03-08 11:52:58 +01:00
Michele Artini 3b5163d8e3 fixed a xpath 2024-03-08 08:23:07 +01:00
Michele Artini 6648d710a3 xslt 2024-03-07 13:06:41 +01:00
Michele Artini a059747f16 xslt 2024-03-07 10:57:41 +01:00
Michele Artini 1e34585213 file renaming 2024-03-07 09:52:49 +01:00
Michele Artini 108478b778 xslt 2024-03-07 09:34:16 +01:00
Michele Artini 6500151c90 initial version of the BASE Transformation rule 2024-03-05 14:18:57 +01:00
Michele Artini af58cd726e initial version of the BASE Transformation rule 2024-03-05 11:48:36 +01:00
Michele Artini efbb6c37d6 updated sql for base 2024-03-05 09:37:04 +01:00
Michele Artini b206e9a30b sql to create the BASE datasource 2024-03-04 15:13:34 +01:00
Michele Artini db6f774394 selection of the new plugin 2024-03-04 10:43:40 +01:00
Michele Artini 9506d80ddc changed sql to select accepted datasources 2024-03-04 08:25:40 +01:00
Michele Artini c2b6841eb0 opendoar datasource filter 2024-03-01 15:32:56 +01:00
Michele Artini be7f327e88 opendoar datasource filter 2024-03-01 13:38:36 +01:00
Michele Artini 32f4d6f691 reports for types 2024-03-01 11:43:37 +01:00
Michele Artini 71204a8056 some fields in stats 2024-02-29 10:17:31 +01:00
Michele Artini 5ddbef3a5b new stats 2024-02-28 14:34:09 +01:00
Michele Artini 04dd31139b separator in sql query 2024-02-28 11:12:25 +01:00
Michele Artini 3d14bef381 OpenDoar reports 2024-02-28 10:51:13 +01:00
Michele Artini f8cf7ffbcb stats 2024-02-22 14:01:11 +01:00
Michele Artini d2b7541583 fixed a problem with Dataset model 2024-02-16 11:36:46 +01:00
Michele Artini 8ffdd9747d added id to BaseRecordInfo 2024-02-15 14:27:50 +01:00
Michele Artini da65728afe produce a parquet file 2024-02-15 14:04:17 +01:00
Michele Artini e254720377 fixed path reports 2024-02-15 08:52:28 +01:00
Michele Artini 8d85c1e97e used a parser STAX 2024-02-15 08:21:52 +01:00
Michele Artini b42e2b4d61 fixed log class 2024-02-14 15:52:31 +01:00
Michele Artini 773346f638 increased memory 2024-02-14 14:40:27 +01:00
Michele Artini 2e11197142 removed invalid deletion 2024-02-14 11:59:30 +01:00
Michele Artini ddd6a7ceb3 minor fixes 2024-02-14 11:39:37 +01:00
Michele Artini 963a2500be new reports in hadoop job 2024-02-14 10:37:39 +01:00
Michele Artini 4b1ecad4e2 prepared a job to analyze the BASE records 2024-02-13 13:48:26 +01:00
Michele Artini dd7350ecf2 fixed a problem with xpaths 2024-02-13 08:36:42 +01:00
Michele Artini 265bfd364d refactoing 2024-02-12 15:35:36 +01:00
Michele Artini 16766c514e refactoring 2024-02-12 12:19:57 +01:00
Michele Artini 5add433b74 partial refactoring 2024-02-09 14:33:04 +01:00
Michele Artini c974c75f83 partial refactoring 2024-02-09 12:36:20 +01:00
Michele Artini c6db6335b9 prepare filtering for base import 2024-02-06 15:10:29 +01:00
Michele Artini abcd81bba0 first implementation of the collection plugin for BASE 2024-02-05 15:19:41 +01:00
86 changed files with 1596 additions and 3998 deletions

View File

@ -312,8 +312,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
}
if (value instanceof Datasource) {
final Datasource d = (Datasource) value;
return Objects.nonNull(d.getOfficialname()) && StringUtils.isNotBlank(d.getOfficialname().getValue());
// nothing to evaluate here
} else if (value instanceof Project) {
final Project p = (Project) value;
return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue());

View File

@ -1,39 +0,0 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.actionmanager.promote;
/** Encodes the Actionset promotion strategies */
public class PromoteAction {
/** The supported actionset promotion strategies
*
* ENRICH: promotes only records in the actionset matching another record in the
* graph and enriches them applying the given MergeAndGet strategy
* UPSERT: promotes all the records in an actionset, matching records are updated
* using the given MergeAndGet strategy, the non-matching record as inserted as they are.
*/
public enum Strategy {
ENRICH, UPSERT
}
/**
* Returns the string representation of the join type implementing the given PromoteAction.
*
* @param strategy the strategy to be used to promote the Actionset contents
* @return the join type used to implement the promotion strategy
*/
public static String joinTypeForStrategy(PromoteAction.Strategy strategy) {
switch (strategy) {
case ENRICH:
return "left_outer";
case UPSERT:
return "full_outer";
default:
throw new IllegalStateException("unsupported PromoteAction: " + strategy.toString());
}
}
}

View File

@ -67,9 +67,8 @@ public class PromoteActionPayloadForGraphTableJob {
String outputGraphTablePath = parser.get("outputGraphTablePath");
logger.info("outputGraphTablePath: {}", outputGraphTablePath);
MergeAndGet.Strategy mergeAndGetStrategy = MergeAndGet.Strategy
.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("mergeAndGetStrategy: {}", mergeAndGetStrategy);
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("strategy: {}", strategy);
Boolean shouldGroupById = Optional
.ofNullable(parser.get("shouldGroupById"))
@ -77,12 +76,6 @@ public class PromoteActionPayloadForGraphTableJob {
.orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById);
PromoteAction.Strategy promoteActionStrategy = Optional
.ofNullable(parser.get("promoteActionStrategy"))
.map(PromoteAction.Strategy::valueOf)
.orElse(PromoteAction.Strategy.UPSERT);
logger.info("promoteActionStrategy: {}", promoteActionStrategy);
@SuppressWarnings("unchecked")
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
@SuppressWarnings("unchecked")
@ -104,8 +97,7 @@ public class PromoteActionPayloadForGraphTableJob {
inputGraphTablePath,
inputActionPayloadPath,
outputGraphTablePath,
mergeAndGetStrategy,
promoteActionStrategy,
strategy,
rowClazz,
actionPayloadClazz,
shouldGroupById);
@ -132,16 +124,14 @@ public class PromoteActionPayloadForGraphTableJob {
String inputGraphTablePath,
String inputActionPayloadPath,
String outputGraphTablePath,
MergeAndGet.Strategy mergeAndGetStrategy,
PromoteAction.Strategy promoteActionStrategy,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
Dataset<G> result = promoteActionPayloadForGraphTable(
rowDS, actionPayloadDS, mergeAndGetStrategy, promoteActionStrategy, rowClazz, actionPayloadClazz,
shouldGroupById)
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
saveGraphTable(result, outputGraphTablePath);
@ -193,8 +183,7 @@ public class PromoteActionPayloadForGraphTableJob {
private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
Dataset<G> rowDS,
Dataset<A> actionPayloadDS,
MergeAndGet.Strategy mergeAndGetStrategy,
PromoteAction.Strategy promoteActionStrategy,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz,
Boolean shouldGroupById) {
@ -206,9 +195,8 @@ public class PromoteActionPayloadForGraphTableJob {
SerializableSupplier<Function<G, String>> rowIdFn = ModelSupport::idFn;
SerializableSupplier<Function<A, String>> actionPayloadIdFn = ModelSupport::idFn;
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet
.functionFor(mergeAndGetStrategy);
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(mergeAndGetStrategy);
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
@ -219,7 +207,6 @@ public class PromoteActionPayloadForGraphTableJob {
rowIdFn,
actionPayloadIdFn,
mergeRowWithActionPayloadAndGetFn,
promoteActionStrategy,
rowClazz,
actionPayloadClazz);

View File

@ -34,7 +34,6 @@ public class PromoteActionPayloadFunctions {
* @param rowIdFn Function used to get the id of graph table row
* @param actionPayloadIdFn Function used to get id of action payload instance
* @param mergeAndGetFn Function used to merge graph table row and action payload instance
* @param promoteActionStrategy the Actionset promotion strategy
* @param rowClazz Class of graph table
* @param actionPayloadClazz Class of action payload
* @param <G> Type of graph table row
@ -47,7 +46,6 @@ public class PromoteActionPayloadFunctions {
SerializableSupplier<Function<G, String>> rowIdFn,
SerializableSupplier<Function<A, String>> actionPayloadIdFn,
SerializableSupplier<BiFunction<G, A, G>> mergeAndGetFn,
PromoteAction.Strategy promoteActionStrategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz) {
if (!isSubClass(rowClazz, actionPayloadClazz)) {
@ -63,7 +61,7 @@ public class PromoteActionPayloadFunctions {
.joinWith(
actionPayloadWithIdDS,
rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")),
PromoteAction.joinTypeForStrategy(promoteActionStrategy))
"full_outer")
.map(
(MapFunction<Tuple2<Tuple2<String, G>, Tuple2<String, A>>, G>) value -> {
Optional<G> rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2);

View File

@ -41,12 +41,6 @@
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
"paramRequired": true
},
{
"paramName": "pas",
"paramLongName": "promoteActionStrategy",
"paramDescription": "strategy for promoting the actionset contents into the graph tables, ENRICH or UPSERT (default)",
"paramRequired": false
},
{
"paramName": "sgid",
"paramLongName": "shouldGroupById",

View File

@ -115,7 +115,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
@ -168,7 +167,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>

View File

@ -106,7 +106,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/datasource</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -106,7 +106,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/organization</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -114,7 +114,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
@ -167,7 +166,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>

View File

@ -106,7 +106,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/project</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -115,7 +115,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
@ -168,7 +167,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>

View File

@ -107,7 +107,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/relation</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -114,7 +114,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
@ -167,7 +166,6 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>

View File

@ -54,7 +54,7 @@ public class PromoteActionPayloadFunctionsTest {
RuntimeException.class,
() -> PromoteActionPayloadFunctions
.joinGraphTableWithActionPayloadAndMerge(
null, null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
}
@Test
@ -104,7 +104,6 @@ public class PromoteActionPayloadFunctionsTest {
rowIdFn,
actionPayloadIdFn,
mergeAndGetFn,
PromoteAction.Strategy.UPSERT,
OafImplSubSub.class,
OafImplSubSub.class)
.collectAsList();
@ -184,7 +183,6 @@ public class PromoteActionPayloadFunctionsTest {
rowIdFn,
actionPayloadIdFn,
mergeAndGetFn,
PromoteAction.Strategy.UPSERT,
OafImplSubSub.class,
OafImplSub.class)
.collectAsList();

View File

@ -58,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
log.info("outputPath path is {}", outputPath);
final CollectorPlugin plugin = getCollectorPlugin();
@ -68,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
fileSystem.getConf(),
SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class),
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
.keyClass(IntWritable.class),
SequenceFile.Writer
.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
plugin
.collect(api, report)
.forEach(
content -> {
key.set(counter.getAndIncrement());
value.set(content);
try {
writer.append(key, value);
} catch (Throwable e) {
throw new RuntimeException(e);
}
});
} catch (Throwable e) {
report.put(e.getClass().getName(), e.getMessage());
.collect(this.api, this.report)
.forEach(content -> {
key.set(counter.getAndIncrement());
value.set(content);
try {
writer.append(key, value);
} catch (final Throwable e) {
throw new RuntimeException(e);
}
});
} catch (final Throwable e) {
this.report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e);
} finally {
shutdown();
report.ongoing(counter.longValue(), counter.longValue());
this.report.ongoing(counter.longValue(), counter.longValue());
}
}
private void scheduleReport(AtomicInteger counter) {
private void scheduleReport(final AtomicInteger counter) {
schedule(new ReporterCallback() {
@Override
public Long getCurrent() {
return counter.longValue();
@ -112,33 +112,33 @@ public class CollectorWorker extends ReportingJob {
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) {
case oai:
return new OaiCollectorPlugin(clientParams);
return new OaiCollectorPlugin(this.clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
return new RestCollectorPlugin(this.clientParams);
case file:
return new FileCollectorPlugin(fileSystem);
return new FileCollectorPlugin(this.fileSystem);
case fileGzip:
return new FileGZipCollectorPlugin(fileSystem);
return new FileGZipCollectorPlugin(this.fileSystem);
case baseDump:
return new BaseCollectorPlugin(this.fileSystem);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))
.ofNullable(this.api.getParams().get("other_plugin_type"))
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
switch (plugin) {
case mdstore_mongodb_dump:
return new MongoDbDumpCollectorPlugin(fileSystem);
return new MongoDbDumpCollectorPlugin(this.fileSystem);
case mdstore_mongodb:
return new MDStoreCollectorPlugin();
default:
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
}
default:
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
}
}

View File

@ -0,0 +1,379 @@
package eu.dnetlib.dhp.collection.plugin.base;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DeflateCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import scala.Tuple2;
public class BaseAnalyzerJob {
private static final String BASE_DUMP = "BASE_DUMP";
private static final Logger log = LoggerFactory.getLogger(BaseAnalyzerJob.class);
public static void main(final String[] args) throws Exception {
final String jsonConfiguration = IOUtils
.toString(
BaseAnalyzerJob.class
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String dataPath = parser.get("dataPath");
log.info("dataPath {}: ", dataPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final String opendoarPath = parser.get("opendoarPath");
log.info("opendoarPath {}: ", opendoarPath);
final String typesReportPath = parser.get("typesReportPath");
log.info("typesReportPath {}: ", typesReportPath);
final int fromStep = Integer.parseInt(parser.get("fromStep"));
log.info("fromStep {}: ", fromStep);
final String dbUrl = parser.get("postgresUrl");
log.info("postgresUrl {}: ", dbUrl);
final String dbUser = parser.get("postgresUser");
log.info("postgresUser {}: ", dbUser);
final String dbPassword = parser.get("postgresPassword");
log.info("postgresPassword {}: ", dbPassword);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
if (fromStep <= 0) {
log
.info(
"\n**************************************\n* EXECUTING STEP 0: LoadRecords\n**************************************");
loadRecords(inputPath, dataPath);
log
.info(
"\n**************************************\n* EXECUTING STEP 0: DONE\n**************************************");
}
if (fromStep <= 1) {
log
.info(
"\n**************************************\n* EXECUTING STEP 1: Base Report\n**************************************");
generateReport(spark, dataPath, outputPath);
log
.info(
"\n**************************************\n* EXECUTING STEP 1: DONE\n**************************************");
}
if (fromStep <= 2) {
log
.info(
"\n**************************************\n* EXECUTING STEP 2: OpenDOAR Report\n**************************************");
generateOpenDoarReport(spark, outputPath, opendoarPath, loadOpenDoarStats(dbUrl, dbUser, dbPassword));
log
.info(
"\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
}
if (fromStep <= 3) {
log
.info(
"\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
generateVocTypeReport(spark, outputPath, typesReportPath);
log
.info(
"\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
}
});
}
private static void generateVocTypeReport(final SparkSession spark,
final String reportPath,
final String typesReportPath) {
spark
.read()
.parquet(reportPath)
.as(Encoders.bean(BaseRecordInfo.class))
.flatMap(rec -> {
final List<Tuple2<String, String>> list = new ArrayList<>();
for (final String t1 : rec.getTypes()) {
if (t1.startsWith("TYPE_NORM:")) {
for (final String t2 : rec.getTypes()) {
if (t2.startsWith("TYPE:")) {
list
.add(
new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(),
StringUtils.substringAfter(t2, "TYPE:").trim()));
}
}
}
}
return list.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.distinct()
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(typesReportPath);
}
private static void generateOpenDoarReport(final SparkSession spark,
final String reportPath,
final String opendoarPath,
final List<OpenDoarRepoStatus> repos) {
final Dataset<OpenDoarRepoStatus> fromDB = spark.createDataset(repos, Encoders.bean(OpenDoarRepoStatus.class));
final Dataset<OpenDoarRepoStatus> fromBASE = spark
.read()
.parquet(reportPath)
.selectExpr("explode(collections) as collection")
.where("isnotnull(collection.opendoarId) and character_length(collection.opendoarId)>0")
.selectExpr("concat('opendoar____::',collection.opendoarId) as id")
.groupBy(col("id"))
.agg(count(col("id")))
.map(row -> {
final OpenDoarRepoStatus repo = new OpenDoarRepoStatus();
repo.setId(row.getString(0));
repo.getAggregations().put(BASE_DUMP, row.getLong(1));
repo.setBaseCount(row.getLong(1));
repo.setOpenaireCount(0);
repo.setHighCompliance(false);
return repo;
}, Encoders.bean(OpenDoarRepoStatus.class));
fromDB
.joinWith(fromBASE, fromDB.col("id").equalTo(fromBASE.col("id")), "full_outer")
.map(t -> merge(t._1, t._2), Encoders.bean(OpenDoarRepoStatus.class))
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(opendoarPath);
}
private static OpenDoarRepoStatus merge(final OpenDoarRepoStatus r1, final OpenDoarRepoStatus r2) {
if (r1 == null) {
return r2;
}
if (r2 == null) {
return r1;
}
final OpenDoarRepoStatus r = new OpenDoarRepoStatus();
r.setId(ObjectUtils.firstNonNull(r1.getId(), r2.getId()));
r.setJurisdiction(ObjectUtils.firstNonNull(r1.getJurisdiction(), r2.getJurisdiction()));
r.getAggregations().putAll(r1.getAggregations());
r.getAggregations().putAll(r2.getAggregations());
r.setHighCompliance(r1.isHighCompliance() || r2.isHighCompliance());
r.setBaseCount(Math.max(r1.getBaseCount(), r2.getBaseCount()));
r.setOpenaireCount(Math.max(r1.getOpenaireCount(), r2.getOpenaireCount()));
return r;
}
private static List<OpenDoarRepoStatus> loadOpenDoarStats(final String dbUrl,
final String dbUser,
final String dbPassword) throws Exception {
final List<OpenDoarRepoStatus> repos = new ArrayList<>();
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
final String sql = IOUtils
.toString(
BaseAnalyzerJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql"));
dbClient.processResults(sql, row -> {
try {
final OpenDoarRepoStatus repo = new OpenDoarRepoStatus();
repo.setId(row.getString("id"));
repo.setJurisdiction(row.getString("jurisdiction"));
repo.setBaseCount(0);
repo.setHighCompliance(false);
long sum = 0;
for (final String s : (String[]) row.getArray("aggregations").getArray()) {
final String api = StringUtils.substringBefore(s, "@@@");
final long count = NumberUtils.toLong(StringUtils.substringAfter(s, "@@@"), 0);
sum += count;
repo.getAggregations().put(api, count);
// This should recognize the HIGH Compliances: openaire*X.Y*
if (s.contains("compliance: openaire")) {
repo.setHighCompliance(true);
}
}
repo.setOpenaireCount(sum);
repos.add(repo);
log.info("# FOUND OPENDOAR (DB): " + repo.getId());
} catch (final SQLException e) {
log.error("Error in SQL", e);
throw new RuntimeException("Error in SQL", e);
}
});
}
return repos;
}
private static void loadRecords(final String inputPath, final String outputPath) throws Exception {
try (final FileSystem fs = FileSystem.get(new Configuration());
final AggregatorReport report = new AggregatorReport()) {
final AtomicLong recordsCounter = new AtomicLong(0);
final LongWritable key = new LongWritable();
final Text value = new Text();
try (final SequenceFile.Writer writer = SequenceFile
.createWriter(
fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
.keyClass(LongWritable.class),
SequenceFile.Writer
.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final BaseCollectorIterator iteraror = new BaseCollectorIterator(fs, new Path(inputPath), report);
while (iteraror.hasNext()) {
final String record = iteraror.next();
final long i = recordsCounter.incrementAndGet();
if ((i % 10000) == 0) {
log.info("# Loaded records: " + i);
}
key.set(i);
value.set(record);
try {
writer.append(key, value);
} catch (final Throwable e1) {
throw new RuntimeException(e1);
}
}
log.info("# COMPLETED - Loaded records: " + recordsCounter.get());
}
}
}
private static void generateReport(final SparkSession spark,
final String inputPath,
final String targetPath) throws Exception {
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.sequenceFile(inputPath, LongWritable.class, Text.class)
.map(s -> s._2.toString())
.map(BaseAnalyzerJob::extractInfo);
spark
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class))
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(targetPath);
}
protected static BaseRecordInfo extractInfo(final String s) {
try {
final Document record = DocumentHelper.parseText(s);
final BaseRecordInfo info = new BaseRecordInfo();
final Set<String> paths = new LinkedHashSet<>();
final Set<String> types = new LinkedHashSet<>();
final List<BaseCollectionInfo> colls = new ArrayList<>();
for (final Object o : record.selectNodes("//*|//@*")) {
paths.add(((Node) o).getPath());
if (o instanceof Element) {
final Element n = (Element) o;
final String nodeName = n.getName();
if ("collection".equals(nodeName)) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName)) {
final BaseCollectionInfo coll = new BaseCollectionInfo();
coll.setId(collName);
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
coll.setRorId(n.valueOf("@ror_id").trim());
colls.add(coll);
}
} else if ("type".equals(nodeName)) {
types.add("TYPE: " + n.getText().trim());
} else if ("typenorm".equals(nodeName)) {
types.add("TYPE_NORM: " + n.getText().trim());
}
}
}
info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
info.getTypes().addAll(types);
info.getPaths().addAll(paths);
info.setCollections(colls);
return info;
} catch (final DocumentException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -45,22 +45,15 @@ public class BaseCollectorPlugin implements CollectorPlugin {
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
// the path of the dump file on HDFS
// http://oai.base-search.net/initial_load/base_oaipmh_dump-current.tar
// it could be downloaded from iis-cdh5-test-gw.ocean.icm.edu.pl and then copied on HDFS
// get path to file
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
// get the parameters for the connection to the OpenAIRE database.
// the database is used to obtain the list of the datasources that the plugin will collect
final String dbUrl = api.getParams().get("dbUrl");
final String dbUser = api.getParams().get("dbUser");
final String dbPassword = api.getParams().get("dbPassword");
// the types(comma separated, empty value for all) that the plugin will collect,
// the types should be expressed in the format of the normalized types of BASE (for example 1,121,...)
final String acceptedNormTypesString = api.getParams().get("acceptedNormTypes");
log.info("baseUrl: {}", filePath);
@ -70,9 +63,7 @@ public class BaseCollectorPlugin implements CollectorPlugin {
log.info("acceptedNormTypes: {}", acceptedNormTypesString);
try {
if (!this.fs.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
if (!this.fs.exists(filePath)) { throw new CollectorException("path does not exist: " + filePath); }
} catch (final Throwable e) {
throw new CollectorException(e);
}
@ -91,19 +82,19 @@ public class BaseCollectorPlugin implements CollectorPlugin {
final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
return StreamSupport
.stream(spliterator, false)
.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
.stream(spliterator, false)
.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
}
private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
throws CollectorException {
throws CollectorException {
final Set<String> accepted = new HashSet<>();
try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
final String sql = IOUtils
.toString(
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
.toString(BaseAnalyzerJob.class
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
dbClient.processResults(sql, row -> {
try {
@ -127,26 +118,20 @@ public class BaseCollectorPlugin implements CollectorPlugin {
}
protected static boolean filterXml(final String xml,
final Set<String> acceptedOpendoarIds,
final Set<String> acceptedNormTypes) {
final Set<String> acceptedOpendoarIds,
final Set<String> acceptedNormTypes) {
try {
final Document doc = DocumentHelper.parseText(xml);
final String id = doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim();
if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) {
return false;
}
if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) { return false; }
if (acceptedNormTypes.isEmpty()) {
return true;
}
if (acceptedNormTypes.isEmpty()) { return true; }
for (final Object s : doc.selectNodes("//*[local-name()='typenorm']")) {
if (acceptedNormTypes.contains(((Node) s).getText().trim())) {
return true;
}
if (acceptedNormTypes.contains(((Node) s).getText().trim())) { return true; }
}
return false;

View File

@ -0,0 +1,71 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
public class OpenDoarRepoStatus implements Serializable {
private static final long serialVersionUID = 4832658700366871160L;
private String id;
private String jurisdiction;
private boolean highCompliance = false;
private long baseCount = 0;
private long openaireCount = 0;
private Map<String, Long> aggregations = new HashMap<>();
public String getId() {
return this.id;
}
public void setId(final String id) {
this.id = id;
}
public String getJurisdiction() {
return this.jurisdiction;
}
public void setJurisdiction(final String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public Map<String, Long> getAggregations() {
return this.aggregations;
}
public void setAggregations(final Map<String, Long> aggregations) {
this.aggregations = aggregations;
}
public boolean isHighCompliance() {
return this.highCompliance;
}
public void setHighCompliance(final boolean highCompliance) {
this.highCompliance = highCompliance;
}
public long getOpenaireCount() {
return this.openaireCount;
}
public void setOpenaireCount(final long openaireCount) {
this.openaireCount = openaireCount;
}
public long getBaseCount() {
return this.baseCount;
}
public void setBaseCount(final long baseCount) {
this.baseCount = baseCount;
}
}

View File

@ -0,0 +1,56 @@
[
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the path of the BASE dump",
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dataPath",
"paramDescription": "the path of the loaded records",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the generated the report",
"paramRequired": true
},
{
"paramName": "od",
"paramLongName": "opendoarPath",
"paramDescription": "the path of the generated the OpenDOAR report",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "typesReportPath",
"paramDescription": "the path of the generated the types report",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "fromStep",
"paramDescription": "the initial step (numeric, 0 for ALL STEPS)",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,58 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>10G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,79 @@
<workflow-app name="Analyze_BASE_Records" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>baseInputPath</name>
<description>the path of the BASE dump</description>
</property>
<property>
<name>baseDataPath</name>
<description>the path where to store BASE records</description>
</property>
<property>
<name>baseReportsPath</name>
<description>path where to store the reports</description>
</property>
<property>
<name>baseOpenDoarReportsPath</name>
<description>path where to store the OpenDOAR reports</description>
</property>
<property>
<name>baseTypesReportPath</name>
<description>path of the generated the types report</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>baseFromStep</name>
<description>the initial step (numeric, 0 for ALL STEPS)</description>
</property>
</parameters>
<start to="analyzeBaseRecords"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="analyzeBaseRecords">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>AnalyzeBaseRecords</name>
<class>eu.dnetlib.dhp.collection.plugin.base.BaseAnalyzerJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${baseInputPath}</arg>
<arg>--dataPath</arg><arg>${baseDataPath}</arg>
<arg>--outputPath</arg><arg>${baseReportsPath}</arg>
<arg>--opendoarPath</arg><arg>${baseOpenDoarReportsPath}</arg>
<arg>--typesReportPath</arg><arg>${baseTypesReportPath}</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--fromStep</arg><arg>${baseFromStep}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -99,16 +99,4 @@ INSERT INTO dsm_apiparams(
'***'
);
INSERT INTO dsm_apiparams(
_dnet_resource_identifier_,
api,
param,
value
) VALUES (
'api_________::openaire____::base_search::dump@@acceptedNormTypes',
'api_________::openaire____::base_search::dump',
'acceptedNormTypes',
'1,11,111,121,13,14,15,18,181,182,183,1A,6,7'
);
COMMIT;

View File

@ -2,8 +2,6 @@ select s.id as id
from dsm_services s
where collectedfrom = 'openaire____::opendoar'
and jurisdiction = 'Institutional'
and s.id in (
select service from dsm_api where coalesce(compatibility_override, compatibility) = 'driver' or coalesce(compatibility_override, compatibility) = 'UNKNOWN'
) and s.id not in (
select service from dsm_api where coalesce(compatibility_override, compatibility) like '%openaire%'
);
and s.id not in (
select service from dsm_api where coalesce(compatibility_override, compatibility) like '%openaire%' or last_collection_total > 0
);

View File

@ -1048,10 +1048,5 @@
"openaire_id": "re3data_____::r3d100010399",
"datacite_name": "ZEW Forschungsdatenzentrum",
"official_name": "ZEW Forschungsdatenzentrum"
},
"HBP.NEUROINF": {
"openaire_id": "fairsharing_::2975",
"datacite_name": "EBRAINS",
"official_name": "EBRAINS"
}
}

View File

@ -6,7 +6,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@ -22,7 +21,6 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
@ -119,7 +117,7 @@ public class BaseCollectorIteratorTest {
final List<BaseRecordInfo> ls = new ArrayList<>();
for (int i = 0; i < 10; i++) {
ls.add(extractInfo(xml));
ls.add(BaseAnalyzerJob.extractInfo(xml));
}
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
@ -133,52 +131,4 @@ public class BaseCollectorIteratorTest {
df.show(false);
}
private BaseRecordInfo extractInfo(final String s) {
try {
final Document record = DocumentHelper.parseText(s);
final BaseRecordInfo info = new BaseRecordInfo();
final Set<String> paths = new LinkedHashSet<>();
final Set<String> types = new LinkedHashSet<>();
final List<BaseCollectionInfo> colls = new ArrayList<>();
for (final Object o : record.selectNodes("//*|//@*")) {
paths.add(((Node) o).getPath());
if (o instanceof Element) {
final Element n = (Element) o;
final String nodeName = n.getName();
if ("collection".equals(nodeName)) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName)) {
final BaseCollectionInfo coll = new BaseCollectionInfo();
coll.setId(collName);
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
coll.setRorId(n.valueOf("@ror_id").trim());
colls.add(coll);
}
} else if ("type".equals(nodeName)) {
types.add("TYPE: " + n.getText().trim());
} else if ("typenorm".equals(nodeName)) {
types.add("TYPE_NORM: " + n.getText().trim());
}
}
}
info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
info.getTypes().addAll(types);
info.getPaths().addAll(paths);
info.setCollections(colls);
return info;
} catch (final DocumentException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.collection.plugin.base;
import static org.junit.jupiter.api.Assertions.assertFalse;

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.IOException;
@ -66,9 +65,9 @@ public class BaseTransfomationTest extends AbstractVocabularyTest {
private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
final String xslt = new SAXReader()
.read(this.getClass().getResourceAsStream(path))
.selectSingleNode("//CODE/*")
.asXML();
.read(this.getClass().getResourceAsStream(path))
.selectSingleNode("//CODE/*")
.asXML();
final LongAccumulator la = new LongAccumulator();

View File

@ -122,41 +122,22 @@ public class DedupRecordFactory {
}
return Stream
.concat(
Stream
.of(agg.getDedupId())
.map(id -> createDedupOafEntity(id, agg.entity, dataInfo, ts)),
agg.aliases
.stream()
.map(id -> createMergedDedupAliasOafEntity(id, agg.entity, dataInfo, ts)))
.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
.map(id -> {
try {
OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
res.setId(id);
res.setDataInfo(dataInfo);
res.setLastupdatetimestamp(ts);
return res;
} catch (Exception e) {
throw new RuntimeException(e);
}
})
.iterator();
}, beanEncoder);
}
private static OafEntity createDedupOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
try {
OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
res.setId(id);
res.setDataInfo(dataInfo);
res.setLastupdatetimestamp(ts);
return res;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private static OafEntity createMergedDedupAliasOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
try {
OafEntity res = createDedupOafEntity(id, base, dataInfo, ts);
DataInfo ds = (DataInfo) BeanUtils.cloneBean(dataInfo);
ds.setDeletedbyinference(true);
res.setDataInfo(ds);
return res;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {
if (duplicate == null) {

View File

@ -15,12 +15,4 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>pivotHistoryDatabase</name>
<value>&#x200B;</value>
</property>
</configuration>

View File

@ -198,8 +198,6 @@
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
</spark>
<ok to="PrepareOrgRels"/>
<error to="Kill"/>

View File

@ -73,6 +73,12 @@
"name": "Irish Nephrology Society",
"synonym": []
},
{
"id": "100011062",
"uri": "http://dx.doi.org/10.13039/100011062",
"name": "Asian Spinal Cord Network",
"synonym": []
},
{
"id": "100011096",
"uri": "http://dx.doi.org/10.13039/100011096",
@ -217,6 +223,12 @@
"name": "Global Brain Health Institute",
"synonym": []
},
{
"id": "100015776",
"uri": "http://dx.doi.org/10.13039/100015776",
"name": "Health and Social Care Board",
"synonym": []
},
{
"id": "100015992",
"uri": "http://dx.doi.org/10.13039/100015992",
@ -391,6 +403,18 @@
"name": "Irish Hospice Foundation",
"synonym": []
},
{
"id": "501100001596",
"uri": "http://dx.doi.org/10.13039/501100001596",
"name": "Irish Research Council for Science, Engineering and Technology",
"synonym": []
},
{
"id": "501100001597",
"uri": "http://dx.doi.org/10.13039/501100001597",
"name": "Irish Research Council for the Humanities and Social Sciences",
"synonym": []
},
{
"id": "501100001598",
"uri": "http://dx.doi.org/10.13039/501100001598",
@ -491,7 +515,7 @@
"id": "501100002081",
"uri": "http://dx.doi.org/10.13039/501100002081",
"name": "Irish Research Council",
"synonym": ["501100001596", "501100001597"]
"synonym": []
},
{
"id": "501100002736",

View File

@ -587,15 +587,7 @@ case object Crossref2Oaf {
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
"10.13039/501100013589" | "10.13039/501100000271" =>
generateSimpleRelationFromAward(funder, "ukri________", a => a)
//HFRI
case "10.13039/501100013209" =>
generateSimpleRelationFromAward(funder, "hfri________", a => a)
val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
//ERASMUS+
case "10.13039/501100010790" =>
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
case _ => logger.debug("no match for " + funder.DOI.get)
}

View File

@ -23,10 +23,15 @@ class CrossrefMappingTest {
val mapper = new ObjectMapper()
@Test
def testMissingAuthorParser():Unit = {
val json: String = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/s41567-022-01757-y.json")).mkString
def testMissingAuthorParser(): Unit = {
val json: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/s41567-022-01757-y.json"))
.mkString
val result = Crossref2Oaf.convert(json)
result.filter(o => o.isInstanceOf[Publication]).map(p=> p.asInstanceOf[Publication]).foreach(p =>assertTrue(p.getAuthor.size()>0))
result
.filter(o => o.isInstanceOf[Publication])
.map(p => p.asInstanceOf[Publication])
.foreach(p => assertTrue(p.getAuthor.size() > 0))
}
@Test

View File

@ -53,8 +53,6 @@ public class Constraints implements Serializable {
for (Constraint sc : constraint) {
boolean verified = false;
if(!param.containsKey(sc.getField()))
return false;
for (String value : param.get(sc.getField())) {
if (sc.verifyCriteria(value.trim())) {
verified = true;

View File

@ -317,7 +317,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
p.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB

View File

@ -238,23 +238,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
(Element) doc
.selectSingleNode(
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
.map(e -> {
final String resourceTypeURI = Optional
.ofNullable(e.attributeValue("uri"))
.filter(StringUtils::isNotBlank)
.orElse(null);
final String resourceTypeAnyURI = Optional
.ofNullable(e.attributeValue("anyURI"))
.filter(StringUtils::isNotBlank)
.orElse(null);
final String resourceTypeTxt = Optional
.ofNullable(e.getText())
.filter(StringUtils::isNotBlank)
.orElse(null);
final String resourceTypeGeneral = Optional
.ofNullable(e.attributeValue("resourceTypeGeneral"))
.filter(StringUtils::isNotBlank)
.orElse(null);
.map(element -> {
final String resourceTypeURI = element.attributeValue("uri");
final String resourceTypeAnyURI = element.attributeValue("anyURI");
final String resourceTypeTxt = element.getText();
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
return ObjectUtils
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);

View File

@ -33,7 +33,7 @@ SELECT
dc.officialname AS collectedfromname,
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree

View File

@ -33,7 +33,7 @@ SELECT
dc.officialname AS collectedfromname,
p.contracttypeclass || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p

View File

@ -93,8 +93,8 @@ object CopyHdfsOafSparkApplication {
hasSource != null && hasTarget != null
} else {
val hasId = (json \ "id").extractOrElse[String](null)
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String]("")
hasId != null && oafType.startsWith(resultType)
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null)
hasId != null && oafType.equalsIgnoreCase(resultType)
}
}

View File

@ -59,19 +59,7 @@ public class CopyHdfsOafSparkApplicationTest {
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json")),
"publication"));
}
@Test
void isOafType_Datacite_ORP() throws IOException {
assertTrue(
CopyHdfsOafSparkApplication
.isOafType(
IOUtils
.toString(
getClass()
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json")),
"otherresearchproduct"));
}
}

View File

@ -1171,34 +1171,6 @@ class MappersTest {
}
@Test
void test_Zenodo2() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo2.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size());
Publication p = cleanup((Publication) list.get(0), vocs);
assertNotNull(p.getInstance());
assertEquals(1, p.getInstance().size());
final Instance instance = p.getInstance().get(0);
assertNotNull(instance.getInstanceTypeMapping());
assertEquals(1, instance.getInstanceTypeMapping().size());
Optional<InstanceTypeMapping> coarType = instance
.getInstanceTypeMapping()
.stream()
.filter(itm -> ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
.findFirst();
assertTrue(coarType.isPresent());
assertNotNull(coarType.get().getOriginalType());
assertNull(coarType.get().getTypeCode());
assertNull(coarType.get().getTypeLabel());
}
@Test
void testROHub2() throws IOException {
final String xml = IOUtils
@ -1257,7 +1229,7 @@ class MappersTest {
}
@Test
void testD4ScienceTraining() throws IOException {
public void testD4ScienceTraining() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-1-training.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1268,7 +1240,7 @@ class MappersTest {
}
@Test
void testD4ScienceDataset() throws IOException {
public void testD4ScienceDataset() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-2-dataset.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);

View File

@ -1,59 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-3"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>oai:zenodo.org:1596086</identifier>
<datestamp>2020-01-20T13:50:28Z</datestamp>
<setSpec>openaire</setSpec>
<dr:dateOfTransformation>2024-02-08T11:03:10.994Z</dr:dateOfTransformation>
<dri:objIdentifier>od______2659::036d5555a6688ed00c8d0da97bdece3b</dri:objIdentifier>
<dri:dateOfCollection>2024-02-08T11:03:10.994Z</dri:dateOfCollection>
<dri:dateOfTransformation>2024-02-08T11:03:10.994Z</dri:dateOfTransformation>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
<identifier identifierType="URL">https://zenodo.org/record/1596086</identifier>
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3"/>
<creators>
<creator>
<creatorName>Bonney, T. G.</creatorName>
<givenName>T. G.</givenName>
<familyName>Bonney</familyName>
</creator>
</creators>
<titles>
<title>Ice Blocks on a Moraine</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>1889</publicationYear>
<dates>
<date dateType="Issued">1889-08-22</date>
</dates>
<resourceType resourceTypeGeneral="JournalArticle"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsIdenticalTo"
>10.1038/040391a0</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="https://creativecommons.org/publicdomain/zero/1.0/legalcode"
>Creative Commons Zero v1.0 Universal</rights>
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract">n/a</description>
</descriptions>
</resource>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>1889-08-22</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>http://creativecommons.org/publicdomain/zero/1.0/legalcode</oaf:license>
<oaf:language/>
<oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
<oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
</metadata>
</record>

View File

@ -185,7 +185,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -213,7 +212,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -241,7 +239,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -269,7 +266,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -297,7 +293,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -325,7 +320,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -353,7 +347,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -393,7 +386,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -422,7 +414,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -451,7 +442,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -480,7 +470,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -509,7 +498,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -538,7 +526,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -567,7 +554,6 @@
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-hist-snaps</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,223 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB

View File

@ -1,41 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
SOURCE=$1
PRODUCTION=$2
SHADOW=$3
MONITOR_PROD=$4
MONITOR_IRISH_PROD=$5
echo ${SOURCE}
echo ${PRODUCTION}
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Updating ${PRODUCTION} historical snapshots database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production monitor db ready!"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_PROD}.historical_snapshots_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots as select * from ${SOURCE}.historical_snapshots"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_PROD}.historical_snapshots_fos as select * from ${SOURCE}.historical_snapshots_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish"
impala-shell -i impala-cluster-dn1.openaire.eu -q "drop view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish as select * from ${SOURCE}.historical_snapshots_irish"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos as select * from ${SOURCE}.historical_snapshots_irish"

View File

@ -1,27 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
#update Monitor DB IRISH
#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -1,82 +0,0 @@
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_fos;
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
rf.lvl1,
rf.lvl2,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_fos rf on rf.id=r.id
left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_fos purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_fos_tmp;
drop table if exists ${monitor_db_name}.historical_snapshots_fos purge;
create table ${monitor_db_name}.historical_snapshots_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_fos;
drop table ${hist_db_name}.historical_snapshots_fos_tmp purge;
INSERT INTO ${hist_db_name}.historical_snapshots_tmp as
SELECT * FROM ${hist_db_name_prev}.historical_snapshots;
INSERT INTO ${hist_db_name}.historical_snapshots_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots purge;
CREATE TABLE ${hist_db_name}.historical_snapshots STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_tmp;
drop table if exists ${monitor_db_name}.historical_snapshots purge;
create table ${monitor_db_name}.historical_snapshots stored as parquet
as select * from ${hist_db_name}.historical_snapshots;
drop table ${hist_db_name}.historical_snapshots_tmp purge;

View File

@ -1,91 +0,0 @@
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish_fos;
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
rf.lvl1,
rf.lvl2,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_irish_db_name}.result r
left outer join ${stats_irish_db_name}.result_fos rf on rf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_irish_fos purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_fos_irish_tmp;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish;
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
select
cast(${hist_date} as STRING),
count(distinct r.id),
r.type,
pf.publicly_funded,
r.access_mode,
r.gold,
r.green,
coalesce(gl.green_with_license,0),
h.is_hybrid,
b.is_bronze_oa,
d.in_diamond_journal,
t.is_transformative,
pr.refereed
from ${stats_irish_db_name}.result r
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
drop table if exists ${hist_db_name}.historical_snapshots_irish purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish STORED AS PARQUET AS
SELECT * FROM ${hist_db_name}.historical_snapshots_irish_tmp;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish;
drop table ${hist_db_name}.historical_snapshots_irish_tmp purge;
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;

View File

@ -1,92 +0,0 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Historical Snapshots database creation
--------------------------------------------------------------
--------------------------------------------------------------
DROP database IF EXISTS ${hist_db_name} CASCADE;
CREATE database ${hist_db_name};
drop table if exists ${hist_db_name}.historical_snapshots_fos_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_tmp
(
hist_date STRING,
total INT,
type STRING,
lvl1 STRING,
lvl2 STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp
(
hist_date STRING,
total INT,
type STRING,
lvl1 STRING,
lvl2 STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_tmp
(
hist_date STRING,
total INT,
type STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
drop table if exists ${hist_db_name}.historical_snapshots_irish_tmp purge;
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_tmp
(
hist_date STRING,
total INT,
type STRING,
publicly_funded INT,
accessrights STRING,
gold INT,
green INT,
green_with_license INT,
hybrid INT,
bronze INT,
diamond INT,
transformative INT,
peer_reviewed STRING
)
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');

View File

@ -1,159 +0,0 @@
<workflow-app name="Stats Hist Snapshots" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>hist_db_name</name>
<description>the target hist database name</description>
</property>
<property>
<name>hist_db_name_prev</name>
<description>the hist database name of previous_month</description>
</property>
<property>
<name>stats_db_name</name>
<description>the stats db name</description>
</property>
<property>
<name>stats_irish_db_name</name>
<description>the stats irish db name</description>
</property>
<property>
<name>monitor_db_name</name>
<description>the monitor db name</description>
</property>
<property>
<name>monitor_irish_db_name</name>
<description>the irish monitor db name</description>
</property>
<property>
<name>hist_db_prod_name</name>
<description>the production db</description>
</property>
<property>
<name>hist_db_shadow_name</name>
<description>the production shadow db</description>
</property>
<property>
<name>hist_date</name>
<description>the snaps date</description>
</property>
<property>
<name>hive_metastore_uris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hive_jdbc_url</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="CreateDB">${wf:conf('resumeFrom') eq 'CreateDB'}</case>
<case to="BuildHistSnaps">${wf:conf('resumeFrom') eq 'BuildHistSnaps'}</case>
<case to="BuildHistSnapsIrish">${wf:conf('resumeFrom') eq 'BuildHistSnapsIrish'}</case>
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
<default to="BuildHistSnaps"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="CreateDB">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/CreateDB.sql</script>
<param>hist_db_name=${hist_db_name}</param>
</hive2>
<ok to="BuildHistSnaps"/>
<error to="Kill"/>
</action>
<action name="BuildHistSnaps">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/BuildHistSnapsAll.sql</script>
<param>hist_db_name=${hist_db_name}</param>
<param>hist_db_name_prev=${hist_db_name_prev}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>monitor_db_name=${monitor_db_name}</param>
<param>hist_date=${hist_date}</param>
</hive2>
<ok to="BuildHistSnapsIrish"/>
<error to="Kill"/>
</action>
<action name="BuildHistSnapsIrish">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/BuildHistSnapsIrish.sql</script>
<param>hist_db_name=${hist_db_name}</param>
<param>hist_db_name_prev=${hist_db_name_prev}</param>
<param>stats_irish_db_name=${stats_irish_db_name}</param>
<param>monitor_irish_db_name=${monitor_irish_db_name}</param>
<param>hist_date=${hist_date}</param>
</hive2>
<ok to="Step2-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step2-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${hist_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="Step3-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step3-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${hist_db_name}</argument>
<argument>${hist_db_prod_name}</argument>
<argument>${hist_db_shadow_name}</argument>
<argument>${monitor_db_prod_name}</argument>
<argument>${monitor_irish_db_prod_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-monitor-irish</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,222 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
#HADOOP_USER_NAME=$2
copydb $MONITOR_DB

View File

@ -1,23 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
SOURCE=$1
PRODUCTION=$2
echo ${SOURCE}
echo ${PRODUCTION}
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
echo "Updating ${PRODUCTION} monitor database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production monitor db ready!"

View File

@ -1,28 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export GRAPHDB=$5
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
#update Monitor DB IRISH
#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
cat buildIrishMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$5/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"

View File

@ -1,241 +0,0 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
drop table if exists TARGET.irish_funders;
create TEMPORARY table TARGET.irish_funders as
select distinct xpath_string(fundingtree[0].value, '//funder/name') as funder from GRAPHDB.project
where xpath_string(fundingtree[0].value, '//funder/jurisdiction')='IE';
--create TEMPORARY table TARGET.irish_funders as
--select distinct name as funder from SOURCE.fundref where country='IE';
drop table if exists TARGET.result;
create table TARGET.result stored as parquet as
select distinct * from (
select r.*
from SOURCE.result r
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
union all
select r.*
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization and o.country='IE'
union all
select r.*
from SOURCE.result r
join SOURCE.result_pids pid on pid.id=r.id
join stats_ext.transformative_facts tf on tf.doi=pid.pid
) foo;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
create table TARGET.funder stored as parquet as select * from SOURCE.funder where country='IE';
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
-- projects
create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
-- indicators
-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s;
create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);

View File

@ -1,118 +0,0 @@
<workflow-app name="Irish Monitor Update" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>
<description>the target stats database name</description>
</property>
<property>
<name>graph_db_name</name>
<description>the graph database name</description>
</property>
<property>
<name>monitor_irish_db_name</name>
<description>the target monitor db name</description>
</property>
<property>
<name>monitor_irish_db_prod_name</name>
<description>the name of the production monitor db</description>
</property>
<property>
<name>monitor_irish_db_shadow_name</name>
<description>the name of the shadow monitor db</description>
</property>
<property>
<name>hive_metastore_uris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hive_jdbc_url</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="Step1-buildIrishMonitorDB">${wf:conf('resumeFrom') eq 'Step1-buildIrishMonitorDB'}</case>
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
<default to="Step1-buildIrishMonitorDB"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="Step1-buildIrishMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>monitor_irish.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${monitor_irish_db_name}</argument>
<argument>${monitor_irish_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/buildIrishMonitorDB.sql</argument>
<argument>${graph_db_name}</argument>
<file>monitor_irish.sh</file>
</shell>
<ok to="Step2-copyDataToImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step2-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${monitor_irish_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="Step3-finalizeImpalaCluster"/>
<error to="Kill"/>
</action>
<action name="Step3-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${monitor_irish_db_name}</argument>
<argument>${monitor_irish_db_prod_name}</argument>
<argument>${monitor_irish_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-monitor-update</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<version>2.1.11</version>
<configuration>
<failOnNoGitDirectory>false</failOnNoGitDirectory>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
</property>
</configuration>

View File

@ -1,223 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$2
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
done
new_num_of_views_to_retry=${#should_retry_create_view_statements}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
}
MONITOR_DB=$1
copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB

View File

@ -1,57 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
#
#function createShadowDB() {
# SOURCE=$1
# SHADOW=$2
#
# # drop views from db
# for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited -q "show tables"`;
# do
# `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "drop view $i;"`;
# done
#
# impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
# impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
## impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
#}
#
#MONITOR_DB=$1
#MONITOR_DB_SHADOW=$2
#
#createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow'
#createShadowDB $MONITOR_DB $MONITOR_DB'_shadow'
SOURCE=$1
PRODUCTION=$2
echo ${SOURCE}
echo ${PRODUCTION}
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
#
#echo "Updating ${PRODUCTION}_institutions database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}_institutions"
#impala-shell -d ${PRODUCTION}_institutions -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}_institutions./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE}_institutions -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}_institutions.\1 as select * from ${SOURCE}_institutions.\1;/" | impala-shell -c -f -
#echo "Production insitutions db ready!"
echo "Updating ${PRODUCTION} monitor database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production monitor db ready!"
echo "Updating ${PRODUCTION}_institutions database"
impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${PRODUCTION}_institutions"
impala-shell -i impala-cluster-dn1.openaire.eu -d ${PRODUCTION}_institutions -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}_institutions./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE}_institutions -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}_institutions.\1 as select * from ${SOURCE}_institutions.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
echo "Production insitutions db ready!"

View File

@ -1,60 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export SCRIPT_PATH2=$5
export SCRIPT_PATH2=$6
export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
echo "Getting file from " $5
hdfs dfs -copyToLocal $5
echo "Getting file from " $6
hdfs dfs -copyToLocal $6
#update Monitor DB
cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
hive $HIVE_OPTS -f foo
#update Institutions DB
cat updateMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
cat updateMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
echo "Hive shell finished"
#echo "Updating shadow monitor insitutions database"
#hive -e "drop database if exists ${SHADOW}_institutions cascade"
#hive -e "create database if not exists ${SHADOW}_institutions"
#hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
#hive -f foo
#echo "Shadow db monitor insitutions ready!"
#
##update Monitor DB
#cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
#hive $HIVE_OPTS -f foo
#
#echo "Hive shell finished"
#
#echo "Updating shadow monitor database"
#hive -e "drop database if exists ${SHADOW} cascade"
#hive -e "create database if not exists ${SHADOW}"
#hive $HIVE_OPTS --database ${2} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${2}.\1;/" > foo
#hive -f foo
#echo "Shadow db monitor insitutions ready!"

View File

@ -1,278 +0,0 @@
--drop database if exists TARGET cascade;
--create database if not exists TARGET;
--
--create view if not exists TARGET.category as select * from SOURCE.category;
--create view if not exists TARGET.concept as select * from SOURCE.concept;
--create view if not exists TARGET.context as select * from SOURCE.context;
--create view if not exists TARGET.country as select * from SOURCE.country;
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--
--create table TARGET.result stored as parquet as
-- select distinct * from (
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
-- ) )) foo;
--
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
-- projects
create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
-- indicators
-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id);

View File

@ -1,297 +0,0 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
))) foo;
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;
create view if not exists TARGET.country as select * from SOURCE.country;
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
create view if not exists TARGET.funder as select * from SOURCE.funder;
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
-- datasources
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
-- organizations
create view if not exists TARGET.organization as select * from SOURCE.organization;
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
-- projects
create view if not exists TARGET.project as select * from SOURCE.project;
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
-- indicators
-- Sprint 1 ----
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
-- Sprint 2 ----
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
---- Sprint 3 ----
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
---- Sprint 4 ----
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
---- Sprint 5 ----
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
---- Sprint 6 ----
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
---- Sprint 7 ----
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
--create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id);
--create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
--create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
--create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
--create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
--create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
--create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;

View File

@ -1,67 +0,0 @@
drop database if exists TARGET cascade;
create database if not exists TARGET;
create table TARGET.result stored as parquet as
select distinct * from (
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa
'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
))) foo;
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -1,111 +0,0 @@
<workflow-app name="Stats Monitor Update" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>
<description>the target stats database name</description>
</property>
<property>
<name>monitor_db_name</name>
<description>the target monitor db name</description>
</property>
<property>
<name>monitor_db_shadow_name</name>
<description>the name of the shadow monitor db</description>
</property>
<property>
<name>hive_metastore_uris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>hive_jdbc_url</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hive_timeout</name>
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
</property>
<property>
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="Step1-updateMonitorDB">${wf:conf('resumeFrom') eq 'Step1-updateMonitorDB'}</case>
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
<default to="Step1-updateMonitorDB"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="Step1-updateMonitorDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>monitor.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/updateMonitorDB_institutions.sql</argument>
<argument>${wf:appPath()}/scripts/updateMonitorDB.sql</argument>
<argument>${wf:appPath()}/scripts/updateMonitorDBAll.sql</argument>
<file>monitor.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="Step2-copyDataToImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<argument>${monitor_db_name}</argument>
<argument>${hadoop_user_name}</argument>
<file>copyDataToImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="Step3-finalizeImpalaCluster">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>finalizeImpalaCluster.sh</exec>
<argument>${monitor_db_name}</argument>
<argument>${monitor_db_prod_name}</argument>
<argument>${monitor_db_shadow_name}</argument>
<file>finalizeImpalaCluster.sh</file>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -8,11 +8,6 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-update</artifactId>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>

View File

@ -35,20 +35,12 @@ export HADOOP_USER="oozie"
export HADOOP_USER_NAME="oozie"
echo "Creating and populating impala tables"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv"
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv"
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv"
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
echo "Cleaning up"
rm concepts.csv

View File

@ -6,215 +6,68 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
IMPALA_HDFS_NODE=''
COUNTER=0
while [ $COUNTER -lt 3 ]; do
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
break
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
break
else
IMPALA_HDFS_NODE=''
sleep 1
fi
((COUNTER++))
done
if [ -z "$IMPALA_HDFS_NODE" ]; then
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
# Set sed arguments.
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
# Set the SED command arguments for column-names with reserved words:
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
export HADOOP_USER_NAME=$6
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
function copydb() {
db=$1
echo -e "\nStart processing db: '${db}'..\n"
FILE=("hive_wf_tmp_"$RANDOM)
hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
# copy the databases from ocean to impala
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log
return 1
fi
echo "copying $db"
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
# change ownership to impala
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb
# Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \
-copybuffersize 1048576 \
-strategy dynamic \
-pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
# Check the exit status of the "hadoop distcp" command.
if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n"
else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
rm -f error.log
return 2
fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
echo -e "\nCreating schema for db: '${db}'\n"
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=()
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
if [ -n "$create_view_statement_test" ]; then
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
all_create_view_statements+=("$create_view_statement")
else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
fi
fi
fi
done
echo -e "\nAll tables have been created, going to create the views..\n"
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else
echo -e "\nDB '${db}' does not contain any views.\n"
fi
level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again.
should_retry_create_view_statements=()
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement")
else
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi
# drop tables from db
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
done
new_num_of_views_to_retry=${#should_retry_create_view_statements}
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
# drop views from db
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
done
# delete the database
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
# create the databases
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
echo "creating schema for ${db}"
for (( k = 0; k < 5; k ++ )); do
for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
do
impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
# do
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
#
# # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
# do
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi
done
# load the data from /tmp in the respective tables
echo "copying data in tables and computing stats"
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
do
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
done
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n"
else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log
return 4
fi
rm -f error.log
echo -e "\n\nFinished processing db: ${db}\n\n"
# deleting the remaining directory from hdfs
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
}
STATS_DB=$1

View File

@ -85,12 +85,12 @@ hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/
hive -f foo
echo "Updated shadow monitor funded database"
echo "Updating shadow monitor institutions database"
echo "Updating shadow monitor insitutions database"
hive -e "drop database if exists ${SHADOW}_institutions cascade"
hive -e "create database if not exists ${SHADOW}_institutions"
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
hive -f foo
echo "Shadow db monitor institutions ready!"
echo "Shadow db monitor insitutions ready!"
echo "Updating shadow monitor RIs database"
for i in $contexts

View File

@ -69,7 +69,7 @@ SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
from (
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
FROM ${openaire_db_name}.result res

View File

@ -7,76 +7,32 @@
------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
select distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
where peer_reviewed.id is null) pr;
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
@ -104,4 +60,4 @@ rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;

View File

@ -50,13 +50,13 @@ select distinct r.*
from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
join ${stats_db_name}.result res on res.id=r.id;
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
select distinct r.id, r.amount, r.currency
select r.id, r.amount, r.currency
from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r

View File

@ -180,12 +180,4 @@ create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_opene
create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;

View File

@ -80,12 +80,8 @@ create table TARGET.result stored as parquet as
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna
) )) foo;
create view if not exists TARGET.category as select * from SOURCE.category;
@ -268,11 +264,4 @@ create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);

View File

@ -60,10 +60,5 @@ create table TARGET.result stored as parquet as
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna
))) foo;

View File

@ -2,8 +2,9 @@ create table ${observatory_db_name}.result_cc_licence stored as parquet as
select r.id, coalesce(rln.count, 0) > 0 as cc_licence
from ${stats_db_name}.result r
left outer join (
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
from ${stats_db_name}.result_licenses rl
left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license
group by rl.id
) rln on rln.id=r.id;

View File

@ -95,8 +95,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.funder purge;
create table ${stats_db_name}.funder STORED AS PARQUET as
select distinct xpath_string(fund, '//funder/id') as id,
xpath_string(fund, '//funder/name') as name,
xpath_string(fund, '//funder/shortname') as shortname,
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
xpath_string(fund, '//funder/shortname') as shortname
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge;

View File

@ -64,26 +64,6 @@
<name>hadoop_user_name</name>
<description>user name of the wf owner</description>
</property>
<property>
<name>sparkSqlWarehouseDir</name>
</property>
<!-- General oozie workflow properties -->
<property>
<name>sparkClusterOpts</name>
<value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
<description>spark cluster-wide options</description>
</property>
<property>
<name>sparkResourceOpts</name>
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
<description>spark resource options</description>
</property>
<property>
<name>sparkApplicationOpts</name>
<value>--conf spark.sql.shuffle.partitions=3840</value>
<description>spark resource options</description>
</property>
</parameters>
<global>
@ -95,21 +75,17 @@
<value>${hive_metastore_uris}</value>
</property>
<property>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
<name>hive.txn.timeout</name>
<value>${hive_timeout}</value>
</property>
<property>
<name>hive.mapjoin.followby.gby.localtask.max.memory.usage</name>
<value>0.80</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
<property>
<name>mapred.job.queue.name</name>
<value>analytics</value>
</property>
</configuration>
</global>
@ -157,164 +133,164 @@
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step1.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step2"/>
<error to="Kill"/>
</action>
<action name="Step2">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step2.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step3"/>
<error to="Kill"/>
</action>
<action name="Step3">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step3.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step4"/>
<error to="Kill"/>
</action>
<action name="Step4">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step4.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step5"/>
<error to="Kill"/>
</action>
<action name="Step5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step5.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step6"/>
<error to="Kill"/>
</action>
<action name="Step6">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step6.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step7"/>
<error to="Kill"/>
</action>
<action name="Step7">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step7.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step8"/>
<error to="Kill"/>
</action>
<action name="Step8">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step8.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step9"/>
<error to="Kill"/>
</action>
<action name="Step9">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step9.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step10"/>
<error to="Kill"/>
</action>
<action name="Step10">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step10.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<ok to="Step11"/>
<error to="Kill"/>
</action>
</action>
<action name="Step11">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step11.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<ok to="Step12"/>
<error to="Kill"/>
</action>
</action>
<action name="Step12">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step12.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step13"/>
<error to="Kill"/>
</action>
<action name="Step13">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step13.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step14"/>
<error to="Kill"/>
</action>
<action name="Step14">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step14.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step15"/>
<error to="Kill"/>
</action>
<action name="Step15">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step15.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
<param>stats_db_name=${stats_db_name}</param>
<param>openaire_db_name=${openaire_db_name}</param>
</hive2>
<ok to="Step15_5"/>
<error to="Kill"/>
@ -345,35 +321,13 @@
<error to="Kill"/>
</action>
<!-- <action name="Step16-createIndicatorsTables">-->
<!-- <hive2 xmlns="uri:oozie:hive2-action:0.1">-->
<!-- <jdbc-url>${hive_jdbc_url}</jdbc-url>-->
<!-- <script>scripts/step16-createIndicatorsTables.sql</script>-->
<!-- <param>stats_db_name=${stats_db_name}</param>-->
<!-- <param>external_stats_db_name=${external_stats_db_name}</param>-->
<!-- </hive2>-->
<!-- <ok to="Step16_1-definitions"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="Step16-createIndicatorsTables">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Step16-createIndicatorsTables</name>
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
<jar>dhp-stats-update-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
${sparkClusterOpts}
${sparkResourceOpts}
${sparkApplicationOpts}
</spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
</spark>
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<jdbc-url>${hive_jdbc_url}</jdbc-url>
<script>scripts/step16-createIndicatorsTables.sql</script>
<param>stats_db_name=${stats_db_name}</param>
<param>external_stats_db_name=${external_stats_db_name}</param>
</hive2>
<ok to="Step16_1-definitions"/>
<error to="Kill"/>
</action>
@ -433,18 +387,18 @@
<error to="Kill"/>
</action>
<!-- <action name="step20-createMonitorDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>monitor-post.sh</exec>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <file>monitor-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<!-- <action name="step20-createMonitorDB-post">-->
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <exec>monitor-post.sh</exec>-->
<!-- <argument>${monitor_db_name}</argument>-->
<!-- <argument>${monitor_db_shadow_name}</argument>-->
<!-- <file>monitor-post.sh</file>-->
<!-- </shell>-->
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="step21-createObservatoryDB-pre">
<shell xmlns="uri:oozie:shell-action:0.1">
@ -489,8 +443,8 @@
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>copyDataToImpalaCluster.sh</exec>
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
<!-- <argument>${external_stats_db_name}</argument>-->
<argument>${stats_db_name}</argument>
<argument>${monitor_db_name}</argument>
<argument>${observatory_db_name}</argument>
@ -551,4 +505,4 @@
</action>
<end name="End"/>
</workflow-app>
</workflow-app>

View File

@ -31,10 +31,6 @@
<module>dhp-enrichment</module>
<module>dhp-graph-provision</module>
<module>dhp-blacklist</module>
<module>dhp-stats-actionsets</module>
<module>dhp-stats-hist-snaps</module>
<module>dhp-stats-monitor-irish</module>
<module>dhp-stats-monitor-update</module>
<module>dhp-stats-update</module>
<module>dhp-stats-promote</module>
<module>dhp-usage-stats-build</module>