Compare commits
46 Commits
master
...
base_stats
Author | SHA1 | Date |
---|---|---|
Michele Artini | 6e380993d6 | |
Michele Artini | f9416ab2aa | |
Michele Artini | 3e56b88a5f | |
Michele Artini | dfb05ebedb | |
Michele Artini | 341c3f798e | |
Michele Artini | 4145065481 | |
Michele Artini | 5e9102d404 | |
Michele Artini | dc9d642e66 | |
Michele Artini | 932173287a | |
Michele Artini | 3b5163d8e3 | |
Michele Artini | 6648d710a3 | |
Michele Artini | a059747f16 | |
Michele Artini | 1e34585213 | |
Michele Artini | 108478b778 | |
Michele Artini | 6500151c90 | |
Michele Artini | af58cd726e | |
Michele Artini | efbb6c37d6 | |
Michele Artini | b206e9a30b | |
Michele Artini | db6f774394 | |
Michele Artini | 9506d80ddc | |
Michele Artini | c2b6841eb0 | |
Michele Artini | be7f327e88 | |
Michele Artini | 32f4d6f691 | |
Michele Artini | 71204a8056 | |
Michele Artini | 5ddbef3a5b | |
Michele Artini | 04dd31139b | |
Michele Artini | 3d14bef381 | |
Michele Artini | f8cf7ffbcb | |
Michele Artini | d2b7541583 | |
Michele Artini | 8ffdd9747d | |
Michele Artini | da65728afe | |
Michele Artini | e254720377 | |
Michele Artini | 8d85c1e97e | |
Michele Artini | b42e2b4d61 | |
Michele Artini | 773346f638 | |
Michele Artini | 2e11197142 | |
Michele Artini | ddd6a7ceb3 | |
Michele Artini | 963a2500be | |
Michele Artini | 4b1ecad4e2 | |
Michele Artini | dd7350ecf2 | |
Michele Artini | 265bfd364d | |
Michele Artini | 16766c514e | |
Michele Artini | 5add433b74 | |
Michele Artini | c974c75f83 | |
Michele Artini | c6db6335b9 | |
Michele Artini | abcd81bba0 |
|
@ -312,8 +312,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
if (value instanceof Datasource) {
|
||||
final Datasource d = (Datasource) value;
|
||||
return Objects.nonNull(d.getOfficialname()) && StringUtils.isNotBlank(d.getOfficialname().getValue());
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Project) {
|
||||
final Project p = (Project) value;
|
||||
return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue());
|
||||
|
|
|
@ -1,39 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2024.
|
||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.promote;
|
||||
|
||||
/** Encodes the Actionset promotion strategies and maps each of them to a join type. */
public class PromoteAction {

	/**
	 * The supported actionset promotion strategies.
	 *
	 * ENRICH: promotes only records in the actionset matching another record in the
	 * graph and enriches them applying the given MergeAndGet strategy.
	 * UPSERT: promotes all the records in an actionset, matching records are updated
	 * using the given MergeAndGet strategy, the non-matching records are inserted as they are.
	 */
	public enum Strategy {
		ENRICH, UPSERT
	}

	/**
	 * Returns the string representation of the join type implementing the given PromoteAction.
	 *
	 * @param strategy the strategy to be used to promote the Actionset contents, must not be null
	 * @return the join type used to implement the promotion strategy: "left_outer" for ENRICH,
	 *         "full_outer" for UPSERT
	 * @throws NullPointerException if strategy is null
	 * @throws IllegalStateException if the strategy is not handled by this method
	 */
	public static String joinTypeForStrategy(PromoteAction.Strategy strategy) {
		// A switch on a null enum already fails with a NullPointerException, but without any message:
		// fail with the same exception type and say which argument was missing.
		if (strategy == null) {
			throw new NullPointerException("strategy must not be null");
		}
		switch (strategy) {
			case ENRICH:
				return "left_outer";
			case UPSERT:
				return "full_outer";
			default:
				// only reachable if a new Strategy constant is added without updating this method
				throw new IllegalStateException("unsupported PromoteAction: " + strategy);
		}
	}
}
|
|
@ -67,9 +67,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
String outputGraphTablePath = parser.get("outputGraphTablePath");
|
||||
logger.info("outputGraphTablePath: {}", outputGraphTablePath);
|
||||
|
||||
MergeAndGet.Strategy mergeAndGetStrategy = MergeAndGet.Strategy
|
||||
.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
|
||||
logger.info("mergeAndGetStrategy: {}", mergeAndGetStrategy);
|
||||
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
|
||||
logger.info("strategy: {}", strategy);
|
||||
|
||||
Boolean shouldGroupById = Optional
|
||||
.ofNullable(parser.get("shouldGroupById"))
|
||||
|
@ -77,12 +76,6 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
.orElse(true);
|
||||
logger.info("shouldGroupById: {}", shouldGroupById);
|
||||
|
||||
PromoteAction.Strategy promoteActionStrategy = Optional
|
||||
.ofNullable(parser.get("promoteActionStrategy"))
|
||||
.map(PromoteAction.Strategy::valueOf)
|
||||
.orElse(PromoteAction.Strategy.UPSERT);
|
||||
logger.info("promoteActionStrategy: {}", promoteActionStrategy);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
|
||||
@SuppressWarnings("unchecked")
|
||||
|
@ -104,8 +97,7 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
inputGraphTablePath,
|
||||
inputActionPayloadPath,
|
||||
outputGraphTablePath,
|
||||
mergeAndGetStrategy,
|
||||
promoteActionStrategy,
|
||||
strategy,
|
||||
rowClazz,
|
||||
actionPayloadClazz,
|
||||
shouldGroupById);
|
||||
|
@ -132,16 +124,14 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
String inputGraphTablePath,
|
||||
String inputActionPayloadPath,
|
||||
String outputGraphTablePath,
|
||||
MergeAndGet.Strategy mergeAndGetStrategy,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
MergeAndGet.Strategy strategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
|
||||
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
|
||||
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
|
||||
|
||||
Dataset<G> result = promoteActionPayloadForGraphTable(
|
||||
rowDS, actionPayloadDS, mergeAndGetStrategy, promoteActionStrategy, rowClazz, actionPayloadClazz,
|
||||
shouldGroupById)
|
||||
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
|
||||
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
|
||||
|
||||
saveGraphTable(result, outputGraphTablePath);
|
||||
|
@ -193,8 +183,7 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
|
||||
Dataset<G> rowDS,
|
||||
Dataset<A> actionPayloadDS,
|
||||
MergeAndGet.Strategy mergeAndGetStrategy,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
MergeAndGet.Strategy strategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz,
|
||||
Boolean shouldGroupById) {
|
||||
|
@ -206,9 +195,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
|
||||
SerializableSupplier<Function<G, String>> rowIdFn = ModelSupport::idFn;
|
||||
SerializableSupplier<Function<A, String>> actionPayloadIdFn = ModelSupport::idFn;
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet
|
||||
.functionFor(mergeAndGetStrategy);
|
||||
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(mergeAndGetStrategy);
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
|
||||
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
|
||||
|
||||
|
@ -219,7 +207,6 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
rowIdFn,
|
||||
actionPayloadIdFn,
|
||||
mergeRowWithActionPayloadAndGetFn,
|
||||
promoteActionStrategy,
|
||||
rowClazz,
|
||||
actionPayloadClazz);
|
||||
|
||||
|
|
|
@ -34,7 +34,6 @@ public class PromoteActionPayloadFunctions {
|
|||
* @param rowIdFn Function used to get the id of graph table row
|
||||
* @param actionPayloadIdFn Function used to get id of action payload instance
|
||||
* @param mergeAndGetFn Function used to merge graph table row and action payload instance
|
||||
* @param promoteActionStrategy the Actionset promotion strategy
|
||||
* @param rowClazz Class of graph table
|
||||
* @param actionPayloadClazz Class of action payload
|
||||
* @param <G> Type of graph table row
|
||||
|
@ -47,7 +46,6 @@ public class PromoteActionPayloadFunctions {
|
|||
SerializableSupplier<Function<G, String>> rowIdFn,
|
||||
SerializableSupplier<Function<A, String>> actionPayloadIdFn,
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeAndGetFn,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz) {
|
||||
if (!isSubClass(rowClazz, actionPayloadClazz)) {
|
||||
|
@ -63,7 +61,7 @@ public class PromoteActionPayloadFunctions {
|
|||
.joinWith(
|
||||
actionPayloadWithIdDS,
|
||||
rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")),
|
||||
PromoteAction.joinTypeForStrategy(promoteActionStrategy))
|
||||
"full_outer")
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, G>, Tuple2<String, A>>, G>) value -> {
|
||||
Optional<G> rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2);
|
||||
|
|
|
@ -41,12 +41,6 @@
|
|||
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pas",
|
||||
"paramLongName": "promoteActionStrategy",
|
||||
"paramDescription": "strategy for promoting the actionset contents into the graph tables, ENRICH or UPSERT (default)",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "sgid",
|
||||
"paramLongName": "shouldGroupById",
|
||||
|
|
|
@ -115,7 +115,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
|
||||
|
@ -168,7 +167,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -106,7 +106,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/datasource</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -106,7 +106,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/organization</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -114,7 +114,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
|
||||
|
@ -167,7 +166,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -106,7 +106,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/project</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -115,7 +115,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
|
||||
|
@ -168,7 +167,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -107,7 +107,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/relation</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -114,7 +114,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
|
||||
|
@ -167,7 +166,6 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -54,7 +54,7 @@ public class PromoteActionPayloadFunctionsTest {
|
|||
RuntimeException.class,
|
||||
() -> PromoteActionPayloadFunctions
|
||||
.joinGraphTableWithActionPayloadAndMerge(
|
||||
null, null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
|
||||
null, null, null, null, null, OafImplSubSub.class, OafImpl.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -104,7 +104,6 @@ public class PromoteActionPayloadFunctionsTest {
|
|||
rowIdFn,
|
||||
actionPayloadIdFn,
|
||||
mergeAndGetFn,
|
||||
PromoteAction.Strategy.UPSERT,
|
||||
OafImplSubSub.class,
|
||||
OafImplSubSub.class)
|
||||
.collectAsList();
|
||||
|
@ -184,7 +183,6 @@ public class PromoteActionPayloadFunctionsTest {
|
|||
rowIdFn,
|
||||
actionPayloadIdFn,
|
||||
mergeAndGetFn,
|
||||
PromoteAction.Strategy.UPSERT,
|
||||
OafImplSubSub.class,
|
||||
OafImplSub.class)
|
||||
.collectAsList();
|
||||
|
|
|
@ -58,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
|
|||
|
||||
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
|
||||
|
||||
final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||
final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||
log.info("outputPath path is {}", outputPath);
|
||||
|
||||
final CollectorPlugin plugin = getCollectorPlugin();
|
||||
|
@ -68,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
|
|||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
fileSystem.getConf(),
|
||||
SequenceFile.Writer.file(new Path(outputPath)),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class),
|
||||
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer
|
||||
.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
plugin
|
||||
.collect(api, report)
|
||||
.forEach(
|
||||
content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (Throwable e) {
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
.collect(this.api, this.report)
|
||||
.forEach(content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (final Throwable e) {
|
||||
this.report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e);
|
||||
} finally {
|
||||
shutdown();
|
||||
report.ongoing(counter.longValue(), counter.longValue());
|
||||
this.report.ongoing(counter.longValue(), counter.longValue());
|
||||
}
|
||||
}
|
||||
|
||||
private void scheduleReport(AtomicInteger counter) {
|
||||
private void scheduleReport(final AtomicInteger counter) {
|
||||
schedule(new ReporterCallback() {
|
||||
|
||||
@Override
|
||||
public Long getCurrent() {
|
||||
return counter.longValue();
|
||||
|
@ -112,33 +112,33 @@ public class CollectorWorker extends ReportingJob {
|
|||
|
||||
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
||||
|
||||
switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
|
||||
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) {
|
||||
case oai:
|
||||
return new OaiCollectorPlugin(clientParams);
|
||||
return new OaiCollectorPlugin(this.clientParams);
|
||||
case rest_json2xml:
|
||||
return new RestCollectorPlugin(clientParams);
|
||||
return new RestCollectorPlugin(this.clientParams);
|
||||
case file:
|
||||
return new FileCollectorPlugin(fileSystem);
|
||||
return new FileCollectorPlugin(this.fileSystem);
|
||||
case fileGzip:
|
||||
return new FileGZipCollectorPlugin(fileSystem);
|
||||
return new FileGZipCollectorPlugin(this.fileSystem);
|
||||
case baseDump:
|
||||
return new BaseCollectorPlugin(this.fileSystem);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
.ofNullable(api.getParams().get("other_plugin_type"))
|
||||
.ofNullable(this.api.getParams().get("other_plugin_type"))
|
||||
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
||||
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
|
||||
|
||||
switch (plugin) {
|
||||
case mdstore_mongodb_dump:
|
||||
return new MongoDbDumpCollectorPlugin(fileSystem);
|
||||
return new MongoDbDumpCollectorPlugin(this.fileSystem);
|
||||
case mdstore_mongodb:
|
||||
return new MDStoreCollectorPlugin();
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||
}
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
|
||||
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,379 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.base;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.col;
|
||||
import static org.apache.spark.sql.functions.count;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.ObjectUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.LongWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.DeflateCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class BaseAnalyzerJob {
|
||||
|
||||
private static final String BASE_DUMP = "BASE_DUMP";
|
||||
private static final Logger log = LoggerFactory.getLogger(BaseAnalyzerJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
BaseAnalyzerJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String dataPath = parser.get("dataPath");
|
||||
log.info("dataPath {}: ", dataPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final String opendoarPath = parser.get("opendoarPath");
|
||||
log.info("opendoarPath {}: ", opendoarPath);
|
||||
|
||||
final String typesReportPath = parser.get("typesReportPath");
|
||||
log.info("typesReportPath {}: ", typesReportPath);
|
||||
|
||||
final int fromStep = Integer.parseInt(parser.get("fromStep"));
|
||||
log.info("fromStep {}: ", fromStep);
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
log.info("postgresUrl {}: ", dbUrl);
|
||||
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
log.info("postgresUser {}: ", dbUser);
|
||||
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
log.info("postgresPassword {}: ", dbPassword);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
if (fromStep <= 0) {
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 0: LoadRecords\n**************************************");
|
||||
loadRecords(inputPath, dataPath);
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 0: DONE\n**************************************");
|
||||
}
|
||||
|
||||
if (fromStep <= 1) {
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 1: Base Report\n**************************************");
|
||||
generateReport(spark, dataPath, outputPath);
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 1: DONE\n**************************************");
|
||||
}
|
||||
|
||||
if (fromStep <= 2) {
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 2: OpenDOAR Report\n**************************************");
|
||||
generateOpenDoarReport(spark, outputPath, opendoarPath, loadOpenDoarStats(dbUrl, dbUser, dbPassword));
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 2: DONE\n**************************************");
|
||||
}
|
||||
|
||||
if (fromStep <= 3) {
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 3: Type Vocabulary Report\n**************************************");
|
||||
generateVocTypeReport(spark, outputPath, typesReportPath);
|
||||
log
|
||||
.info(
|
||||
"\n**************************************\n* EXECUTING STEP 3: DONE\n**************************************");
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void generateVocTypeReport(final SparkSession spark,
|
||||
final String reportPath,
|
||||
final String typesReportPath) {
|
||||
spark
|
||||
.read()
|
||||
.parquet(reportPath)
|
||||
.as(Encoders.bean(BaseRecordInfo.class))
|
||||
.flatMap(rec -> {
|
||||
final List<Tuple2<String, String>> list = new ArrayList<>();
|
||||
for (final String t1 : rec.getTypes()) {
|
||||
if (t1.startsWith("TYPE_NORM:")) {
|
||||
for (final String t2 : rec.getTypes()) {
|
||||
if (t2.startsWith("TYPE:")) {
|
||||
list
|
||||
.add(
|
||||
new Tuple2<>(StringUtils.substringAfter(t1, "TYPE_NORM:").trim(),
|
||||
StringUtils.substringAfter(t2, "TYPE:").trim()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return list.iterator();
|
||||
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
|
||||
.distinct()
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(typesReportPath);
|
||||
|
||||
}
|
||||
|
||||
private static void generateOpenDoarReport(final SparkSession spark,
|
||||
final String reportPath,
|
||||
final String opendoarPath,
|
||||
final List<OpenDoarRepoStatus> repos) {
|
||||
|
||||
final Dataset<OpenDoarRepoStatus> fromDB = spark.createDataset(repos, Encoders.bean(OpenDoarRepoStatus.class));
|
||||
|
||||
final Dataset<OpenDoarRepoStatus> fromBASE = spark
|
||||
.read()
|
||||
.parquet(reportPath)
|
||||
.selectExpr("explode(collections) as collection")
|
||||
.where("isnotnull(collection.opendoarId) and character_length(collection.opendoarId)>0")
|
||||
.selectExpr("concat('opendoar____::',collection.opendoarId) as id")
|
||||
.groupBy(col("id"))
|
||||
.agg(count(col("id")))
|
||||
.map(row -> {
|
||||
final OpenDoarRepoStatus repo = new OpenDoarRepoStatus();
|
||||
repo.setId(row.getString(0));
|
||||
repo.getAggregations().put(BASE_DUMP, row.getLong(1));
|
||||
repo.setBaseCount(row.getLong(1));
|
||||
repo.setOpenaireCount(0);
|
||||
repo.setHighCompliance(false);
|
||||
return repo;
|
||||
}, Encoders.bean(OpenDoarRepoStatus.class));
|
||||
|
||||
fromDB
|
||||
.joinWith(fromBASE, fromDB.col("id").equalTo(fromBASE.col("id")), "full_outer")
|
||||
.map(t -> merge(t._1, t._2), Encoders.bean(OpenDoarRepoStatus.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(opendoarPath);
|
||||
}
|
||||
|
||||
private static OpenDoarRepoStatus merge(final OpenDoarRepoStatus r1, final OpenDoarRepoStatus r2) {
|
||||
if (r1 == null) {
|
||||
return r2;
|
||||
}
|
||||
if (r2 == null) {
|
||||
return r1;
|
||||
}
|
||||
|
||||
final OpenDoarRepoStatus r = new OpenDoarRepoStatus();
|
||||
r.setId(ObjectUtils.firstNonNull(r1.getId(), r2.getId()));
|
||||
r.setJurisdiction(ObjectUtils.firstNonNull(r1.getJurisdiction(), r2.getJurisdiction()));
|
||||
r.getAggregations().putAll(r1.getAggregations());
|
||||
r.getAggregations().putAll(r2.getAggregations());
|
||||
r.setHighCompliance(r1.isHighCompliance() || r2.isHighCompliance());
|
||||
r.setBaseCount(Math.max(r1.getBaseCount(), r2.getBaseCount()));
|
||||
r.setOpenaireCount(Math.max(r1.getOpenaireCount(), r2.getOpenaireCount()));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
private static List<OpenDoarRepoStatus> loadOpenDoarStats(final String dbUrl,
|
||||
final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
final List<OpenDoarRepoStatus> repos = new ArrayList<>();
|
||||
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
|
||||
final String sql = IOUtils
|
||||
.toString(
|
||||
BaseAnalyzerJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-aggregation-status.sql"));
|
||||
|
||||
dbClient.processResults(sql, row -> {
|
||||
try {
|
||||
final OpenDoarRepoStatus repo = new OpenDoarRepoStatus();
|
||||
repo.setId(row.getString("id"));
|
||||
repo.setJurisdiction(row.getString("jurisdiction"));
|
||||
repo.setBaseCount(0);
|
||||
repo.setHighCompliance(false);
|
||||
|
||||
long sum = 0;
|
||||
for (final String s : (String[]) row.getArray("aggregations").getArray()) {
|
||||
final String api = StringUtils.substringBefore(s, "@@@");
|
||||
final long count = NumberUtils.toLong(StringUtils.substringAfter(s, "@@@"), 0);
|
||||
sum += count;
|
||||
repo.getAggregations().put(api, count);
|
||||
// This should recognize the HIGH Compliances: openaire*X.Y*
|
||||
if (s.contains("compliance: openaire")) {
|
||||
repo.setHighCompliance(true);
|
||||
}
|
||||
}
|
||||
repo.setOpenaireCount(sum);
|
||||
|
||||
repos.add(repo);
|
||||
log.info("# FOUND OPENDOAR (DB): " + repo.getId());
|
||||
} catch (final SQLException e) {
|
||||
log.error("Error in SQL", e);
|
||||
throw new RuntimeException("Error in SQL", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
return repos;
|
||||
}
|
||||
|
||||
private static void loadRecords(final String inputPath, final String outputPath) throws Exception {
|
||||
try (final FileSystem fs = FileSystem.get(new Configuration());
|
||||
final AggregatorReport report = new AggregatorReport()) {
|
||||
|
||||
final AtomicLong recordsCounter = new AtomicLong(0);
|
||||
|
||||
final LongWritable key = new LongWritable();
|
||||
final Text value = new Text();
|
||||
|
||||
try (final SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(LongWritable.class),
|
||||
SequenceFile.Writer
|
||||
.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
|
||||
final BaseCollectorIterator iteraror = new BaseCollectorIterator(fs, new Path(inputPath), report);
|
||||
|
||||
while (iteraror.hasNext()) {
|
||||
final String record = iteraror.next();
|
||||
|
||||
final long i = recordsCounter.incrementAndGet();
|
||||
if ((i % 10000) == 0) {
|
||||
log.info("# Loaded records: " + i);
|
||||
}
|
||||
|
||||
key.set(i);
|
||||
value.set(record);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (final Throwable e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("# COMPLETED - Loaded records: " + recordsCounter.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void generateReport(final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String targetPath) throws Exception {
|
||||
|
||||
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
|
||||
.fromSparkContext(spark.sparkContext())
|
||||
.sequenceFile(inputPath, LongWritable.class, Text.class)
|
||||
.map(s -> s._2.toString())
|
||||
.map(BaseAnalyzerJob::extractInfo);
|
||||
|
||||
spark
|
||||
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(targetPath);
|
||||
}
|
||||
|
||||
protected static BaseRecordInfo extractInfo(final String s) {
|
||||
try {
|
||||
final Document record = DocumentHelper.parseText(s);
|
||||
|
||||
final BaseRecordInfo info = new BaseRecordInfo();
|
||||
|
||||
final Set<String> paths = new LinkedHashSet<>();
|
||||
final Set<String> types = new LinkedHashSet<>();
|
||||
final List<BaseCollectionInfo> colls = new ArrayList<>();
|
||||
|
||||
for (final Object o : record.selectNodes("//*|//@*")) {
|
||||
paths.add(((Node) o).getPath());
|
||||
|
||||
if (o instanceof Element) {
|
||||
final Element n = (Element) o;
|
||||
|
||||
final String nodeName = n.getName();
|
||||
|
||||
if ("collection".equals(nodeName)) {
|
||||
final String collName = n.getText().trim();
|
||||
|
||||
if (StringUtils.isNotBlank(collName)) {
|
||||
final BaseCollectionInfo coll = new BaseCollectionInfo();
|
||||
coll.setId(collName);
|
||||
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
|
||||
coll.setRorId(n.valueOf("@ror_id").trim());
|
||||
colls.add(coll);
|
||||
}
|
||||
} else if ("type".equals(nodeName)) {
|
||||
types.add("TYPE: " + n.getText().trim());
|
||||
} else if ("typenorm".equals(nodeName)) {
|
||||
types.add("TYPE_NORM: " + n.getText().trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
|
||||
info.getTypes().addAll(types);
|
||||
info.getPaths().addAll(paths);
|
||||
info.setCollections(colls);
|
||||
|
||||
return info;
|
||||
} catch (final DocumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -45,22 +45,15 @@ public class BaseCollectorPlugin implements CollectorPlugin {
|
|||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
||||
// the path of the dump file on HDFS
|
||||
// http://oai.base-search.net/initial_load/base_oaipmh_dump-current.tar
|
||||
// it could be downloaded from iis-cdh5-test-gw.ocean.icm.edu.pl and then copied on HDFS
|
||||
// get path to file
|
||||
final Path filePath = Optional
|
||||
.ofNullable(api.getBaseUrl())
|
||||
.map(Path::new)
|
||||
.orElseThrow(() -> new CollectorException("missing baseUrl"));
|
||||
.ofNullable(api.getBaseUrl())
|
||||
.map(Path::new)
|
||||
.orElseThrow(() -> new CollectorException("missing baseUrl"));
|
||||
|
||||
// get the parameters for the connection to the OpenAIRE database.
|
||||
// the database is used to obtain the list of the datasources that the plugin will collect
|
||||
final String dbUrl = api.getParams().get("dbUrl");
|
||||
final String dbUser = api.getParams().get("dbUser");
|
||||
final String dbPassword = api.getParams().get("dbPassword");
|
||||
|
||||
// the types(comma separated, empty value for all) that the plugin will collect,
|
||||
// the types should be expressed in the format of the normalized types of BASE (for example 1,121,...)
|
||||
final String acceptedNormTypesString = api.getParams().get("acceptedNormTypes");
|
||||
|
||||
log.info("baseUrl: {}", filePath);
|
||||
|
@ -70,9 +63,7 @@ public class BaseCollectorPlugin implements CollectorPlugin {
|
|||
log.info("acceptedNormTypes: {}", acceptedNormTypesString);
|
||||
|
||||
try {
|
||||
if (!this.fs.exists(filePath)) {
|
||||
throw new CollectorException("path does not exist: " + filePath);
|
||||
}
|
||||
if (!this.fs.exists(filePath)) { throw new CollectorException("path does not exist: " + filePath); }
|
||||
} catch (final Throwable e) {
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
|
@ -91,19 +82,19 @@ public class BaseCollectorPlugin implements CollectorPlugin {
|
|||
final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
|
||||
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
|
||||
return StreamSupport
|
||||
.stream(spliterator, false)
|
||||
.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
|
||||
.stream(spliterator, false)
|
||||
.filter(doc -> filterXml(doc, acceptedOpendoarIds, acceptedNormTypes));
|
||||
}
|
||||
|
||||
private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
|
||||
throws CollectorException {
|
||||
throws CollectorException {
|
||||
final Set<String> accepted = new HashSet<>();
|
||||
|
||||
try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
|
||||
final String sql = IOUtils
|
||||
.toString(
|
||||
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
|
||||
.toString(BaseAnalyzerJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
|
||||
|
||||
dbClient.processResults(sql, row -> {
|
||||
try {
|
||||
|
@ -127,26 +118,20 @@ public class BaseCollectorPlugin implements CollectorPlugin {
|
|||
}
|
||||
|
||||
protected static boolean filterXml(final String xml,
|
||||
final Set<String> acceptedOpendoarIds,
|
||||
final Set<String> acceptedNormTypes) {
|
||||
final Set<String> acceptedOpendoarIds,
|
||||
final Set<String> acceptedNormTypes) {
|
||||
try {
|
||||
|
||||
final Document doc = DocumentHelper.parseText(xml);
|
||||
|
||||
final String id = doc.valueOf("//*[local-name()='collection']/@opendoar_id").trim();
|
||||
|
||||
if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) {
|
||||
return false;
|
||||
}
|
||||
if (StringUtils.isBlank(id) || !acceptedOpendoarIds.contains("opendoar____::" + id)) { return false; }
|
||||
|
||||
if (acceptedNormTypes.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
if (acceptedNormTypes.isEmpty()) { return true; }
|
||||
|
||||
for (final Object s : doc.selectNodes("//*[local-name()='typenorm']")) {
|
||||
if (acceptedNormTypes.contains(((Node) s).getText().trim())) {
|
||||
return true;
|
||||
}
|
||||
if (acceptedNormTypes.contains(((Node) s).getText().trim())) { return true; }
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.base;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Serializable bean holding the status of a single repository: its id and jurisdiction, a
 * high-compliance flag, two record counters and a map of per-aggregation record counts.
 */
public class OpenDoarRepoStatus implements Serializable {

	private static final long serialVersionUID = 4832658700366871160L;

	/** Identifier of the repository. */
	private String id;

	/** Jurisdiction of the repository. */
	private String jurisdiction;

	/** High-compliance flag; {@code false} unless set. */
	private boolean highCompliance;

	/** Value of the base counter; {@code 0} unless set. */
	private long baseCount;

	/** Value of the OpenAIRE counter; {@code 0} unless set. */
	private long openaireCount;

	/** Record counts by aggregation label; starts as an empty, mutable map. */
	private Map<String, Long> aggregations = new HashMap<>();

	public String getId() {
		return id;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public String getJurisdiction() {
		return jurisdiction;
	}

	public void setJurisdiction(final String jurisdiction) {
		this.jurisdiction = jurisdiction;
	}

	public boolean isHighCompliance() {
		return highCompliance;
	}

	public void setHighCompliance(final boolean highCompliance) {
		this.highCompliance = highCompliance;
	}

	public long getBaseCount() {
		return baseCount;
	}

	public void setBaseCount(final long baseCount) {
		this.baseCount = baseCount;
	}

	public long getOpenaireCount() {
		return openaireCount;
	}

	public void setOpenaireCount(final long openaireCount) {
		this.openaireCount = openaireCount;
	}

	/**
	 * @return the internal (live) map of the aggregations, not a copy
	 */
	public Map<String, Long> getAggregations() {
		return aggregations;
	}

	public void setAggregations(final Map<String, Long> aggregations) {
		this.aggregations = aggregations;
	}

}
|
|
@ -0,0 +1,56 @@
|
|||
[
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path of the BASE dump",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "dataPath",
|
||||
"paramDescription": "the path of the loaded records",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the generated report",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "od",
|
||||
"paramLongName": "opendoarPath",
|
||||
"paramDescription": "the path of the generated OpenDOAR report",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "typesReportPath",
|
||||
"paramDescription": "the path of the generated types report",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "fromStep",
|
||||
"paramDescription": "the initial step (numeric, 0 for ALL STEPS)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>10G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,79 @@
|
|||
<workflow-app name="Analyze_BASE_Records" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>baseInputPath</name>
|
||||
<description>the path of the BASE dump</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseDataPath</name>
|
||||
<description>the path where to store BASE records</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseReportsPath</name>
|
||||
<description>path where to store the reports</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseOpenDoarReportsPath</name>
|
||||
<description>path where to store the OpenDOAR reports</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseTypesReportPath</name>
|
||||
<description>path of the generated types report</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>baseFromStep</name>
|
||||
<description>the initial step (numeric, 0 for ALL STEPS)</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="analyzeBaseRecords"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="analyzeBaseRecords">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>AnalyzeBaseRecords</name>
|
||||
<class>eu.dnetlib.dhp.collection.plugin.base.BaseAnalyzerJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${baseInputPath}</arg>
|
||||
<arg>--dataPath</arg><arg>${baseDataPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${baseReportsPath}</arg>
|
||||
<arg>--opendoarPath</arg><arg>${baseOpenDoarReportsPath}</arg>
|
||||
<arg>--typesReportPath</arg><arg>${baseTypesReportPath}</arg>
|
||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
<arg>--fromStep</arg><arg>${baseFromStep}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -99,16 +99,4 @@ INSERT INTO dsm_apiparams(
|
|||
'***'
|
||||
);
|
||||
|
||||
INSERT INTO dsm_apiparams(
|
||||
_dnet_resource_identifier_,
|
||||
api,
|
||||
param,
|
||||
value
|
||||
) VALUES (
|
||||
'api_________::openaire____::base_search::dump@@acceptedNormTypes',
|
||||
'api_________::openaire____::base_search::dump',
|
||||
'acceptedNormTypes',
|
||||
'1,11,111,121,13,14,15,18,181,182,183,1A,6,7'
|
||||
);
|
||||
|
||||
COMMIT;
|
|
@ -2,8 +2,6 @@ select s.id as id
|
|||
from dsm_services s
|
||||
where collectedfrom = 'openaire____::opendoar'
|
||||
and jurisdiction = 'Institutional'
|
||||
and s.id in (
|
||||
select service from dsm_api where coalesce(compatibility_override, compatibility) = 'driver' or coalesce(compatibility_override, compatibility) = 'UNKNOWN'
|
||||
) and s.id not in (
|
||||
select service from dsm_api where coalesce(compatibility_override, compatibility) like '%openaire%'
|
||||
);
|
||||
and s.id not in (
|
||||
select service from dsm_api where coalesce(compatibility_override, compatibility) like '%openaire%' or last_collection_total > 0
|
||||
);
|
|
@ -1048,10 +1048,5 @@
|
|||
"openaire_id": "re3data_____::r3d100010399",
|
||||
"datacite_name": "ZEW Forschungsdatenzentrum",
|
||||
"official_name": "ZEW Forschungsdatenzentrum"
|
||||
},
|
||||
"HBP.NEUROINF": {
|
||||
"openaire_id": "fairsharing_::2975",
|
||||
"datacite_name": "EBRAINS",
|
||||
"official_name": "EBRAINS"
|
||||
}
|
||||
}
|
|
@ -6,7 +6,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
@ -22,7 +21,6 @@ import org.apache.spark.sql.Encoders;
|
|||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.Attribute;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
|
@ -119,7 +117,7 @@ public class BaseCollectorIteratorTest {
|
|||
final List<BaseRecordInfo> ls = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ls.add(extractInfo(xml));
|
||||
ls.add(BaseAnalyzerJob.extractInfo(xml));
|
||||
}
|
||||
|
||||
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
|
||||
|
@ -133,52 +131,4 @@ public class BaseCollectorIteratorTest {
|
|||
|
||||
df.show(false);
|
||||
}
|
||||
|
||||
private BaseRecordInfo extractInfo(final String s) {
|
||||
try {
|
||||
final Document record = DocumentHelper.parseText(s);
|
||||
|
||||
final BaseRecordInfo info = new BaseRecordInfo();
|
||||
|
||||
final Set<String> paths = new LinkedHashSet<>();
|
||||
final Set<String> types = new LinkedHashSet<>();
|
||||
final List<BaseCollectionInfo> colls = new ArrayList<>();
|
||||
|
||||
for (final Object o : record.selectNodes("//*|//@*")) {
|
||||
paths.add(((Node) o).getPath());
|
||||
|
||||
if (o instanceof Element) {
|
||||
final Element n = (Element) o;
|
||||
|
||||
final String nodeName = n.getName();
|
||||
|
||||
if ("collection".equals(nodeName)) {
|
||||
final String collName = n.getText().trim();
|
||||
|
||||
if (StringUtils.isNotBlank(collName)) {
|
||||
final BaseCollectionInfo coll = new BaseCollectionInfo();
|
||||
coll.setId(collName);
|
||||
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
|
||||
coll.setRorId(n.valueOf("@ror_id").trim());
|
||||
colls.add(coll);
|
||||
}
|
||||
} else if ("type".equals(nodeName)) {
|
||||
types.add("TYPE: " + n.getText().trim());
|
||||
} else if ("typenorm".equals(nodeName)) {
|
||||
types.add("TYPE_NORM: " + n.getText().trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info.setId(record.valueOf("//*[local-name() = 'header']/*[local-name() = 'identifier']").trim());
|
||||
info.getTypes().addAll(types);
|
||||
info.getPaths().addAll(paths);
|
||||
info.setCollections(colls);
|
||||
|
||||
return info;
|
||||
} catch (final DocumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.base;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.base;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -66,9 +65,9 @@ public class BaseTransfomationTest extends AbstractVocabularyTest {
|
|||
|
||||
private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
|
||||
final String xslt = new SAXReader()
|
||||
.read(this.getClass().getResourceAsStream(path))
|
||||
.selectSingleNode("//CODE/*")
|
||||
.asXML();
|
||||
.read(this.getClass().getResourceAsStream(path))
|
||||
.selectSingleNode("//CODE/*")
|
||||
.asXML();
|
||||
|
||||
final LongAccumulator la = new LongAccumulator();
|
||||
|
||||
|
|
|
@ -122,41 +122,22 @@ public class DedupRecordFactory {
|
|||
}
|
||||
|
||||
return Stream
|
||||
.concat(
|
||||
Stream
|
||||
.of(agg.getDedupId())
|
||||
.map(id -> createDedupOafEntity(id, agg.entity, dataInfo, ts)),
|
||||
agg.aliases
|
||||
.stream()
|
||||
.map(id -> createMergedDedupAliasOafEntity(id, agg.entity, dataInfo, ts)))
|
||||
.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
|
||||
.map(id -> {
|
||||
try {
|
||||
OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
|
||||
res.setId(id);
|
||||
res.setDataInfo(dataInfo);
|
||||
res.setLastupdatetimestamp(ts);
|
||||
return res;
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.iterator();
|
||||
}, beanEncoder);
|
||||
}
|
||||
|
||||
private static OafEntity createDedupOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
|
||||
try {
|
||||
OafEntity res = (OafEntity) BeanUtils.cloneBean(base);
|
||||
res.setId(id);
|
||||
res.setDataInfo(dataInfo);
|
||||
res.setLastupdatetimestamp(ts);
|
||||
return res;
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static OafEntity createMergedDedupAliasOafEntity(String id, OafEntity base, DataInfo dataInfo, long ts) {
|
||||
try {
|
||||
OafEntity res = createDedupOafEntity(id, base, dataInfo, ts);
|
||||
DataInfo ds = (DataInfo) BeanUtils.cloneBean(dataInfo);
|
||||
ds.setDeletedbyinference(true);
|
||||
res.setDataInfo(ds);
|
||||
return res;
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {
|
||||
|
||||
if (duplicate == null) {
|
||||
|
|
|
@ -15,12 +15,4 @@
|
|||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>pivotHistoryDatabase</name>
|
||||
<value>​</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -198,8 +198,6 @@
|
|||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
|
||||
</spark>
|
||||
<ok to="PrepareOrgRels"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -73,6 +73,12 @@
|
|||
"name": "Irish Nephrology Society",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011062",
|
||||
"uri": "http://dx.doi.org/10.13039/100011062",
|
||||
"name": "Asian Spinal Cord Network",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100011096",
|
||||
"uri": "http://dx.doi.org/10.13039/100011096",
|
||||
|
@ -217,6 +223,12 @@
|
|||
"name": "Global Brain Health Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015776",
|
||||
"uri": "http://dx.doi.org/10.13039/100015776",
|
||||
"name": "Health and Social Care Board",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "100015992",
|
||||
"uri": "http://dx.doi.org/10.13039/100015992",
|
||||
|
@ -391,6 +403,18 @@
|
|||
"name": "Irish Hospice Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001596",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001596",
|
||||
"name": "Irish Research Council for Science, Engineering and Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001597",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001597",
|
||||
"name": "Irish Research Council for the Humanities and Social Sciences",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100001598",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001598",
|
||||
|
@ -491,7 +515,7 @@
|
|||
"id": "501100002081",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002081",
|
||||
"name": "Irish Research Council",
|
||||
"synonym": ["501100001596", "501100001597"]
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100002736",
|
||||
|
|
|
@ -587,15 +587,7 @@ case object Crossref2Oaf {
|
|||
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
|
||||
"10.13039/501100013589" | "10.13039/501100000271" =>
|
||||
generateSimpleRelationFromAward(funder, "ukri________", a => a)
|
||||
//HFRI
|
||||
case "10.13039/501100013209" =>
|
||||
generateSimpleRelationFromAward(funder, "hfri________", a => a)
|
||||
val targetId = getProjectId("hfri________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
//ERASMUS+
|
||||
case "10.13039/501100010790" =>
|
||||
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
||||
|
||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||
|
||||
}
|
||||
|
|
|
@ -23,10 +23,15 @@ class CrossrefMappingTest {
|
|||
val mapper = new ObjectMapper()
|
||||
|
||||
@Test
|
||||
def testMissingAuthorParser():Unit = {
|
||||
val json: String = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/s41567-022-01757-y.json")).mkString
|
||||
def testMissingAuthorParser(): Unit = {
|
||||
val json: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/s41567-022-01757-y.json"))
|
||||
.mkString
|
||||
val result = Crossref2Oaf.convert(json)
|
||||
result.filter(o => o.isInstanceOf[Publication]).map(p=> p.asInstanceOf[Publication]).foreach(p =>assertTrue(p.getAuthor.size()>0))
|
||||
result
|
||||
.filter(o => o.isInstanceOf[Publication])
|
||||
.map(p => p.asInstanceOf[Publication])
|
||||
.foreach(p => assertTrue(p.getAuthor.size() > 0))
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -53,8 +53,6 @@ public class Constraints implements Serializable {
|
|||
|
||||
for (Constraint sc : constraint) {
|
||||
boolean verified = false;
|
||||
if(!param.containsKey(sc.getField()))
|
||||
return false;
|
||||
for (String value : param.get(sc.getField())) {
|
||||
if (sc.verifyCriteria(value.trim())) {
|
||||
verified = true;
|
||||
|
|
|
@ -317,7 +317,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
listKeyValues(
|
||||
createOpenaireId(10, rs.getString("collectedfromid"), true),
|
||||
rs.getString("collectedfromname")));
|
||||
p.setPid(prepareListOfStructProps(rs.getArray("pid"), info));
|
||||
p.setPid(new ArrayList<>());
|
||||
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
|
||||
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
|
|
|
@ -238,23 +238,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
(Element) doc
|
||||
.selectSingleNode(
|
||||
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
|
||||
.map(e -> {
|
||||
final String resourceTypeURI = Optional
|
||||
.ofNullable(e.attributeValue("uri"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(null);
|
||||
final String resourceTypeAnyURI = Optional
|
||||
.ofNullable(e.attributeValue("anyURI"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(null);
|
||||
final String resourceTypeTxt = Optional
|
||||
.ofNullable(e.getText())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(null);
|
||||
final String resourceTypeGeneral = Optional
|
||||
.ofNullable(e.attributeValue("resourceTypeGeneral"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.orElse(null);
|
||||
.map(element -> {
|
||||
final String resourceTypeURI = element.attributeValue("uri");
|
||||
final String resourceTypeAnyURI = element.attributeValue("anyURI");
|
||||
final String resourceTypeTxt = element.getText();
|
||||
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
|
||||
|
||||
return ObjectUtils
|
||||
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
|
||||
|
|
|
@ -33,7 +33,7 @@ SELECT
|
|||
dc.officialname AS collectedfromname,
|
||||
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
|
||||
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
|
||||
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ SELECT
|
|||
dc.officialname AS collectedfromname,
|
||||
p.contracttypeclass || '@@@' || p.contracttypescheme AS contracttype,
|
||||
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
|
||||
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
FROM projects p
|
||||
|
|
|
@ -93,8 +93,8 @@ object CopyHdfsOafSparkApplication {
|
|||
hasSource != null && hasTarget != null
|
||||
} else {
|
||||
val hasId = (json \ "id").extractOrElse[String](null)
|
||||
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String]("")
|
||||
hasId != null && oafType.startsWith(resultType)
|
||||
val resultType = (json \ "resulttype" \ "classid").extractOrElse[String](null)
|
||||
hasId != null && oafType.equalsIgnoreCase(resultType)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -59,19 +59,7 @@ public class CopyHdfsOafSparkApplicationTest {
|
|||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/raw/publication_2_unknownProperty.json")),
|
||||
"publication"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void isOafType_Datacite_ORP() throws IOException {
|
||||
assertTrue(
|
||||
CopyHdfsOafSparkApplication
|
||||
.isOafType(
|
||||
IOUtils
|
||||
.toString(
|
||||
getClass()
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/raw/datacite_orp.json")),
|
||||
"otherresearchproduct"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1171,34 +1171,6 @@ class MappersTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
void test_Zenodo2() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo2.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
Publication p = cleanup((Publication) list.get(0), vocs);
|
||||
|
||||
assertNotNull(p.getInstance());
|
||||
assertEquals(1, p.getInstance().size());
|
||||
|
||||
final Instance instance = p.getInstance().get(0);
|
||||
|
||||
assertNotNull(instance.getInstanceTypeMapping());
|
||||
assertEquals(1, instance.getInstanceTypeMapping().size());
|
||||
|
||||
Optional<InstanceTypeMapping> coarType = instance
|
||||
.getInstanceTypeMapping()
|
||||
.stream()
|
||||
.filter(itm -> ModelConstants.OPENAIRE_COAR_RESOURCE_TYPES_3_1.equals(itm.getVocabularyName()))
|
||||
.findFirst();
|
||||
|
||||
assertTrue(coarType.isPresent());
|
||||
assertNotNull(coarType.get().getOriginalType());
|
||||
assertNull(coarType.get().getTypeCode());
|
||||
assertNull(coarType.get().getTypeLabel());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testROHub2() throws IOException {
|
||||
final String xml = IOUtils
|
||||
|
@ -1257,7 +1229,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testD4ScienceTraining() throws IOException {
|
||||
public void testD4ScienceTraining() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-1-training.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -1268,7 +1240,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testD4ScienceDataset() throws IOException {
|
||||
public void testD4ScienceDataset() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("d4science-2-dataset.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,59 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-3"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri">
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<identifier>oai:zenodo.org:1596086</identifier>
|
||||
<datestamp>2020-01-20T13:50:28Z</datestamp>
|
||||
<setSpec>openaire</setSpec>
|
||||
<dr:dateOfTransformation>2024-02-08T11:03:10.994Z</dr:dateOfTransformation>
|
||||
<dri:objIdentifier>od______2659::036d5555a6688ed00c8d0da97bdece3b</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2024-02-08T11:03:10.994Z</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>2024-02-08T11:03:10.994Z</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<resource xmlns="http://datacite.org/schema/kernel-4"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
|
||||
<identifier identifierType="URL">https://zenodo.org/record/1596086</identifier>
|
||||
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3"/>
|
||||
<creators>
|
||||
<creator>
|
||||
<creatorName>Bonney, T. G.</creatorName>
|
||||
<givenName>T. G.</givenName>
|
||||
<familyName>Bonney</familyName>
|
||||
</creator>
|
||||
</creators>
|
||||
<titles>
|
||||
<title>Ice Blocks on a Moraine</title>
|
||||
</titles>
|
||||
<publisher>Zenodo</publisher>
|
||||
<publicationYear>1889</publicationYear>
|
||||
<dates>
|
||||
<date dateType="Issued">1889-08-22</date>
|
||||
</dates>
|
||||
<resourceType resourceTypeGeneral="JournalArticle"/>
|
||||
<relatedIdentifiers>
|
||||
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsIdenticalTo"
|
||||
>10.1038/040391a0</relatedIdentifier>
|
||||
</relatedIdentifiers>
|
||||
<rightsList>
|
||||
<rights rightsURI="https://creativecommons.org/publicdomain/zero/1.0/legalcode"
|
||||
>Creative Commons Zero v1.0 Universal</rights>
|
||||
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
|
||||
</rightsList>
|
||||
<descriptions>
|
||||
<description descriptionType="Abstract">n/a</description>
|
||||
</descriptions>
|
||||
</resource>
|
||||
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>1889-08-22</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>http://creativecommons.org/publicdomain/zero/1.0/legalcode</oaf:license>
|
||||
<oaf:language/>
|
||||
<oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
|
||||
<oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
|
||||
</metadata>
|
||||
</record>
|
|
@ -185,7 +185,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -213,7 +212,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -241,7 +239,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -269,7 +266,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -297,7 +293,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -325,7 +320,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -353,7 +347,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -393,7 +386,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -422,7 +414,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -451,7 +442,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -480,7 +470,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -509,7 +498,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -538,7 +526,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -567,7 +554,6 @@
|
|||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -1,32 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<version>2.1.11</version>
|
||||
<configuration>
|
||||
<failOnNoGitDirectory>false</failOnNoGitDirectory>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -1,30 +0,0 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>${jobTracker}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>${nameNode}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,223 +0,0 @@
|
|||
# Per-user Python egg cache for impala-shell, plus a symlink under /tmp that points
# inside that cache directory.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if [ ! -L $link_folder ]; then
  # Whatever is at that path is not a symlink: remove it and (re)create the link.
  rm -Rf "$link_folder"
  ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

# The second script argument is the Hadoop user to act as.
export HADOOP_USER_NAME=$2


# Set the active HDFS node of OCEAN and IMPALA cluster.
OCEAN_HDFS_NODE='hdfs://nameservice1'
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"

# Probe the two Impala name nodes (mn1 first, then mn2), up to 3 rounds with a
# 1-second pause between rounds. COUNTER ends up as the number of failed rounds.
IMPALA_HDFS_NODE=''
for (( COUNTER = 0; COUNTER < 3; COUNTER++ )); do
  for impala_name_node in impala-cluster-mn1.openaire.eu impala-cluster-mn2.openaire.eu; do
    if hdfs dfs -test -e hdfs://${impala_name_node}/tmp >/dev/null 2>&1; then
      IMPALA_HDFS_NODE="hdfs://${impala_name_node}:8020"
      break 2
    fi
  done
  sleep 1
done

if [ -z "$IMPALA_HDFS_NODE" ]; then
  echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
  exit 1
fi
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"

IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"


# sed expression rewriting the Ocean HDFS prefix into the Impala one.
# It uses "|" as delimiter because "/" would clash with the URIs, so pass it via "sed -e".
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g"

# sed expressions that back-quote column names which are reserved words.
# Three variants per word: surrounded by whitespace, ".word," and ".word" followed by whitespace.
# A word such as "date" may also be part of a longer field name ("datestamp",
# "date_aggregated"), hence the narrow patterns.
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g'
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'

HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'

LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||
|
||||
|
||||
# Copies the Hive database named by $1 from the Ocean cluster to the Impala cluster.
#
# Steps: drop the old database on Impala, transfer the warehouse files with
# "hadoop distcp", create each table from one of its parquet files, replay the
# "create view" statements (retrying views that depend on not-yet-created views),
# compute stats for the tables and compare the entity lists of both clusters.
#
# Reads the globals HADOOP_USER_NAME, IMPALA_HOSTNAME, IMPALA_CONFIG_FILE,
# OCEAN_HDFS_NODE, IMPALA_HDFS_DB_BASE_PATH and the *_SED_ARG* expressions.
# Uses ./error.log as a scratch file.
#
# Return codes:
#   1 - dropping the old database reported WARN/ERROR/FAILED
#   2 - "hadoop distcp" failed
#   3 - a view-creation pass made no progress (would loop forever)
#   4 - the entity lists of the two clusters differ
function copydb() {
  db=$1
  local distcp_exit_status sorted_impala_entities sorted_ocean_entities
  echo -e "\nStart processing db: '${db}'..\n"

  # Delete the old DB from Impala cluster (if exists).
  # impala-shell prints all logs in stderr, so we capture them in a file in order to "grep" them later.
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log
  log_errors=$(grep -E "WARN|ERROR|FAILED" error.log)
  if [ -n "$log_errors" ]; then
    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
    rm -f error.log
    return 1
  fi

  # Make Impala aware of the deletion of the old DB immediately.
  sleep 1
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"

  echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
  # distcp runs with 70 maps ("-m 70"), "-bandwidth 150", 6144 MB of memory per map
  # and a 1 MB copy buffer.
  # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop.
  # The "ug" args cannot be used as we get a "User does not belong to hive" error.
  # The "p" argument cannot be used, as it blocks the files from being used, giving a
  # "sticky bit"-error, even after applying chmod and chown on the files.
  hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
    -numListstatusThreads 40 \
    -copybuffersize 1048576 \
    -strategy dynamic \
    -pb \
    ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

  # Save the exit status right away: any later command (including the "[" test) overwrites "$?".
  distcp_exit_status=$?
  if [ ${distcp_exit_status} -eq 0 ]; then
    echo -e "\nSuccessfully copied the files of '${db}'.\n"
  else
    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
    rm -f error.log
    return 2
  fi

  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db

  echo -e "\nCreating schema for db: '${db}'\n"

  # create the new database (with the same name)
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"

  # Make Impala aware of the creation of the new DB immediately.
  sleep 1
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
  sleep 1
  # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
  # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.

  all_create_view_statements=()

  # Get the tables and views, without any potential "WARN" logs.
  entities_on_ocean=$(hive -e "show tables in ${db};" | sed 's/WARN:.*//g')
  for i in ${entities_on_ocean}; do # Deliberately un-quoted: word-splitting yields one entity name per iteration.
    # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
    # It needs to happen in two stages, otherwise the "grep" is not able to match the multi-line statement.
    create_entity_statement=$(hive -e "show create table ${db}.${i};")

    create_view_statement_test=$(echo -e "$create_entity_statement" | grep 'CREATE VIEW')
    if [ -n "$create_view_statement_test" ]; then
      echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
      # The back-quoted form is kept on purpose: the backslash escapes below rely on back-quote substitution rules.
      create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
        | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
        | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
        | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
      all_create_view_statements+=("$create_view_statement")
    else
      echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
      CURRENT_PRQ_FILE=$(hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1)
      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is no parquet-file inside.
        echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
      else
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
        log_errors=$(grep -E "WARN|ERROR|FAILED" error.log)
        if [ -n "$log_errors" ]; then
          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
        fi
      fi
    fi
  done

  echo -e "\nAll tables have been created, going to create the views..\n"

  # Time to loop through the views and create them.
  # At this point all table-schemas should have been created.

  # "${#array[@]}" is the element count; "${#array}" would be the length of the first element.
  previous_num_of_views_to_retry=${#all_create_view_statements[@]}
  if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
    echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
    # Make Impala aware of the new tables, so it knows them when creating the views.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1
  else
    echo -e "\nDB '${db}' does not contain any views.\n"
  fi

  level_counter=0
  while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
    ((level_counter++))
    # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
    # In this case, we should retry creating this particular view again.
    should_retry_create_view_statements=()

    for create_view_statement in "${all_create_view_statements[@]}"; do # Double quotes here, as the elements are phrases, not single words.
      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log
      specific_errors=$(grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:" error.log)
      if [ -n "$specific_errors" ]; then
        echo -e "\nspecific_errors: ${specific_errors}\n"
        echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
        should_retry_create_view_statements+=("$create_view_statement")
      else
        sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
      fi
    done

    new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
      rm -f error.log
      return 3
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
      echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
      previous_num_of_views_to_retry=$new_num_of_views_to_retry
    else
      echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
    fi
    # Needed in any case: either continue with the views still to be retried, or stop at 0 remaining views.
    all_create_view_statements=("${should_retry_create_view_statements[@]}")
  done

  sleep 1
  impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
  sleep 1

  echo -e "\nComputing stats for tables..\n"
  entities_on_impala=$(impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}")
  for i in ${entities_on_impala}; do # Deliberately un-quoted: word-splitting yields one entity name per iteration.
    # Taking the create table statement from the Ocean cluster, just to check if it's a view, as the output is easier than using impala-shell from Impala cluster.
    create_view_statement=$(hive -e "show create table ${db}.${i};" | grep "CREATE VIEW") # This grep works here, as we do not want to match multiple lines.
    if [ -z "$create_view_statement" ]; then # If it's a table, then compute its stats.
      impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
    fi
  done

  # Compare the entity names regardless of their order and of blank lines / extra
  # whitespace left in the command outputs (e.g. by the "WARN" removal above).
  sorted_impala_entities=$(printf '%s\n' ${entities_on_impala} | sort)
  sorted_ocean_entities=$(printf '%s\n' ${entities_on_ocean} | sort)
  if [ "${sorted_impala_entities}" == "${sorted_ocean_entities}" ]; then
    echo -e "\nAll entities have been copied to Impala cluster.\n"
  else
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
    return 4
  fi

  rm -f error.log
  echo -e "\n\nFinished processing db: ${db}\n\n"
}
|
||||
|
||||
|
||||
# Entry point: the first script argument names the database to copy.
# (The second argument, the Hadoop user, is already exported at the top of the script.)
MONITOR_DB=$1
copydb ${MONITOR_DB}
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
# Re-points the "production" views on the Impala cluster to a freshly built
# historical-snapshots database.
#
# Arguments:
#   $1 SOURCE              database holding the new snapshot tables
#   $2 PRODUCTION          database whose views are dropped and recreated over SOURCE
#   $3 SHADOW              accepted but not used by this script
#   $4 MONITOR_PROD        database receiving the historical_snapshots[_fos] views
#   $5 MONITOR_IRISH_PROD  database receiving the historical_snapshots_irish[_fos] views

# Per-user Python egg cache for impala-shell, plus a symlink under /tmp that points
# inside that cache directory.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
  rm -Rf "$link_folder"
  ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

SOURCE=$1
PRODUCTION=$2
SHADOW=$3
MONITOR_PROD=$4
MONITOR_IRISH_PROD=$5

# Single place for the Impala daemon every statement below is sent to.
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'

echo ${SOURCE}
echo ${PRODUCTION}

#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -

echo "Updating ${PRODUCTION} historical snapshots database"
impala-shell -i ${IMPALA_HOSTNAME} -q "create database if not exists ${PRODUCTION}"
# Turn the list of entities in PRODUCTION into "drop view if exists" statements and run them.
impala-shell -i ${IMPALA_HOSTNAME} -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -i ${IMPALA_HOSTNAME} -c -f -
# Turn the list of entities in SOURCE into "create view" statements over SOURCE and run them.
impala-shell -i ${IMPALA_HOSTNAME} -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i ${IMPALA_HOSTNAME} -c -f -
echo "Production monitor db ready!"

# "if exists" keeps a first run (no view yet) from reporting a spurious error,
# consistent with the bulk drop above.
impala-shell -i ${IMPALA_HOSTNAME} -q "drop view if exists ${MONITOR_PROD}.historical_snapshots"
impala-shell -i ${IMPALA_HOSTNAME} -q "drop view if exists ${MONITOR_PROD}.historical_snapshots_fos"

impala-shell -i ${IMPALA_HOSTNAME} -q "create view ${MONITOR_PROD}.historical_snapshots as select * from ${SOURCE}.historical_snapshots"
impala-shell -i ${IMPALA_HOSTNAME} -q "create view ${MONITOR_PROD}.historical_snapshots_fos as select * from ${SOURCE}.historical_snapshots_fos"

impala-shell -i ${IMPALA_HOSTNAME} -q "drop view if exists ${MONITOR_IRISH_PROD}.historical_snapshots_irish"
impala-shell -i ${IMPALA_HOSTNAME} -q "drop view if exists ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos"

impala-shell -i ${IMPALA_HOSTNAME} -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish as select * from ${SOURCE}.historical_snapshots_irish"
# The "_fos" view reads the "_fos" source table, mirroring the non-Irish pair above.
# NOTE(review): assumes ${SOURCE}.historical_snapshots_irish_fos exists - confirm against the script that builds SOURCE.
impala-shell -i ${IMPALA_HOSTNAME} -q "create view ${MONITOR_IRISH_PROD}.historical_snapshots_irish_fos as select * from ${SOURCE}.historical_snapshots_irish_fos"
|
|
@ -1,27 +0,0 @@
|
|||
# Fetches a SQL script from HDFS, substitutes the SOURCE/TARGET placeholders in
# buildIrishMonitorDB.sql and runs the result with the hive CLI.
#
# Arguments: $1 SOURCE, $2 TARGET, $3 SHADOW, $4 SCRIPT_PATH (HDFS path copied to the working directory).

# Per-user Python egg cache for impala-shell, plus a symlink under /tmp that points
# inside that cache directory.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if [ ! -L $link_folder ]; then
  rm -Rf "$link_folder"
  ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4

export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"

echo "Getting file from " ${SCRIPT_PATH}
hdfs dfs -copyToLocal ${SCRIPT_PATH}

# Update the Irish monitor DB: replace the SOURCE and TARGET placeholders, in that
# order, and write the resulting script to "foo".
sed -e "s/SOURCE/${SOURCE}/g" -e "s/TARGET/${TARGET}/g1" buildIrishMonitorDB.sql > foo
hive $HIVE_OPTS -f foo

echo "Hive shell finished"
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
-- Seed the temporary FoS table with the rows of the previous historical database.
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
SELECT *
FROM ${hist_db_name_prev}.historical_snapshots_fos;

-- Append the current snapshot: number of distinct results per combination of the
-- grouping attributes below, tagged with the snapshot date.
INSERT INTO ${hist_db_name}.historical_snapshots_fos_tmp
SELECT
    CAST(${hist_date} AS STRING),
    COUNT(DISTINCT r.id),
    r.type,
    rf.lvl1,
    rf.lvl2,
    pf.publicly_funded,
    r.access_mode,
    r.gold,
    r.green,
    COALESCE(gl.green_with_license, 0),
    h.is_hybrid,
    b.is_bronze_oa,
    d.in_diamond_journal,
    t.is_transformative,
    pr.refereed
FROM ${stats_db_name}.result r
    LEFT OUTER JOIN ${stats_db_name}.result_fos rf                   ON rf.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_publicly_funded pf     ON pf.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_green_with_license gl  ON gl.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_bronze_oa b            ON b.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_diamond d              ON d.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_in_transformative t    ON t.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.indi_pub_hybrid h               ON h.id = r.id
    LEFT OUTER JOIN ${stats_db_name}.result_refereed pr              ON pr.id = r.id
GROUP BY
    r.type,
    rf.lvl1,
    rf.lvl2,
    pf.publicly_funded,
    r.access_mode,
    r.gold,
    r.green,
    gl.green_with_license,
    h.is_hybrid,
    b.is_bronze_oa,
    d.in_diamond_journal,
    t.is_transformative,
    pr.refereed;

-- Materialise the accumulated rows as the new FoS snapshots table.
DROP TABLE IF EXISTS ${hist_db_name}.historical_snapshots_fos PURGE;

CREATE TABLE ${hist_db_name}.historical_snapshots_fos STORED AS PARQUET AS
SELECT *
FROM ${hist_db_name}.historical_snapshots_fos_tmp;

-- Publish a copy of the table in the monitor database.
DROP TABLE IF EXISTS ${monitor_db_name}.historical_snapshots_fos PURGE;

CREATE TABLE ${monitor_db_name}.historical_snapshots_fos STORED AS PARQUET AS
SELECT *
FROM ${hist_db_name}.historical_snapshots_fos;

-- The temporary table is no longer needed.
DROP TABLE ${hist_db_name}.historical_snapshots_fos_tmp PURGE;
|
||||
|
||||
-- Carry over the rows of the previous hist database into the temporary table.
-- (No "AS" keyword here: it belongs to CREATE TABLE ... AS SELECT, not to INSERT INTO ... SELECT.)
INSERT INTO ${hist_db_name}.historical_snapshots_tmp
|
||||
SELECT * FROM ${hist_db_name_prev}.historical_snapshots;
|
||||
|
||||
INSERT INTO ${hist_db_name}.historical_snapshots_tmp
|
||||
select
|
||||
cast(${hist_date} as STRING),
|
||||
count(distinct r.id),
|
||||
r.type,
|
||||
pf.publicly_funded,
|
||||
r.access_mode,
|
||||
r.gold,
|
||||
r.green,
|
||||
coalesce(gl.green_with_license,0),
|
||||
h.is_hybrid,
|
||||
b.is_bronze_oa,
|
||||
d.in_diamond_journal,
|
||||
t.is_transformative,
|
||||
pr.refereed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_green_with_license gl on gl.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_bronze_oa b on b.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_diamond d on d.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_in_transformative t on t.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_hybrid h on h.id=r.id
|
||||
left outer join ${stats_db_name}.result_refereed pr on pr.id=r.id
|
||||
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots STORED AS PARQUET AS
|
||||
SELECT * FROM ${hist_db_name}.historical_snapshots_tmp;
|
||||
|
||||
drop table if exists ${monitor_db_name}.historical_snapshots purge;
|
||||
|
||||
create table ${monitor_db_name}.historical_snapshots stored as parquet
|
||||
as select * from ${hist_db_name}.historical_snapshots;
|
||||
|
||||
drop table ${hist_db_name}.historical_snapshots_tmp purge;
|
|
@ -1,91 +0,0 @@
|
|||
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
|
||||
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish_fos;
|
||||
|
||||
INSERT INTO ${hist_db_name}.historical_snapshots_fos_irish_tmp
|
||||
select
|
||||
cast(${hist_date} as STRING),
|
||||
count(distinct r.id),
|
||||
r.type,
|
||||
rf.lvl1,
|
||||
rf.lvl2,
|
||||
pf.publicly_funded,
|
||||
r.access_mode,
|
||||
r.gold,
|
||||
r.green,
|
||||
coalesce(gl.green_with_license,0),
|
||||
h.is_hybrid,
|
||||
b.is_bronze_oa,
|
||||
d.in_diamond_journal,
|
||||
t.is_transformative,
|
||||
pr.refereed
|
||||
from ${stats_irish_db_name}.result r
|
||||
left outer join ${stats_irish_db_name}.result_fos rf on rf.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
|
||||
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
|
||||
group by r.green, r.gold, r.access_mode, r.type, rf.lvl1,rf.lvl2, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_irish_fos purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_fos STORED AS PARQUET AS
|
||||
SELECT * FROM ${hist_db_name}.historical_snapshots_fos_irish_tmp;
|
||||
|
||||
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
|
||||
|
||||
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
|
||||
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
|
||||
|
||||
drop table ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
|
||||
|
||||
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
|
||||
SELECT * FROM ${hist_db_name_prev}.historical_snapshots_irish;
|
||||
|
||||
INSERT INTO ${hist_db_name}.historical_snapshots_irish_tmp
|
||||
select
|
||||
cast(${hist_date} as STRING),
|
||||
count(distinct r.id),
|
||||
r.type,
|
||||
pf.publicly_funded,
|
||||
r.access_mode,
|
||||
r.gold,
|
||||
r.green,
|
||||
coalesce(gl.green_with_license,0),
|
||||
h.is_hybrid,
|
||||
b.is_bronze_oa,
|
||||
d.in_diamond_journal,
|
||||
t.is_transformative,
|
||||
pr.refereed
|
||||
from ${stats_irish_db_name}.result r
|
||||
left outer join ${stats_irish_db_name}.indi_pub_publicly_funded pf on pf.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_green_with_license gl on gl.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_bronze_oa b on b.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_diamond d on d.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_in_transformative t on t.id=r.id
|
||||
left outer join ${stats_irish_db_name}.indi_pub_hybrid h on h.id=r.id
|
||||
left outer join ${stats_irish_db_name}.result_refereed pr on pr.id=r.id
|
||||
group by r.green, r.gold, r.access_mode, r.type, pf.publicly_funded,r.green, gl.green_with_license,b.is_bronze_oa,d.in_diamond_journal,t.is_transformative,h.is_hybrid,pr.refereed;
|
||||
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_irish purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_irish STORED AS PARQUET AS
|
||||
SELECT * FROM ${hist_db_name}.historical_snapshots_irish_tmp;
|
||||
|
||||
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish purge;
|
||||
|
||||
create table ${monitor_irish_db_name}.historical_snapshots_irish stored as parquet
|
||||
as select * from ${hist_db_name}.historical_snapshots_irish;
|
||||
|
||||
drop table ${hist_db_name}.historical_snapshots_irish_tmp purge;
|
||||
|
||||
|
||||
drop table if exists ${monitor_irish_db_name}.historical_snapshots_irish_fos purge;
|
||||
|
||||
create table ${monitor_irish_db_name}.historical_snapshots_irish_fos stored as parquet
|
||||
as select * from ${hist_db_name}.historical_snapshots_irish_fos;
|
||||
|
||||
-- This temporary table was already dropped earlier in this script, so guard the repeated drop with "if exists".
drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
|
|
@ -1,92 +0,0 @@
|
|||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
-- Historical Snapshots database creation
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
|
||||
DROP database IF EXISTS ${hist_db_name} CASCADE;
|
||||
CREATE database ${hist_db_name};
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_fos_tmp purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_tmp
|
||||
(
|
||||
hist_date STRING,
|
||||
total INT,
|
||||
type STRING,
|
||||
lvl1 STRING,
|
||||
lvl2 STRING,
|
||||
publicly_funded INT,
|
||||
accessrights STRING,
|
||||
gold INT,
|
||||
green INT,
|
||||
green_with_license INT,
|
||||
hybrid INT,
|
||||
bronze INT,
|
||||
diamond INT,
|
||||
transformative INT,
|
||||
peer_reviewed STRING
|
||||
)
|
||||
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_fos_irish_tmp purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_fos_irish_tmp
|
||||
(
|
||||
hist_date STRING,
|
||||
total INT,
|
||||
type STRING,
|
||||
lvl1 STRING,
|
||||
lvl2 STRING,
|
||||
publicly_funded INT,
|
||||
accessrights STRING,
|
||||
gold INT,
|
||||
green INT,
|
||||
green_with_license INT,
|
||||
hybrid INT,
|
||||
bronze INT,
|
||||
diamond INT,
|
||||
transformative INT,
|
||||
peer_reviewed STRING
|
||||
)
|
||||
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_tmp purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_tmp
|
||||
(
|
||||
hist_date STRING,
|
||||
total INT,
|
||||
type STRING,
|
||||
publicly_funded INT,
|
||||
accessrights STRING,
|
||||
gold INT,
|
||||
green INT,
|
||||
green_with_license INT,
|
||||
hybrid INT,
|
||||
bronze INT,
|
||||
diamond INT,
|
||||
transformative INT,
|
||||
peer_reviewed STRING
|
||||
)
|
||||
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
||||
|
||||
drop table if exists ${hist_db_name}.historical_snapshots_irish_tmp purge;
|
||||
|
||||
CREATE TABLE ${hist_db_name}.historical_snapshots_irish_tmp
|
||||
(
|
||||
hist_date STRING,
|
||||
total INT,
|
||||
type STRING,
|
||||
publicly_funded INT,
|
||||
accessrights STRING,
|
||||
gold INT,
|
||||
green INT,
|
||||
green_with_license INT,
|
||||
hybrid INT,
|
||||
bronze INT,
|
||||
diamond INT,
|
||||
transformative INT,
|
||||
peer_reviewed STRING
|
||||
)
|
||||
CLUSTERED BY (hist_date) INTO 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
|
@ -1,159 +0,0 @@
|
|||
<workflow-app name="Stats Hist Snapshots" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>hist_db_name</name>
|
||||
<description>the target hist database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hist_db_name_prev</name>
|
||||
<description>the hist database name of previous_month</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
<description>the stats db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_irish_db_name</name>
|
||||
<description>the stats irish db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_db_name</name>
|
||||
<description>the monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_irish_db_name</name>
|
||||
<description>the irish monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hist_db_prod_name</name>
|
||||
<description>the production db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hist_db_shadow_name</name>
|
||||
<description>the production shadow db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hist_date</name>
|
||||
<description>the snaps date</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_timeout</name>
|
||||
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hadoop_user_name</name>
|
||||
<description>user name of the wf owner</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>analytics</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="CreateDB">${wf:conf('resumeFrom') eq 'CreateDB'}</case>
|
||||
<case to="BuildHistSnaps">${wf:conf('resumeFrom') eq 'BuildHistSnaps'}</case>
|
||||
<case to="BuildHistSnapsIrish">${wf:conf('resumeFrom') eq 'BuildHistSnapsIrish'}</case>
|
||||
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
|
||||
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
|
||||
<default to="BuildHistSnaps"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="CreateDB">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/CreateDB.sql</script>
|
||||
<param>hist_db_name=${hist_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="BuildHistSnaps"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="BuildHistSnaps">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/BuildHistSnapsAll.sql</script>
|
||||
<param>hist_db_name=${hist_db_name}</param>
|
||||
<param>hist_db_name_prev=${hist_db_name_prev}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>monitor_db_name=${monitor_db_name}</param>
|
||||
<param>hist_date=${hist_date}</param>
|
||||
</hive2>
|
||||
<ok to="BuildHistSnapsIrish"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="BuildHistSnapsIrish">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/BuildHistSnapsIrish.sql</script>
|
||||
<param>hist_db_name=${hist_db_name}</param>
|
||||
<param>hist_db_name_prev=${hist_db_name_prev}</param>
|
||||
<param>stats_irish_db_name=${stats_irish_db_name}</param>
|
||||
<param>monitor_irish_db_name=${monitor_irish_db_name}</param>
|
||||
<param>hist_date=${hist_date}</param>
|
||||
</hive2>
|
||||
<ok to="Step2-copyDataToImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="Step2-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<argument>${hist_db_name}</argument>
|
||||
<argument>${hadoop_user_name}</argument>
|
||||
<file>copyDataToImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="Step3-finalizeImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="Step3-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
<argument>${hist_db_name}</argument>
|
||||
<argument>${hist_db_prod_name}</argument>
|
||||
<argument>${hist_db_shadow_name}</argument>
|
||||
<argument>${monitor_db_prod_name}</argument>
|
||||
<argument>${monitor_irish_db_prod_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,32 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<version>2.1.11</version>
|
||||
<configuration>
|
||||
<failOnNoGitDirectory>false</failOnNoGitDirectory>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -1,30 +0,0 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>${jobTracker}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>${nameNode}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,222 +0,0 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
export HADOOP_USER_NAME=$2
|
||||
|
||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||
|
||||
IMPALA_HDFS_NODE=''
|
||||
COUNTER=0
|
||||
while [ $COUNTER -lt 3 ]; do
|
||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||
break
|
||||
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
|
||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
|
||||
break
|
||||
else
|
||||
IMPALA_HDFS_NODE=''
|
||||
sleep 1
|
||||
fi
|
||||
((COUNTER++))
|
||||
done
|
||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||
exit 1
|
||||
fi
|
||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||
|
||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||
|
||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||
|
||||
|
||||
# Set sed arguments.
|
||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||
|
||||
# Set the SED command arguments for column-names with reserved words:
|
||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
||||
|
||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
||||
|
||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||
|
||||
|
||||
function copydb() {
|
||||
db=$1
|
||||
echo -e "\nStart processing db: '${db}'..\n"
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 70 maps * 150 MB/s = 10.5 GB/s (see the "-m" and "-bandwidth" args below)
|
||||
# Using max memory of: 70 maps * 6144 MB = 420 GB
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
# Check the exit status of the "hadoop distcp" command.
|
||||
# Capture the exit status right away: "$?" is overwritten by every later command, including the "[ ... ]" test itself,
# so reading it again inside the "else" branch would report the status of the test instead of the one of "hadoop distcp".
distcp_exit_status=$?
if [ ${distcp_exit_status} -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||
|
||||
echo -e "\nCreating schema for db: '${db}'\n"
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||
|
||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||
if [ -n "$create_view_statement_test" ]; then
|
||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
# Count the collected statements: "${#array[@]}" is the number of elements, whereas "${#array}" is only the character-length of the first element.
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
done
|
||||
|
||||
# Count the statements to retry: "${#array[@]}" is the number of elements, whereas "${#array}" is only the character-length of the first element.
new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
|
||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
# Use the exact name of the retry-array ("..._statements"); a misspelled name expands to nothing, which silently drops every view that should be retried.
all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||
}
|
||||
|
||||
|
||||
MONITOR_DB=$1
|
||||
#HADOOP_USER_NAME=$2
|
||||
copydb $MONITOR_DB
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
# Give impala-shell a per-user Python egg cache, reached through a symlink under /tmp.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
# Whatever sits at that path and is not already a symlink gets removed and replaced by the link.
if ! [ -L $link_folder ]; then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

# $1: database the views will read from.
# $2: database that receives the views.
SOURCE=$1
PRODUCTION=$2
echo ${SOURCE}
echo ${PRODUCTION}

#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -

# Impala daemon that every impala-shell call below connects to.
IMPALA_NODE=impala-cluster-dn1.openaire.eu

echo "Updating ${PRODUCTION} monitor database"
impala-shell -i ${IMPALA_NODE} -q "create database if not exists ${PRODUCTION}"

# Turn each name listed by "show tables" in ${PRODUCTION} into a "drop view if exists" statement and run them.
impala-shell -i ${IMPALA_NODE} -d ${PRODUCTION} -q "show tables" --delimited \
    | sed -e "s/^/drop view if exists ${PRODUCTION}./" -e "s/$/;/" \
    | impala-shell -i ${IMPALA_NODE} -c -f -

# Turn each name listed by "show tables" in ${SOURCE} into a "create view" statement in ${PRODUCTION} and run them.
impala-shell -i ${IMPALA_NODE} -d ${SOURCE} -q "show tables" --delimited \
    | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" \
    | impala-shell -i ${IMPALA_NODE} -c -f -

echo "Production monitor db ready!"
|
|
@ -1,28 +0,0 @@
|
|||
# Builds the Irish monitor database: fetches the SQL template from HDFS, fills in the
# database-name placeholders and runs the result with the hive CLI.
#
# Arguments:
#   $1  value substituted for the SOURCE placeholder in the SQL template
#   $2  value substituted for the TARGET placeholder
#   $3  exported as SHADOW (not used by this script itself)
#   $4  HDFS path of the SQL template to copy into the working directory
#   $5  value substituted for the GRAPHDB placeholder
#
# Exit status: 1 when the template cannot be copied from HDFS, otherwise the exit status of hive.

# Give impala-shell a per-user Python egg cache, reached through a symlink under /tmp.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L "$link_folder" ]
then
    rm -Rf "$link_folder"
    ln -sfn "${PYTHON_EGG_CACHE}${link_folder}" "${link_folder}"
fi

export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export GRAPHDB=$5

export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"

echo "Getting file from " $4
# Stop right away if the template is missing: running hive on a stale or absent file would be misleading.
if ! hdfs dfs -copyToLocal "$4"; then
    echo "ERROR: could not copy '$4' from HDFS" >&2
    exit 1
fi

#update Monitor DB IRISH
#cat CreateDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" | sed "s/GRAPHDB/$3/g1" > foo
# The three substitutions are applied in this order on every line, exactly as the former chain of sed processes did.
sed -e "s/SOURCE/$1/g" -e "s/TARGET/$2/g1" -e "s/GRAPHDB/$5/g1" buildIrishMonitorDB.sql > foo

# HIVE_OPTS is left unquoted on purpose: it must split into separate -hiveconf arguments.
hive $HIVE_OPTS -f foo
hive_status=$?

echo "Hive shell finished"
# Hand hive's status to the caller; without this the echo above would always make the script exit 0.
exit $hive_status
|
||||
|
|
@ -1,241 +0,0 @@
|
|||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
--create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
|
||||
drop table if exists TARGET.irish_funders;
|
||||
|
||||
create TEMPORARY table TARGET.irish_funders as
|
||||
select distinct xpath_string(fundingtree[0].value, '//funder/name') as funder from GRAPHDB.project
|
||||
where xpath_string(fundingtree[0].value, '//funder/jurisdiction')='IE';
|
||||
--create TEMPORARY table TARGET.irish_funders as
|
||||
--select distinct name as funder from SOURCE.fundref where country='IE';
|
||||
|
||||
drop table if exists TARGET.result;
|
||||
|
||||
-- Builds TARGET.result from three groups of SOURCE results:
--   1. results linked to a project whose funder is listed in TARGET.irish_funders
--   2. results linked to an organization whose country is IE
--   3. results with a pid equal to a doi in stats_ext.transformative_facts
-- A result can fall in more than one group, so the outer query keeps distinct rows only.
-- The funder list is read from TARGET.irish_funders, the table created earlier in this script,
-- rather than from a fixed, dated database name.
create table TARGET.result stored as parquet as
select distinct * from (
    select r.*
    from SOURCE.result r
    join SOURCE.result_projects rp on rp.id=r.id
    join SOURCE.project p on p.id=rp.project
    join TARGET.irish_funders irf on irf.funder=p.funder
    union all
    select r.*
    from SOURCE.result r
    join SOURCE.result_organization ro on ro.id=r.id
    join SOURCE.organization o on o.id=ro.organization and o.country='IE'
    union all
    select r.*
    from SOURCE.result r
    join SOURCE.result_pids pid on pid.id=r.id
    join stats_ext.transformative_facts tf on tf.doi=pid.pid
) foo;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
|
||||
create table TARGET.funder stored as parquet as select * from SOURCE.funder where country='IE';
|
||||
|
||||
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
|
||||
|
||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
|
||||
|
||||
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
||||
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
||||
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
||||
drop view TARGET.foo1;
|
||||
drop view TARGET.foo2;
|
||||
|
||||
-- datasources
|
||||
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
|
||||
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
|
||||
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
|
||||
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
|
||||
|
||||
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
|
||||
|
||||
-- organizations
|
||||
create view if not exists TARGET.organization as select * from SOURCE.organization;
|
||||
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
|
||||
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
|
||||
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
|
||||
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
|
||||
|
||||
-- projects
|
||||
create view if not exists TARGET.project as select * from SOURCE.project;
|
||||
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
|
||||
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
|
||||
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
|
||||
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
|
||||
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
|
||||
|
||||
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
|
||||
|
||||
|
||||
-- indicators
|
||||
-- Sprint 1 ----
|
||||
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
-- Sprint 2 ----
|
||||
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
---- Sprint 3 ----
|
||||
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
|
||||
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
|
||||
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
|
||||
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
|
||||
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
|
||||
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
|
||||
---- Sprint 4 ----
|
||||
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
---- Sprint 5 ----
|
||||
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
---- Sprint 6 ----
|
||||
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
||||
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
||||
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
||||
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
||||
---- Sprint 7 ----
|
||||
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
|
||||
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
|
||||
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
|
||||
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
|
||||
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
|
||||
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
|
||||
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
|
||||
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
|
||||
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
|
||||
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
create view TARGET.indi_is_funder_plan_s as select * from SOURCE.indi_is_funder_plan_s;
|
||||
create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
|
||||
create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
|
||||
create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
|
||||
create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
|
||||
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
|
||||
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
|
||||
|
||||
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
|
@ -1,118 +0,0 @@
|
|||
<workflow-app name="Irish Monitor Update" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
<description>the target stats database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graph_db_name</name>
|
||||
<description>the graph database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_irish_db_name</name>
|
||||
<description>the target monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_irish_db_prod_name</name>
|
||||
<description>the name of the production monitor db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_irish_db_shadow_name</name>
|
||||
<description>the name of the shadow monitor db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_timeout</name>
|
||||
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hadoop_user_name</name>
|
||||
<description>user name of the wf owner</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>analytics</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="Step1-buildIrishMonitorDB">${wf:conf('resumeFrom') eq 'Step1-buildIrishMonitorDB'}</case>
|
||||
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
|
||||
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
|
||||
<default to="Step1-buildIrishMonitorDB"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="Step1-buildIrishMonitorDB">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>monitor_irish.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${monitor_irish_db_name}</argument>
|
||||
<argument>${monitor_irish_db_shadow_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/buildIrishMonitorDB.sql</argument>
|
||||
<argument>${graph_db_name}</argument>
|
||||
<file>monitor_irish.sh</file>
|
||||
</shell>
|
||||
<ok to="Step2-copyDataToImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step2-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<argument>${monitor_irish_db_name}</argument>
|
||||
<argument>${hadoop_user_name}</argument>
|
||||
<file>copyDataToImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="Step3-finalizeImpalaCluster"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step3-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
<argument>${monitor_irish_db_name}</argument>
|
||||
<argument>${monitor_irish_db_prod_name}</argument>
|
||||
<argument>${monitor_irish_db_shadow_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,32 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-stats-monitor-update</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<version>2.1.11</version>
|
||||
<configuration>
|
||||
<failOnNoGitDirectory>false</failOnNoGitDirectory>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -1,30 +0,0 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>${jobTracker}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>${nameNode}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=22166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=15596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,223 +0,0 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
export HADOOP_USER_NAME=$2
|
||||
|
||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||
|
||||
# Find which Impala-cluster name-node answers HDFS requests.
# Up to 3 rounds are made; each round probes mn1 first, then mn2, and sleeps 1 second when neither answers.
# COUNTER is left at the number of completed (failed) rounds, which the messages below report.
IMPALA_HDFS_NODE=''
for ((COUNTER = 0; COUNTER < 3; COUNTER++)); do
    for master in impala-cluster-mn1.openaire.eu impala-cluster-mn2.openaire.eu; do
        if hdfs dfs -test -e hdfs://${master}/tmp >/dev/null 2>&1; then
            IMPALA_HDFS_NODE="hdfs://${master}:8020"
            break 2   # leave both loops; COUNTER is not incremented for the successful round
        fi
    done
    sleep 1
done

# No node answered in any round: give up.
[ -n "$IMPALA_HDFS_NODE" ] || {
    echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
    exit 1
}
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||
|
||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||
|
||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||
|
||||
|
||||
# Set sed arguments.
|
||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||
|
||||
# Set the SED command arguments for column-names with reserved words:
|
||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
||||
|
||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
||||
|
||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||
|
||||
|
||||
# Copies one Hive database from the Ocean cluster to the Impala cluster.
#
# Steps: drop the old DB on Impala, copy the DB files with "hadoop distcp", re-create the DB and its
# tables (schemas are taken from the copied parquet files), re-create the views (retrying the ones which
# depend on other views), compute stats for the tables and finally compare the entity-lists of both clusters.
#
# Arguments:
#   $1 - the name of the database to copy.
# Globals read:
#   HADOOP_USER_NAME, IMPALA_HOSTNAME, IMPALA_CONFIG_FILE, OCEAN_HDFS_NODE, IMPALA_HDFS_DB_BASE_PATH,
#   LOCATION_HDFS_NODE_SED_ARG, DATE_SED_ARG_{1,2,3}, HASH_SED_ARG_{1,2,3}, LOCATION_SED_ARG_{1,2,3}
# Side effects:
#   Writes (and removes) the file "error.log" in the current directory.
# Returns:
#   0 on success, 1 if the old DB could not be dropped, 2 if "hadoop distcp" failed,
#   3 if a retry-round created none of the remaining views, 4 if the entity-lists of the two clusters differ.
function copydb() {
    db=$1
    echo -e "\nStart processing db: '${db}'..\n"

    # Delete the old DB from Impala cluster (if exists).
    # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later.
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log
    log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
    if [ -n "$log_errors" ]; then
        echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
        rm -f error.log
        return 1
    fi

    # Make Impala aware of the deletion of the old DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"

    echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
    # "-m 70": use at most 70 simultaneous maps, each one with 6144 MB of memory (mapreduce.map.memory.mb).
    # "-bandwidth 150": the bandwidth-limit of each map, in MB/s.
    # Using 1MB as a buffer-size.
    # The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop.
    # The "ug" args cannot be used as we get a "User does not belong to hive" error.
    # The plain "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error,
    # even after applying chmod and chown on the files. So we preserve only the block-size ("-pb").
    hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
        -numListstatusThreads 40 \
        -copybuffersize 1048576 \
        -strategy dynamic \
        -pb \
        ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

    # Save the exit status of the "hadoop distcp" command right away, as any following command (even the "[" test) overwrites "$?".
    distcp_exit_status=$?
    if [ $distcp_exit_status -eq 0 ]; then
        echo -e "\nSuccessfully copied the files of '${db}'.\n"
    else
        echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
        rm -f error.log
        return 2
    fi

    # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
    #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db

    echo -e "\nCreating schema for db: '${db}'\n"

    # create the new database (with the same name)
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"

    # Make Impala aware of the creation of the new DB immediately.
    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
    # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.

    all_create_view_statements=()

    entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential "WARN" logs.
    for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words.
        # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
        create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.

        create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
        if [ -n "$create_view_statement_test" ]; then
            echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
            create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
                | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
                | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
                | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
            all_create_view_statements+=("$create_view_statement")
        else
            echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
            CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
            if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is no parquet-file inside.
                echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
            else
                impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
                log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
                if [ -n "$log_errors" ]; then
                    echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
                fi
            fi
        fi
    done

    echo -e "\nAll tables have been created, going to create the views..\n"

    # Time to loop through the views and create them.
    # At this point all table-schemas should have been created.

    # Count the ELEMENTS of the array ("[@]" is required; without it we get the string-length of the first element).
    previous_num_of_views_to_retry=${#all_create_view_statements[@]}
    if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
        echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
        # Make Impala aware of the new tables, so it knows them when creating the views.
        sleep 1
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
        sleep 1
    else
        echo -e "\nDB '${db}' does not contain any views.\n"
    fi

    level_counter=0
    while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
        ((level_counter++))
        # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
        # In this case, we should retry creating this particular view again.
        should_retry_create_view_statements=()

        for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
            # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later.
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log
            specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
            if [ -n "$specific_errors" ]; then
                echo -e "\nspecific_errors: ${specific_errors}\n"
                echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
                should_retry_create_view_statements+=("$create_view_statement")
            else
                sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
            fi
        done

        new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
        if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
            # Not a single view was created in this round, so retrying the same statements again would never end.
            echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
            rm -f error.log
            return 3
        elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
            echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
            previous_num_of_views_to_retry=$new_num_of_views_to_retry
        else
            echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
        fi
        all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
    done

    sleep 1
    impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
    sleep 1

    echo -e "\nComputing stats for tables..\n"
    entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
    for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elements are single-words.
        # Taking the create table statement from the Ocean cluster, just to check if it's a view, as the output is easier than using impala-shell from Impala cluster.
        create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
        if [ -z "$create_view_statement" ]; then # If it's a table, then go compute its stats.
            impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
        fi
    done

    # Both entity-lists are plain strings (command outputs), so this is a textual comparison.
    if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
        echo -e "\nAll entities have been copied to Impala cluster.\n"
    else
        echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
        rm -f error.log
        return 4
    fi

    rm -f error.log
    echo -e "\n\nFinished processing db: ${db}\n\n"
}
|
||||
|
||||
|
||||
# Entry point: the first script-argument is the name of the monitor DB.
# The "<name>_institutions" DB is copied first, then the monitor DB itself.
MONITOR_DB=$1

for db_suffix in '_institutions' ''; do
    copydb ${MONITOR_DB}${db_suffix}
done
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
# Python egg-cache setup: unless "link_folder" is already a symbolic link,
# remove whatever is there and link it to the same path under PYTHON_EGG_CACHE.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
[ -L $link_folder ] || {
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
}

# --- Legacy "shadow DB" implementation, kept disabled ---
#
#function createShadowDB() {
# SOURCE=$1
# SHADOW=$2
#
# # drop views from db
# for i in `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} --delimited -q "show tables"`;
# do
# `impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "drop view $i;"`;
# done
#
# impala-shell -i impala-cluster-dn1.openaire.eu -q "drop database ${SHADOW} CASCADE";
# impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${SHADOW}";
## impala-shell -i impala-cluster-dn1.openaire.eu -d ${SHADOW} -q "show tables" | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
# impala-shell -i impala-cluster-dn1.openaire.eu -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -i impala-cluster-dn1.openaire.eu -f -
#}
#
#MONITOR_DB=$1
#MONITOR_DB_SHADOW=$2
#
#createShadowDB $MONITOR_DB'_institutions' $MONITOR_DB'_institutions_shadow'
#createShadowDB $MONITOR_DB $MONITOR_DB'_shadow'

# Script arguments: $1 is the DB holding the tables, $2 is the production DB which will hold the views.
SOURCE=$1
PRODUCTION=$2
echo ${SOURCE}
echo ${PRODUCTION}

# --- Legacy update of the old cluster, kept disabled ---
#echo "Updating ${PRODUCTION} monitor database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}"
#impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f -
#
#echo "Updating ${PRODUCTION}_institutions database old cluster"
#impala-shell -q "create database if not exists ${PRODUCTION}_institutions"
#impala-shell -d ${PRODUCTION}_institutions -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}_institutions./" | sed "s/$/;/" | impala-shell -c -f -
#impala-shell -d ${SOURCE}_institutions -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}_institutions.\1 as select * from ${SOURCE}_institutions.\1;/" | impala-shell -c -f -
#echo "Production insitutions db ready!"

# Points a production DB to a source DB, on the Impala cluster:
# creates the production DB if it is missing, drops every entity listed in it as a view
# and creates one "select *" view for every entity listed in the source DB.
#   $1 - source DB, $2 - production DB, $3 - message printed before, $4 - message printed after.
function refresh_production_views() {
    local src_db="$1"
    local prod_db="$2"

    echo "$3"
    impala-shell -i impala-cluster-dn1.openaire.eu -q "create database if not exists ${prod_db}"
    impala-shell -i impala-cluster-dn1.openaire.eu -d ${prod_db} -q "show tables" --delimited \
        | sed -e "s/^/drop view if exists ${prod_db}./" -e "s/$/;/" \
        | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
    impala-shell -i impala-cluster-dn1.openaire.eu -d ${src_db} -q "show tables" --delimited \
        | sed "s/\(.*\)/create view ${prod_db}.\1 as select * from ${src_db}.\1;/" \
        | impala-shell -i impala-cluster-dn1.openaire.eu -c -f -
    echo "$4"
}

refresh_production_views "${SOURCE}" "${PRODUCTION}" \
    "Updating ${PRODUCTION} monitor database" "Production monitor db ready!"

refresh_production_views "${SOURCE}_institutions" "${PRODUCTION}_institutions" \
    "Updating ${PRODUCTION}_institutions database" "Production insitutions db ready!"
|
|
@ -1,60 +0,0 @@
|
|||
# Builds the monitor DBs with Hive, from SQL templates which are downloaded from HDFS.
#
# Arguments:
#   $1 - source DB name (replaces the "SOURCE" placeholder of the templates)
#   $2 - target DB name (replaces the "TARGET" placeholder; "<$2>_institutions" is used for the institutions DB)
#   $3 - shadow DB name (used only by the disabled code at the bottom)
#   $4, $5, $6 - HDFS paths of the files to download in the current directory; the script later
#                reads "updateMonitorDBAll.sql", "updateMonitorDB_institutions.sql" and "updateMonitorDB.sql" from there.

# Python egg-cache setup: unless "link_folder" is already a symbolic link,
# remove whatever is there and link it to the same path under PYTHON_EGG_CACHE.
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
export SCRIPT_PATH2=$5
# The 6th argument gets its own variable; it used to be exported as SCRIPT_PATH2 again, overwriting the value of the 5th argument.
export SCRIPT_PATH3=$6

export HIVE_OPTS="-hiveconf mapred.job.queue.name=analytics -hiveconf hive.spark.client.connect.timeout=120000ms -hiveconf hive.spark.client.server.connect.timeout=300000ms -hiveconf spark.executor.memory=19166291558 -hiveconf spark.yarn.executor.memoryOverhead=3225 -hiveconf spark.driver.memory=11596411699 -hiveconf spark.yarn.driver.memoryOverhead=1228"
export HADOOP_USER_NAME="oozie"

# Download the three files from HDFS into the current directory.
echo "Getting file from " $4
hdfs dfs -copyToLocal $4

echo "Getting file from " $5
hdfs dfs -copyToLocal $5

echo "Getting file from " $6
hdfs dfs -copyToLocal $6

# Each template is instantiated into the temporary file "foo", which is then executed by Hive.

#update Monitor DB
cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
hive $HIVE_OPTS -f foo

#update Institutions DB
cat updateMonitorDB_institutions.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo
cat updateMonitorDB.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2_institutions/g1" > foo
hive $HIVE_OPTS -f foo

echo "Hive shell finished"

# --- Legacy "shadow DB" steps, kept disabled ---
#echo "Updating shadow monitor insitutions database"
#hive -e "drop database if exists ${SHADOW}_institutions cascade"
#hive -e "create database if not exists ${SHADOW}_institutions"
#hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
#hive -f foo
#echo "Shadow db monitor insitutions ready!"
#
##update Monitor DB
#cat updateMonitorDBAll.sql | sed "s/SOURCE/$1/g" | sed "s/TARGET/$2/g1" > foo
#hive $HIVE_OPTS -f foo
#
#echo "Hive shell finished"
#
#echo "Updating shadow monitor database"
#hive -e "drop database if exists ${SHADOW} cascade"
#hive -e "create database if not exists ${SHADOW}"
#hive $HIVE_OPTS --database ${2} -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${2}.\1;/" > foo
#hive -f foo
#echo "Shadow db monitor insitutions ready!"
|
|
@ -1,278 +0,0 @@
|
|||
--drop database if exists TARGET cascade;
|
||||
--create database if not exists TARGET;
|
||||
--
|
||||
--create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
--create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
--create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
--create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
--create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--
|
||||
--create table TARGET.result stored as parquet as
|
||||
-- select distinct * from (
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
|
||||
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
|
||||
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
|
||||
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
|
||||
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
|
||||
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
|
||||
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
|
||||
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
|
||||
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
|
||||
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
|
||||
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
|
||||
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
|
||||
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
|
||||
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
|
||||
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
|
||||
-- ) )) foo;
|
||||
--
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
|
||||
|
||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
|
||||
--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
|
||||
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
|
||||
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
|
||||
drop view TARGET.foo1;
|
||||
drop view TARGET.foo2;
|
||||
--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
|
||||
|
||||
-- datasources
|
||||
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
|
||||
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
|
||||
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
|
||||
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
|
||||
|
||||
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
|
||||
--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
|
||||
|
||||
-- organizations
|
||||
create view if not exists TARGET.organization as select * from SOURCE.organization;
|
||||
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
|
||||
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
|
||||
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
|
||||
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
|
||||
|
||||
-- projects
|
||||
create view if not exists TARGET.project as select * from SOURCE.project;
|
||||
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
|
||||
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
|
||||
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
|
||||
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
|
||||
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
|
||||
|
||||
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
|
||||
--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
|
||||
|
||||
-- indicators
|
||||
---- Sprint 1 ----
|
||||
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
|
||||
---- Sprint 2 ----
|
||||
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
|
||||
---- Sprint 3 ----
|
||||
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
|
||||
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
|
||||
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
|
||||
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
|
||||
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
|
||||
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
|
||||
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
|
||||
---- Sprint 4 ----
|
||||
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
|
||||
---- Sprint 5 ----
|
||||
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
|
||||
---- Sprint 6 ----
|
||||
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
|
||||
---- Sprint 7 ----
|
||||
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
|
||||
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
|
||||
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
|
||||
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
|
||||
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
|
||||
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
|
||||
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
|
||||
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
|
||||
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
|
||||
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
|
||||
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
|
||||
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
||||
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
|
||||
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
|
@ -1,297 +0,0 @@
|
|||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--create view if not exists TARGET.graduatedoctorates as select * from SOURCE.graduatedoctorates;
|
||||
|
||||
-- TARGET.result: the distinct SOURCE.result rows that satisfy at least one of
--   1. linked through SOURCE.result_projects to a project that exists in SOURCE.project
--   2. having at least one row in SOURCE.result_concepts
--   3. linked through SOURCE.result_organization to one of the organizations listed below
-- The three branches are combined with UNION ALL and de-duplicated by the outer SELECT DISTINCT.
-- Every other result_* and indi_* table of this script is filtered against this table.
create table TARGET.result stored as parquet as
select distinct * from (
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
    union all
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
    union all
    select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
        'openorgs____::b84450f9864182c67b8611b5593f4250', -- Athena Research and Innovation Center in Information Communication & Knowledge Technologies (ARC)
        'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', -- National Research Council
        'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', -- unidentified organization, the id may no longer exist (TODO verify)
        'openorgs____::d169c7407dd417152596908d48c11460', -- Masaryk University
        'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', -- University of Belgrade
        'openorgs____::0ae431b820e4c33db8967fbb2b919150', -- University of Helsinki
        'openorgs____::759d59f05d77188faee99b7493b46805', -- University of Minho
        'openorgs____::cad284878801b9465fa51a95b1d779db', -- Universidad Politécnica de Madrid
        'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- University of Göttingen
        'openorgs____::c0286313e36479eff8676dba9b724b40', -- National and Kapodistrian University of Athens
        -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', -- Université Paris Diderot (disabled)
        'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', -- Bielefeld University
        'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', -- University of Southern Denmark
        'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', -- Humboldt-Universität zu Berlin
        'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', -- TU Darmstadt
        'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', -- KU Leuven
        'openorgs____::4c6f119632adf789746f0a057ed73e90', -- University of the Western Cape
        'openorgs____::ec3665affa01aeafa28b7852c4176dbd', -- Rudjer Boskovic Institute
        'openorgs____::5f31346d444a7f06a28c880fb170b0f6', -- Ghent University
        'openorgs____::2dbe47117fd5409f9c61620813456632', -- University of Luxembourg
        'openorgs____::6445d7758d3a40c4d997953b6632a368', -- National Institute of Informatics (NII)
        'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
        'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
        'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
        'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
        'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
        'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
        'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
        'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
        'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
        'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
        'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
        'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
        'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
        'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
        'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
        'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
        'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
        'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
        'openorgs____::57784c9e047e826fefdb1ef816120d92', -- Arts et Métiers ParisTech
        'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
        'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
        'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
        'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
        'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
        'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
        'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
        'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
        'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
        'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
        'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
        'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
        'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
        'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
        'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
        'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
        'openorgs____::a6340e6ecf60f6bba163659df985b0f2'  -- TU Dresden
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
-- The lookup views (category, concept, context, country, countrygdp, creation_date, funder, fundref,
-- rndexpenditure, rndgdpexpenditure, doctoratestudents, totalresearchers, totalresearchersft, hrrst)
-- are created once, right after the TARGET database is created at the top of this script.
-- The second, identical run of create-view-if-not-exists statements that sat here could only
-- ever be a no-op and has been removed.
|
||||
|
||||
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_citations COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_references_oc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_citations_oc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_classifications COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_apc COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_concepts COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_datasources COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_fundercount COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_gold COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_greenoa COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_languages COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_licenses stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_licenses COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.licenses_normalized STORED AS PARQUET as select * from SOURCE.licenses_normalized;
|
||||
--ANALYZE TABLE TARGET.licenses_normalized COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_oids COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_organization COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_peerreviewed COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_pids COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_projectcount COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_projects COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_refereed COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_sources COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_topics COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_fos stored as parquet as select * from SOURCE.result_fos orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_fos COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_accessroute stored as parquet as select * from SOURCE.result_accessroute orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_accessroute COMPUTE STATISTICS;
|
||||
|
||||
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
|
||||
-- result_result: keep every relation whose source OR target is a result present in TARGET.result.
-- Each side is selected through its own temporary view, the two are unioned and de-duplicated
-- into the final table, and the temporary views are dropped afterwards.
create view TARGET.foo1 as
select * from SOURCE.result_result by_source
where by_source.source in (select id from TARGET.result);

create view TARGET.foo2 as
select * from SOURCE.result_result by_target
where by_target.target in (select id from TARGET.result);

create table TARGET.result_result stored as parquet as
select distinct *
from (
    select * from TARGET.foo1
    union all
    select * from TARGET.foo2
) either_side;

drop view TARGET.foo1;
drop view TARGET.foo2;
|
||||
--ANALYZE TABLE TARGET.result_result COMPUTE STATISTICS;
|
||||
|
||||
-- datasources
|
||||
create view if not exists TARGET.datasource as select * from SOURCE.datasource;
|
||||
create view if not exists TARGET.datasource_oids as select * from SOURCE.datasource_oids;
|
||||
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
|
||||
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
|
||||
|
||||
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
|
||||
--ANALYZE TABLE TARGET.datasource_results COMPUTE STATISTICS;
|
||||
|
||||
-- organizations
|
||||
create view if not exists TARGET.organization as select * from SOURCE.organization;
|
||||
create view if not exists TARGET.organization_datasources as select * from SOURCE.organization_datasources;
|
||||
create view if not exists TARGET.organization_pids as select * from SOURCE.organization_pids;
|
||||
create view if not exists TARGET.organization_projects as select * from SOURCE.organization_projects;
|
||||
create view if not exists TARGET.organization_sources as select * from SOURCE.organization_sources;
|
||||
|
||||
-- projects
|
||||
create view if not exists TARGET.project as select * from SOURCE.project;
|
||||
create view if not exists TARGET.project_oids as select * from SOURCE.project_oids;
|
||||
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
|
||||
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
|
||||
create view if not exists TARGET.project_classification as select * from SOURCE.project_classification;
|
||||
create view if not exists TARGET.project_organization_contribution as select * from SOURCE.project_organization_contribution;
|
||||
|
||||
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
|
||||
--ANALYZE TABLE TARGET.project_results COMPUTE STATISTICS;
|
||||
|
||||
-- indicators
|
||||
---- Sprint 1 ----
|
||||
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_green_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_grey_lit COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_doi_from_crossref COMPUTE STATISTICS;
|
||||
---- Sprint 2 ----
|
||||
create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_has_cc_licence COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_has_cc_licence_url COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_has_abstract COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_with_orcid COMPUTE STATISTICS;
|
||||
---- Sprint 3 ----
|
||||
create table TARGET.indi_funded_result_with_fundref stored as parquet as select * from SOURCE.indi_funded_result_with_fundref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_funded_result_with_fundref COMPUTE STATISTICS;
|
||||
create view TARGET.indi_result_org_collab as select * from SOURCE.indi_result_org_collab;
|
||||
create view TARGET.indi_result_org_country_collab as select * from SOURCE.indi_result_org_country_collab;
|
||||
create view TARGET.indi_project_collab_org as select * from SOURCE.indi_project_collab_org;
|
||||
create view TARGET.indi_project_collab_org_country as select * from SOURCE.indi_project_collab_org_country;
|
||||
create view TARGET.indi_funder_country_collab as select * from SOURCE.indi_funder_country_collab;
|
||||
create view TARGET.indi_result_country_collab as select * from SOURCE.indi_result_country_collab;
|
||||
---- Sprint 4 ----
|
||||
create table TARGET.indi_pub_diamond stored as parquet as select * from SOURCE.indi_pub_diamond orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_diamond COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_in_transformative stored as parquet as select * from SOURCE.indi_pub_in_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_in_transformative COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_closed_other_open stored as parquet as select * from SOURCE.indi_pub_closed_other_open orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_closed_other_open COMPUTE STATISTICS;
|
||||
---- Sprint 5 ----
|
||||
create table TARGET.indi_result_no_of_copies stored as parquet as select * from SOURCE.indi_result_no_of_copies orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_no_of_copies COMPUTE STATISTICS;
|
||||
---- Sprint 6 ----
|
||||
create table TARGET.indi_pub_hybrid_oa_with_cc stored as parquet as select * from SOURCE.indi_pub_hybrid_oa_with_cc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_hybrid_oa_with_cc COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_bronze_oa stored as parquet as select * from SOURCE.indi_pub_bronze_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_bronze_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads stored as parquet as select * from SOURCE.indi_pub_downloads orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_datasource stored as parquet as select * from SOURCE.indi_pub_downloads_datasource orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_year stored as parquet as select * from SOURCE.indi_pub_downloads_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_year COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_downloads_datasource_year stored as parquet as select * from SOURCE.indi_pub_downloads_datasource_year orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_downloads_datasource_year COMPUTE STATISTICS;
|
||||
---- Sprint 7 ----
|
||||
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_gold_oa COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_hybrid stored as parquet as select * from SOURCE.indi_pub_hybrid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_hybrid COMPUTE STATISTICS;
|
||||
create view TARGET.indi_org_fairness as select * from SOURCE.indi_org_fairness;
|
||||
create view TARGET.indi_org_fairness_pub_pr as select * from SOURCE.indi_org_fairness_pub_pr;
|
||||
create view TARGET.indi_org_fairness_pub_year as select * from SOURCE.indi_org_fairness_pub_year;
|
||||
create view TARGET.indi_org_fairness_pub as select * from SOURCE.indi_org_fairness_pub;
|
||||
create view TARGET.indi_org_fairness_year as select * from SOURCE.indi_org_fairness_year;
|
||||
create view TARGET.indi_org_findable_year as select * from SOURCE.indi_org_findable_year;
|
||||
create view TARGET.indi_org_findable as select * from SOURCE.indi_org_findable;
|
||||
create view TARGET.indi_org_openess as select * from SOURCE.indi_org_openess;
|
||||
create view TARGET.indi_org_openess_year as select * from SOURCE.indi_org_openess_year;
|
||||
create table TARGET.indi_pub_has_preprint stored as parquet as select * from SOURCE.indi_pub_has_preprint orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_has_preprint COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_in_subscribed stored as parquet as select * from SOURCE.indi_pub_in_subscribed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_in_subscribed COMPUTE STATISTICS;
|
||||
create table TARGET.indi_result_with_pid stored as parquet as select * from SOURCE.indi_result_with_pid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_result_with_pid COMPUTE STATISTICS;
|
||||
create table TARGET.indi_impact_measures stored as parquet as select * from SOURCE.indi_impact_measures orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_impact_measures COMPUTE STATISTICS;
|
||||
create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * from SOURCE.indi_pub_interdisciplinarity orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.indi_pub_interdisciplinarity COMPUTE STATISTICS;
|
||||
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--ANALYZE TABLE TARGET.result_apc_affiliations COMPUTE STATISTICS;
|
||||
--create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--create table TARGET.indi_is_funder_plan_s stored as parquet as select * from SOURCE.indi_is_funder_plan_s orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
--create view TARGET.indi_funder_fairness as select * from SOURCE.indi_funder_fairness;
|
||||
--create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_openess;
|
||||
--create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
|
||||
--create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
|
||||
--create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
|
||||
--create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
|
|
@ -1,67 +0,0 @@
|
|||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
create table TARGET.result stored as parquet as
|
||||
select distinct * from (
|
||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
|
||||
'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
|
||||
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
|
||||
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
|
||||
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
|
||||
'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
|
||||
'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
'openorgs____::38d7097854736583dde879d12dacafca', -- Brown University
|
||||
'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
|
||||
'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
|
||||
'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
|
||||
'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
|
||||
'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
|
||||
'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
|
||||
'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
|
||||
'openorgs____::3cff625a4370d51e08624cc586138b2f', -- IMT Atlantique
|
||||
'openorgs____::c0b262bd6eab819e4c994914f9c010e2', -- National Institute of Geophysics and Volcanology
|
||||
'openorgs____::1624ff7c01bb641b91f4518539a0c28a', -- Vrije Universiteit Amsterdam
|
||||
'openorgs____::4d4051b56708688235252f1d8fddb8c1', --Iscte - Instituto Universitário de Lisboa
|
||||
'openorgs____::ab4ac74c35fa5dada770cf08e5110fab', -- Universidade Católica Portuguesa
|
||||
'openorgs____::4d4051b56708688235252f1d8fddb8c1', -- Iscte - Instituto Universitário de Lisboa
|
||||
'openorgs____::5d55fb216b14691cf68218daf5d78cd9', -- Munster Technological University
|
||||
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
|
||||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
|
||||
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -1,111 +0,0 @@
|
|||
<workflow-app name="Stats Monitor Update" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
<description>the target stats database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_db_name</name>
|
||||
<description>the target monitor db name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>monitor_db_shadow_name</name>
|
||||
<description>the name of the shadow monitor db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_timeout</name>
|
||||
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hadoop_user_name</name>
|
||||
<description>user name of the wf owner</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>analytics</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="Step1-updateMonitorDB">${wf:conf('resumeFrom') eq 'Step1-updateMonitorDB'}</case>
|
||||
<case to="Step2-copyDataToImpalaCluster">${wf:conf('resumeFrom') eq 'Step2-copyDataToImpalaCluster'}</case>
|
||||
<case to="Step3-finalizeImpalaCluster">${wf:conf('resumeFrom') eq 'Step3-finalizeImpalaCluster'}</case>
|
||||
<default to="Step1-updateMonitorDB"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="Step1-updateMonitorDB">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>monitor.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/updateMonitorDB_institutions.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/updateMonitorDB.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/updateMonitorDBAll.sql</argument>
|
||||
<file>monitor.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step2-copyDataToImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${hadoop_user_name}</argument>
|
||||
<file>copyDataToImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step3-finalizeImpalaCluster">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>finalizeImpalaCluster.sh</exec>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${monitor_db_prod_name}</argument>
|
||||
<argument>${monitor_db_shadow_name}</argument>
|
||||
<file>finalizeImpalaCluster.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -8,11 +8,6 @@
|
|||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||
|
|
|
@ -35,20 +35,12 @@ export HADOOP_USER="oozie"
|
|||
export HADOOP_USER_NAME="oozie"
|
||||
|
||||
echo "Creating and populating impala tables"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context_csv (id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context_csv"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context stored as parquet as select * from ${TARGET_DB}.context_csv"
|
||||
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.context_csv purge"
|
||||
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category_csv (context string, id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category_csv"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category stored as parquet as select * from ${TARGET_DB}.category_csv"
|
||||
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.category_csv purge"
|
||||
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept_csv (category string, id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept_csv"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept stored as parquet as select * from ${TARGET_DB}.concept_csv"
|
||||
hive $HIVE_OPTS -e "drop table ${TARGET_DB}.concept_csv purge"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
|
||||
hive $HIVE_OPTS -e "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
|
||||
|
||||
echo "Cleaning up"
|
||||
rm concepts.csv
|
||||
|
|
|
@ -6,215 +6,68 @@ then
|
|||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
|
||||
# Set the active HDFS node of OCEAN and IMPALA cluster.
|
||||
OCEAN_HDFS_NODE='hdfs://nameservice1'
|
||||
echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}"
|
||||
|
||||
IMPALA_HDFS_NODE=''
|
||||
COUNTER=0
|
||||
while [ $COUNTER -lt 3 ]; do
|
||||
if hdfs dfs -test -e hdfs://impala-cluster-mn1.openaire.eu/tmp >/dev/null 2>&1; then
|
||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn1.openaire.eu:8020'
|
||||
break
|
||||
elif hdfs dfs -test -e hdfs://impala-cluster-mn2.openaire.eu/tmp >/dev/null 2>&1; then
|
||||
IMPALA_HDFS_NODE='hdfs://impala-cluster-mn2.openaire.eu:8020'
|
||||
break
|
||||
else
|
||||
IMPALA_HDFS_NODE=''
|
||||
sleep 1
|
||||
fi
|
||||
((COUNTER++))
|
||||
done
|
||||
if [ -z "$IMPALA_HDFS_NODE" ]; then
|
||||
echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n"
|
||||
exit 1
|
||||
fi
|
||||
echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n"
|
||||
|
||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||
IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml'
|
||||
|
||||
IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse"
|
||||
|
||||
# Set sed arguments.
|
||||
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs)
|
||||
|
||||
# Set the SED command arguments for column-names with reserved words:
|
||||
DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g'
|
||||
DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing.
|
||||
DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g'
|
||||
|
||||
HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g'
|
||||
HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g'
|
||||
HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g'
|
||||
|
||||
LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g'
|
||||
LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g'
|
||||
LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g'
|
||||
|
||||
|
||||
export HADOOP_USER_NAME=$6
|
||||
export PROD_USAGE_STATS_DB="openaire_prod_usage_stats"
|
||||
|
||||
|
||||
function copydb() {
|
||||
db=$1
|
||||
echo -e "\nStart processing db: '${db}'..\n"
|
||||
FILE=("hive_wf_tmp_"$RANDOM)
|
||||
hdfs dfs -mkdir hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
|
||||
# copy the databases from ocean to impala
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
fi
|
||||
echo "copying $db"
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -pb hdfs://nameservice1/user/hive/warehouse/${db}.db hdfs://impala-cluster-mn1.openaire.eu:8020/tmp/$FILE/
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
# change ownership to impala
|
||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -chmod -R 777 /tmp/$FILE/${db}.db
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 70 * 150 MB/s = 10.5 GB/s (at most 70 maps, each limited to 150 MB/s)
|
||||
# Using max memory of: 70 * 6144 MB = 420 GB
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown on the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
# Check the exit status of the "hadoop distcp" command.
|
||||
# Capture the exit status of the preceding "hadoop distcp" command right away: "$?" is overwritten by every later command, including the "[ ... ]" test itself, so it cannot be read again inside the "else" branch.
|
||||
distcp_exit_status=$?
|
||||
if [ $distcp_exit_status -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: ${distcp_exit_status}\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
#hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
|
||||
|
||||
echo -e "\nCreating schema for db: '${db}'\n"
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command.
|
||||
create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement.
|
||||
|
||||
create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'`
|
||||
if [ -n "$create_view_statement_test" ]; then
|
||||
echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n"
|
||||
create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \
|
||||
| sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \
|
||||
| sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \
|
||||
| sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"`
|
||||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
# drop tables from db
|
||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
||||
do
|
||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table $i;"`;
|
||||
done
|
||||
|
||||
# Count the views that still have to be retried. The "[@]" subscript is required: "${#array}" alone gives the string length of the first element, not the number of elements.
|
||||
new_num_of_views_to_retry=${#should_retry_create_view_statements[@]}
|
||||
# If every view attempted in this pass has to be retried, no progress was made, so stop instead of looping forever.
|
||||
if [[ $new_num_of_views_to_retry -eq ${#all_create_view_statements[@]} ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
# Carry over exactly the statements that failed in this pass (array name spelled with the trailing "s", as it is declared and appended to earlier in the loop).
|
||||
all_create_view_statements=("${should_retry_create_view_statements[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
# drop views from db
|
||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
||||
do
|
||||
`impala-shell -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop view $i;"`;
|
||||
done
|
||||
|
||||
# delete the database
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "drop database if exists ${db} cascade";
|
||||
|
||||
# create the databases
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -q "create database ${db}";
|
||||
|
||||
impala-shell --user $HADOOP_USER_NAME -q "INVALIDATE METADATA"
|
||||
echo "creating schema for ${db}"
|
||||
for (( k = 0; k < 5; k ++ )); do
|
||||
for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
||||
do
|
||||
impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
||||
done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
||||
# do
|
||||
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
||||
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
||||
#
|
||||
# # run the same command twice because we may have failures in the first run (due to views pointing to the same db)
|
||||
# for i in `impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show tables"`;
|
||||
# do
|
||||
# impala-shell --user $HADOOP_USER_NAME -d ${db} --delimited -q "show create table $i";
|
||||
# done | sed 's/"$/;/' | sed 's/^"//' | sed 's/[[:space:]]\date[[:space:]]/`date`/g' | impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -c -f -
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
# load the data from /tmp in the respective tables
|
||||
echo "copying data in tables and computing stats"
|
||||
for i in `impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} --delimited -q "show tables"`;
|
||||
do
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "load data inpath '/tmp/$FILE/${db}.db/$i' into table $i";
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "compute stats $i";
|
||||
done
|
||||
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
echo -e "\n\nFinished processing db: ${db}\n\n"
|
||||
# deleting the remaining directory from hdfs
|
||||
hdfs dfs -conf /etc/impala_cluster/hdfs-site.xml -rm -R /tmp/$FILE/${db}.db
|
||||
}
|
||||
|
||||
STATS_DB=$1
|
||||
|
|
|
@ -85,12 +85,12 @@ hive $HIVE_OPTS --database ${2}_funded -e "show tables" | grep -v WARN | sed "s/
|
|||
hive -f foo
|
||||
echo "Updated shadow monitor funded database"
|
||||
|
||||
echo "Updating shadow monitor institutions database"
|
||||
echo "Updating shadow monitor insitutions database"
|
||||
hive -e "drop database if exists ${SHADOW}_institutions cascade"
|
||||
hive -e "create database if not exists ${SHADOW}_institutions"
|
||||
hive $HIVE_OPTS --database ${2}_institutions -e "show tables" | grep -v WARN | sed "s/\(.*\)/create view ${SHADOW}_institutions.\1 as select * from ${2}_institutions.\1;/" > foo
|
||||
hive -f foo
|
||||
echo "Shadow db monitor institutions ready!"
|
||||
echo "Shadow db monitor insitutions ready!"
|
||||
|
||||
echo "Updating shadow monitor RIs database"
|
||||
for i in $contexts
|
||||
|
|
|
@ -69,7 +69,7 @@ SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
|||
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
|
||||
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
|
||||
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
|
||||
from (
|
||||
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
|
||||
FROM ${openaire_db_name}.result res
|
||||
|
|
|
@ -7,76 +7,32 @@
|
|||
------------------------------------------------------
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
|
||||
non_peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
select substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
|
||||
non_peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
select substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
|
||||
non_peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
select substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='peerReviewed'),
|
||||
non_peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
select substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
|
||||
select * from ${stats_db_name}.publication_refereed
|
||||
|
@ -104,4 +60,4 @@ rel.properties[1].value apc_currency
|
|||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.organization o on o.id=rel.source
|
||||
join ${openaire_db_name}.result r on r.id=rel.target
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
|
||||
|
|
|
@ -50,13 +50,13 @@ select distinct r.*
|
|||
from (
|
||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view explode(inst.pid) pids as p) r
|
||||
join ${stats_db_name}.result res on res.id=r.id;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
|
||||
|
||||
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
|
||||
select distinct r.id, r.amount, r.currency
|
||||
select r.id, r.amount, r.currency
|
||||
from (
|
||||
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -180,12 +180,4 @@ create view TARGET.indi_funder_openess as select * from SOURCE.indi_funder_opene
|
|||
create view TARGET.indi_funder_findable as select * from SOURCE.indi_funder_findable;
|
||||
create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
|
||||
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
|
||||
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
|
||||
|
||||
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
|
|
@ -80,12 +80,8 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
|
||||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna
|
||||
|
||||
) )) foo;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
|
@ -268,11 +264,4 @@ create view TARGET.indi_ris_fairness as select * from SOURCE.indi_ris_fairness;
|
|||
create view TARGET.indi_ris_openess as select * from SOURCE.indi_ris_openess;
|
||||
create view TARGET.indi_ris_findable as select * from SOURCE.indi_ris_findable;
|
||||
|
||||
create table TARGET.indi_pub_green_with_license stored as parquet as select * from SOURCE.indi_pub_green_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_country stored as parquet as select * from SOURCE.result_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_oa_with_license stored as parquet as select * from SOURCE.indi_result_oa_with_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_result_oa_without_license stored as parquet as select * from SOURCE.indi_result_oa_without_license orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_result_under_transformative stored as parquet as select * from SOURCE.indi_result_under_transformative orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
|
|
|
@ -60,10 +60,5 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::0fccc7640f0cb44d5cd1b06b312a06b9', -- Cardiff University
|
||||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f' -- Alma Mater Studiorum University of Bologna
|
||||
))) foo;
|
|
@ -2,8 +2,9 @@ create table ${observatory_db_name}.result_cc_licence stored as parquet as
|
|||
select r.id, coalesce(rln.count, 0) > 0 as cc_licence
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
|
||||
select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
|
||||
from ${stats_db_name}.result_licenses rl
|
||||
left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license
|
||||
group by rl.id
|
||||
) rln on rln.id=r.id;
|
||||
|
||||
|
|
|
@ -95,8 +95,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.funder purge;
|
|||
create table ${stats_db_name}.funder STORED AS PARQUET as
|
||||
select distinct xpath_string(fund, '//funder/id') as id,
|
||||
xpath_string(fund, '//funder/name') as name,
|
||||
xpath_string(fund, '//funder/shortname') as shortname,
|
||||
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
|
||||
xpath_string(fund, '//funder/shortname') as shortname
|
||||
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge;
|
||||
|
|
|
@ -64,26 +64,6 @@
|
|||
<name>hadoop_user_name</name>
|
||||
<description>user name of the wf owner</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkSqlWarehouseDir</name>
|
||||
</property>
|
||||
<!-- General oozie workflow properties -->
|
||||
<property>
|
||||
<name>sparkClusterOpts</name>
|
||||
<value>--conf spark.network.timeout=600 --conf spark.extraListeners= --conf spark.sql.queryExecutionListeners= --conf spark.yarn.historyServer.address=http://iis-cdh5-test-m3.ocean.icm.edu.pl:18088 --conf spark.eventLog.dir=hdfs://nameservice1/user/spark/applicationHistory</value>
|
||||
<description>spark cluster-wide options</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkResourceOpts</name>
|
||||
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
|
||||
<description>spark resource options</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkApplicationOpts</name>
|
||||
<value>--conf spark.sql.shuffle.partitions=3840</value>
|
||||
<description>spark resource options</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -95,21 +75,17 @@
|
|||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.mapjoin.followby.gby.localtask.max.memory.usage</name>
|
||||
<value>0.80</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>analytics</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>analytics</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
|
@ -157,164 +133,164 @@
|
|||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step1.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step2">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step2.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step3"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step3">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step3.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step4"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step4">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step4.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step5">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step5.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step6"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step6">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step6.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step7"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step7">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step7.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step8"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step8">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step8.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step9"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step9">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step9.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step10"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step10">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step10.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step11"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
</action>
|
||||
|
||||
<action name="Step11">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step11.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step12"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
</action>
|
||||
|
||||
<action name="Step12">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step12.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step13"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step13">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step13.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step14"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step14">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step14.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step15"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="Step15">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step15.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step15_5"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -345,35 +321,13 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="Step16-createIndicatorsTables">-->
|
||||
<!-- <hive2 xmlns="uri:oozie:hive2-action:0.1">-->
|
||||
<!-- <jdbc-url>${hive_jdbc_url}</jdbc-url>-->
|
||||
<!-- <script>scripts/step16-createIndicatorsTables.sql</script>-->
|
||||
<!-- <param>stats_db_name=${stats_db_name}</param>-->
|
||||
<!-- <param>external_stats_db_name=${external_stats_db_name}</param>-->
|
||||
<!-- </hive2>-->
|
||||
<!-- <ok to="Step16_1-definitions"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<action name="Step16-createIndicatorsTables">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Step16-createIndicatorsTables</name>
|
||||
<class>eu.dnetlib.dhp.oozie.RunSQLSparkJob</class>
|
||||
<jar>dhp-stats-update-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
</spark-opts>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
|
||||
<arg>--stats_db_name</arg><arg>${stats_db_name}</arg>
|
||||
<arg>--external_stats_db_name</arg><arg>${external_stats_db_name}</arg>
|
||||
</spark>
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step16-createIndicatorsTables.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>external_stats_db_name=${external_stats_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step16_1-definitions"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
@ -433,18 +387,18 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- <action name="step20-createMonitorDB-post">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>monitor-post.sh</exec>-->
|
||||
<!-- <argument>${monitor_db_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_shadow_name}</argument>-->
|
||||
<!-- <file>monitor-post.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
<!-- <action name="step20-createMonitorDB-post">-->
|
||||
<!-- <shell xmlns="uri:oozie:shell-action:0.1">-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <exec>monitor-post.sh</exec>-->
|
||||
<!-- <argument>${monitor_db_name}</argument>-->
|
||||
<!-- <argument>${monitor_db_shadow_name}</argument>-->
|
||||
<!-- <file>monitor-post.sh</file>-->
|
||||
<!-- </shell>-->
|
||||
<!-- <ok to="step21-createObservatoryDB-pre"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<action name="step21-createObservatoryDB-pre">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
|
@ -489,8 +443,8 @@
|
|||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>copyDataToImpalaCluster.sh</exec>
|
||||
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
|
||||
<!-- <argument>${external_stats_db_name}</argument>-->
|
||||
<!-- <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>-->
|
||||
<!-- <argument>${external_stats_db_name}</argument>-->
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${monitor_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
|
@ -551,4 +505,4 @@
|
|||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
||||
</workflow-app>
|
||||
|
|
|
@ -31,10 +31,6 @@
|
|||
<module>dhp-enrichment</module>
|
||||
<module>dhp-graph-provision</module>
|
||||
<module>dhp-blacklist</module>
|
||||
<module>dhp-stats-actionsets</module>
|
||||
<module>dhp-stats-hist-snaps</module>
|
||||
<module>dhp-stats-monitor-irish</module>
|
||||
<module>dhp-stats-monitor-update</module>
|
||||
<module>dhp-stats-update</module>
|
||||
<module>dhp-stats-promote</module>
|
||||
<module>dhp-usage-stats-build</module>
|
||||
|
|
Loading…
Reference in New Issue