Compare commits
61 Commits
main
...
affiliatio
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 420f43fc2f | |
Miriam Baglioni | 595883fef0 | |
Miriam Baglioni | f8988af98d | |
Giambattista Bloisi | 56b05cde0b | |
Claudio Atzori | 62ff843334 | |
Claudio Atzori | d5867a1992 | |
Claudio Atzori | e5df68772d | |
Miriam Baglioni | 7e6d12fa77 | |
Miriam Baglioni | 191fc3a461 | |
Claudio Atzori | 10696f2a44 | |
Claudio Atzori | 5734b80861 | |
Antonis Lempesis | f3c179658a | |
Miriam Baglioni | b18ad035c1 | |
Miriam Baglioni | e430826e00 | |
Giambattista Bloisi | c45cae447a | |
Claudio Atzori | 3fcafc7ed6 | |
Miriam Baglioni | 599e56dbc6 | |
Claudio Atzori | 6397141e56 | |
Claudio Atzori | e354f9853a | |
Claudio Atzori | 535a7b99f1 | |
Sandro La Bruzzo | 6a097abc89 | |
Michele Artini | 9754521847 | |
Michele Artini | 54f8b4da39 | |
Miriam Baglioni | 4d3e079590 | |
Michele Artini | e941adbe2b | |
Michele Artini | fdbe629f49 | |
Antonis Lempesis | 619aa34a15 | |
Antonis Lempesis | dbea7a4072 | |
Antonis Lempesis | c9241dba0d | |
Michele Artini | 755a5aefcf | |
Michele Artini | db6f137cf9 | |
Serafeim Chatzopoulos | 50401a872f | |
Antonis Lempesis | 37ad259296 | |
Antonis Lempesis | b64c144abf | |
Serafeim Chatzopoulos | 37c04cbad7 | |
Miriam Baglioni | 468f2aa5a5 | |
Miriam Baglioni | 89fcf4086c | |
Miriam Baglioni | 8c185a7b1a | |
Miriam Baglioni | 985ca15264 | |
Antonis Lempesis | d0590e0e49 | |
Antonis Lempesis | 7d2c0a3723 | |
Lampros Smyrnaios | e9686365a2 | |
Lampros Smyrnaios | ce0aee21cc | |
Lampros Smyrnaios | 7b7dd32ad5 | |
Lampros Smyrnaios | 7ce051d766 | |
Lampros Smyrnaios | aa4d7d5e20 | |
Lampros Smyrnaios | 54e11b6a43 | |
Lampros Smyrnaios | fe2275a9b0 | |
Lampros Smyrnaios | a644a6f4fe | |
Lampros Smyrnaios | 888637773c | |
Lampros Smyrnaios | e0ac494859 | |
Lampros Smyrnaios | 3c17183d10 | |
Lampros Smyrnaios | 69a9ac7393 | |
Lampros Smyrnaios | 342223f75c | |
Lampros Smyrnaios | 2616971e2b | |
Lampros Smyrnaios | ba533d9f34 | |
Lampros Smyrnaios | d46b78b659 | |
Lampros Smyrnaios | 6f2ebb2a52 | |
Lampros Smyrnaios | ca091c0f1e | |
Lampros Smyrnaios | 0b897f2f66 | |
Lampros Smyrnaios | db33f7727c |
|
@ -212,11 +212,11 @@ public class HttpConnector2 {
|
|||
.format(
|
||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||
MAPPER.writeValueAsString(report)));
|
||||
} catch (MalformedURLException | UnknownHostException e) {
|
||||
} catch (MalformedURLException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e.getMessage(), e);
|
||||
} catch (SocketTimeoutException | SocketException e) {
|
||||
} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||
|
|
|
@ -65,7 +65,13 @@ public class RunSQLSparkJob {
|
|||
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
|
||||
log.info("executing: {}", statement);
|
||||
long startTime = System.currentTimeMillis();
|
||||
spark.sql(statement).show();
|
||||
try {
|
||||
spark.sql(statement).show();
|
||||
} catch (Exception e) {
|
||||
log.error("Error executing statement: {}", statement, e);
|
||||
System.err.println("Error executing statement: " + statement + "\n" + e);
|
||||
throw e;
|
||||
}
|
||||
log
|
||||
.info(
|
||||
"executed in {}",
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2024.
|
||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
||||
import org.apache.commons.lang3.builder.HashCodeBuilder;
|
||||
|
||||
public class HashableStructuredProperty extends StructuredProperty {
|
||||
|
||||
private static final long serialVersionUID = 8371670185221126045L;
|
||||
|
||||
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
final HashableStructuredProperty sp = new HashableStructuredProperty();
|
||||
sp.setValue(value);
|
||||
sp.setQualifier(qualifier);
|
||||
sp.setDataInfo(dataInfo);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
|
||||
HashableStructuredProperty hsp = new HashableStructuredProperty();
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
hsp.setValue(sp.getValue());
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
return hsp;
|
||||
}
|
||||
|
||||
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
sp.setValue(hsp.getValue());
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
return sp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return new HashCodeBuilder(11, 91)
|
||||
.append(getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid())
|
||||
.append(getValue())
|
||||
.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
if (obj.getClass() != getClass()) {
|
||||
return false;
|
||||
}
|
||||
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
|
||||
return new EqualsBuilder()
|
||||
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
|
||||
.append(getValue(), rhs.getValue())
|
||||
.isEquals();
|
||||
}
|
||||
}
|
|
@ -43,34 +43,4 @@ public class CleaningFunctions {
|
|||
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
pid
|
||||
.setValue(
|
||||
normalizePidValue(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()));
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
public static String normalizePidValue(String pidType, String pidValue) {
|
||||
String value = Optional
|
||||
.ofNullable(pidValue)
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
|
||||
switch (pidType) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,18 +6,11 @@ import org.apache.commons.lang3.StringUtils;
|
|||
public class DoiCleaningRule {
|
||||
|
||||
public static String clean(final String doi) {
|
||||
return doi
|
||||
.toLowerCase()
|
||||
.replaceAll("\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
}
|
||||
|
||||
public static String normalizeDoi(final String input) {
|
||||
if (input == null)
|
||||
if (doi == null)
|
||||
return null;
|
||||
final String replaced = input
|
||||
final String replaced = doi
|
||||
.replaceAll("\\n|\\r|\\t|\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.toLowerCase()
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
if (StringUtils.isEmpty(replaced))
|
||||
|
@ -32,7 +25,6 @@ public class DoiCleaningRule {
|
|||
return null;
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -563,12 +563,24 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(pid -> {
|
||||
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
|
||||
final Set<HashableStructuredProperty> pids = pid
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.ifPresent(altId -> {
|
||||
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
|
||||
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
||||
final Set<HashableStructuredProperty> altIds = altId
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
i
|
||||
.setAlternateIdentifier(
|
||||
Sets
|
||||
.difference(altIds, pids)
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::toStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
|
|||
return entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(
|
||||
Collectors
|
||||
|
@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
|
|||
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||
// given PID Type
|
||||
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
|
||||
.filter(CleaningFunctions::pidFilter))
|
||||
.orElse(Stream.empty());
|
||||
|
|
|
@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator<Oaf> {
|
|||
// id
|
||||
if (res == 0) {
|
||||
if (left instanceof OafEntity && right instanceof OafEntity) {
|
||||
res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
|
||||
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -972,7 +972,7 @@ public class MergeUtils {
|
|||
private static String extractKeyFromPid(final StructuredProperty pid) {
|
||||
if (pid == null)
|
||||
return null;
|
||||
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
|
||||
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid);
|
||||
|
||||
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
|
||||
}
|
||||
|
|
|
@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
|
|||
if (right == null)
|
||||
return -1;
|
||||
|
||||
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
||||
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
||||
StructuredProperty l = PidCleaner.normalizePidValue(left);
|
||||
StructuredProperty r = PidCleaner.normalizePidValue(right);
|
||||
|
||||
return Optional
|
||||
.ofNullable(l.getValue())
|
||||
|
|
|
@ -28,6 +28,7 @@ import com.jayway.jsonpath.JsonPath;
|
|||
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import net.minidev.json.JSONArray;
|
||||
import scala.collection.JavaConverters;
|
||||
import scala.collection.Seq;
|
||||
|
@ -104,7 +105,7 @@ public class DHPUtils {
|
|||
|
||||
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
||||
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
|
||||
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);
|
||||
|
||||
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ class IdentifierFactoryTest {
|
|||
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
||||
|
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
|
|||
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
"value": "PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
{
|
||||
"qualifier":{"classid":"pmc"},
|
||||
"value":"21459329"
|
||||
"value":"PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
|||
// function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
|
||||
// for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
|
|
@ -9,11 +9,8 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||
|
||||
private final boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeStats(boolean ignoreUndefined) {
|
||||
public TreeNodeStats() {
|
||||
this.results = new HashMap<>();
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public Map<String, FieldStats> getResults() {
|
||||
|
@ -25,10 +22,7 @@ public class TreeNodeStats implements Serializable {
|
|||
}
|
||||
|
||||
public int fieldsCount() {
|
||||
if (ignoreUndefined)
|
||||
return this.results.size();
|
||||
else
|
||||
return this.results.size() - undefinedCount(); // do not count undefined
|
||||
return this.results.size();
|
||||
}
|
||||
|
||||
public int undefinedCount() {
|
||||
|
@ -84,22 +78,11 @@ public class TreeNodeStats implements Serializable {
|
|||
double min = 100.0; // random high value
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() < min) {
|
||||
if (fs.getResult() == -1) {
|
||||
if (fs.isCountIfUndefined()) {
|
||||
min = 0.0;
|
||||
} else {
|
||||
min = -1;
|
||||
}
|
||||
} else {
|
||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ignoreUndefined) {
|
||||
return min == -1.0 ? 0.0 : min;
|
||||
} else {
|
||||
return min;
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
// if at least one is true, return 1.0
|
||||
|
@ -108,11 +91,7 @@ public class TreeNodeStats implements Serializable {
|
|||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
if (!ignoreUndefined && undefinedCount() > 0) {
|
||||
return -1.0;
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// if at least one is false, return 0.0
|
||||
|
@ -121,7 +100,7 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return ignoreUndefined ? 0.0 : -1.0;
|
||||
return 0.0;
|
||||
} else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
|
|
|
@ -44,10 +44,12 @@ public class TreeProcessor {
|
|||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
treeStats.addNodeStats(nextNodeName, stats);
|
||||
|
||||
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||
if (finalScore == -1.0)
|
||||
// if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||
nextNodeName = currentNode.getUndefined();
|
||||
else if (finalScore >= currentNode.getThreshold()) {
|
||||
}
|
||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
nextNodeName = currentNode.getPositive();
|
||||
} else {
|
||||
nextNodeName = currentNode.getNegative();
|
||||
|
|
|
@ -6,7 +6,18 @@
|
|||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
<properties>
|
||||
<affro.release.version>1.0.0</affro.release.version>
|
||||
</properties>
|
||||
|
||||
<scm>
|
||||
<url>https://code-repo.d4science.org/mkallipo/affRo</url>
|
||||
<connection>scm:git:https://code-repo.d4science.org/mkallipo/affRo.git</connection>
|
||||
</scm>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
|
@ -43,6 +54,32 @@
|
|||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-scm-plugin</artifactId>
|
||||
<version>1.8.1</version>
|
||||
<configuration>
|
||||
<connectionType>connection</connectionType>
|
||||
<!--
|
||||
<scmVersionType>tag</scmVersionType>--><!-- 'branch' can also be provided here -->
|
||||
<!-- <scmVersion>${affro.release.version}</scmVersion>--><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
|
||||
|
||||
<scmVersionType>branch</scmVersionType><!-- 'tag' can also be provided here -->
|
||||
<scmVersion>openaire-workflow-ready</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
|
||||
<checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/affRo</checkoutDirectory>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>checkout-affro</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>checkout</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
|
|
@ -10,7 +10,6 @@ import java.util.List;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
|
@ -29,6 +28,7 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
@ -44,6 +44,10 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
|
||||
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
|
||||
public static final String DOI_URL_PREFIX = "https://doi.org/";
|
||||
public static final int DOI_URL_PREFIX_LENGTH = 16;
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
|
@ -74,6 +78,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||
|
||||
final String publisherInputPath = parser.get("publisherInputPath");
|
||||
log.info("publisherInputPath: {}", publisherInputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
|
@ -84,43 +91,74 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Constants.removeOutputDir(spark, outputPath);
|
||||
|
||||
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||
spark, crossrefInputPath, collectedFromCrossref);
|
||||
|
||||
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedFromPubmed);
|
||||
|
||||
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||
spark, openapcInputPath, collectedFromOpenAPC);
|
||||
|
||||
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedFromDatacite);
|
||||
|
||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
|
||||
createActionSet(
|
||||
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
|
||||
publisherInputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
|
||||
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
|
||||
String outputPath) {
|
||||
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
|
||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, crossrefInputPath, collectedfromOpenAIRE);
|
||||
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedfromOpenAIRE);
|
||||
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, openapcInputPath, collectedfromOpenAIRE);
|
||||
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedfromOpenAIRE);
|
||||
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedfromOpenAIRE);
|
||||
|
||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
||||
spark, publisherlInputPath, collectedfromOpenAIRE);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.union(publisherRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema(
|
||||
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
|
@ -132,6 +170,24 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df);
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema(
|
||||
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDDNew(collectedfrom, df);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
|
@ -147,7 +203,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
||||
|
||||
// ROR id to OpenAIRE id
|
||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||
|
@ -179,6 +235,69 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
.select(
|
||||
new Column("DOI").as("doi"),
|
||||
new Column("matching.PID").as("pidtype"),
|
||||
new Column("matching.Value").as("pidvalue"),
|
||||
new Column("matching.Confidence").as("confidence"),
|
||||
new Column("matching.Status").as("status"))
|
||||
.where("status = 'active'");
|
||||
|
||||
// prepare action sets for affiliation relations
|
||||
return df
|
||||
.toJavaRDD()
|
||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
||||
|
||||
// Organization to OpenAIRE identifier
|
||||
String affId = null;
|
||||
if (row.getAs("pidtype").equals("ROR"))
|
||||
// ROR id to OpenIARE id
|
||||
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
|
||||
else
|
||||
// getting the OpenOrgs identifier for the organization
|
||||
affId = row.getAs("pidvalue");
|
||||
|
||||
Qualifier qualifier = OafMapperUtils
|
||||
.qualifier(
|
||||
BIP_AFFILIATIONS_CLASSID,
|
||||
BIP_AFFILIATIONS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
// format data info; setting `confidence` into relation's `trust`
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
Double.toString(row.getAs("confidence")));
|
||||
|
||||
// return bi-directional relations
|
||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
||||
|
||||
})
|
||||
.map(p -> new AtomicAction(Relation.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static String removePrefix(String doi) {
|
||||
if (doi.startsWith(DOI_URL_PREFIX))
|
||||
return doi.substring(DOI_URL_PREFIX_LENGTH);
|
||||
return doi;
|
||||
}
|
||||
|
||||
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
||||
DataInfo dataInfo) {
|
||||
return Arrays
|
||||
|
|
|
@ -46,6 +46,9 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String backupPath = parser.get("backupPath");
|
||||
log.info("backupPath {}", backupPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
|
@ -53,11 +56,11 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
|
||||
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
||||
|
||||
ocr.doExtract(inputPath, outputPath, fileSystem);
|
||||
ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
|
||||
|
||||
}
|
||||
|
||||
private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
|
||||
private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
|
||||
throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
|
@ -89,6 +92,7 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
}
|
||||
|
||||
}
|
||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -107,7 +107,8 @@ public class ReadCOCI implements Serializable {
|
|||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
|
||||
|
||||
fileSystem.delete(fileStatus.getPath());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -297,7 +297,7 @@ public class ExtractPerson implements Serializable {
|
|||
}
|
||||
|
||||
private static Relation getAffiliationRelation(Employment row) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
|
||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
|
||||
String target = ROR_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
|
||||
List<KeyValue> properties = new ArrayList<>();
|
||||
|
|
|
@ -46,11 +46,11 @@ public class CollectorWorker extends ReportingJob {
|
|||
private final HttpClientParams clientParams;
|
||||
|
||||
public CollectorWorker(
|
||||
final ApiDescriptor api,
|
||||
final FileSystem fileSystem,
|
||||
final MDStoreVersion mdStoreVersion,
|
||||
final HttpClientParams clientParams,
|
||||
final AggregatorReport report) {
|
||||
final ApiDescriptor api,
|
||||
final FileSystem fileSystem,
|
||||
final MDStoreVersion mdStoreVersion,
|
||||
final HttpClientParams clientParams,
|
||||
final AggregatorReport report) {
|
||||
super(report);
|
||||
this.api = api;
|
||||
this.fileSystem = fileSystem;
|
||||
|
@ -69,22 +69,25 @@ public class CollectorWorker extends ReportingJob {
|
|||
scheduleReport(counter);
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(IntWritable.class), SequenceFile.Writer
|
||||
.valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
.createWriter(
|
||||
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
|
||||
.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer
|
||||
.valueClass(Text.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
plugin
|
||||
.collect(this.api, this.report)
|
||||
.forEach(content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
.collect(this.api, this.report)
|
||||
.forEach(content -> {
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (final Throwable e) {
|
||||
this.report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e);
|
||||
|
@ -112,36 +115,36 @@ public class CollectorWorker extends ReportingJob {
|
|||
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
|
||||
|
||||
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) {
|
||||
case oai:
|
||||
return new OaiCollectorPlugin(this.clientParams);
|
||||
case rest_json2xml:
|
||||
return new RestCollectorPlugin(this.clientParams);
|
||||
case file:
|
||||
return new FileCollectorPlugin(this.fileSystem);
|
||||
case fileGzip:
|
||||
return new FileGZipCollectorPlugin(this.fileSystem);
|
||||
case baseDump:
|
||||
return new BaseCollectorPlugin(this.fileSystem);
|
||||
case gtr2Publications:
|
||||
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
|
||||
case osfPreprints:
|
||||
return new OsfPreprintsCollectorPlugin(this.clientParams);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
case oai:
|
||||
return new OaiCollectorPlugin(this.clientParams);
|
||||
case rest_json2xml:
|
||||
return new RestCollectorPlugin(this.clientParams);
|
||||
case file:
|
||||
return new FileCollectorPlugin(this.fileSystem);
|
||||
case fileGzip:
|
||||
return new FileGZipCollectorPlugin(this.fileSystem);
|
||||
case baseDump:
|
||||
return new BaseCollectorPlugin(this.fileSystem);
|
||||
case gtr2Publications:
|
||||
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
|
||||
case osfPreprints:
|
||||
return new OsfPreprintsCollectorPlugin(this.clientParams);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
.ofNullable(this.api.getParams().get("other_plugin_type"))
|
||||
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
|
||||
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
|
||||
|
||||
switch (plugin) {
|
||||
case mdstore_mongodb_dump:
|
||||
return new MongoDbDumpCollectorPlugin(this.fileSystem);
|
||||
case mdstore_mongodb:
|
||||
return new MDStoreCollectorPlugin();
|
||||
switch (plugin) {
|
||||
case mdstore_mongodb_dump:
|
||||
return new MongoDbDumpCollectorPlugin(this.fileSystem);
|
||||
case mdstore_mongodb:
|
||||
return new MDStoreCollectorPlugin();
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||
}
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
|
||||
}
|
||||
default:
|
||||
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
|
||||
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -31,17 +31,19 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
|
|||
final String baseUrl = api.getBaseUrl();
|
||||
|
||||
final int pageSize = Optional
|
||||
.ofNullable(api.getParams().get("pageSize"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
||||
.ofNullable(api.getParams().get("pageSize"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
||||
|
||||
if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); }
|
||||
if (StringUtils.isBlank(baseUrl)) {
|
||||
throw new CollectorException("Param 'baseUrl' is null or empty");
|
||||
}
|
||||
|
||||
final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());
|
||||
|
||||
return StreamSupport
|
||||
.stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||
.stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
|
|
|
@ -34,9 +34,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||
|
||||
public OsfPreprintsIterator(
|
||||
final String baseUrl,
|
||||
final int pageSize,
|
||||
final HttpClientParams clientParams) {
|
||||
final String baseUrl,
|
||||
final int pageSize,
|
||||
final HttpClientParams clientParams) {
|
||||
|
||||
this.clientParams = clientParams;
|
||||
this.baseUrl = baseUrl;
|
||||
|
@ -54,7 +54,8 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl) && this.currentUrl.startsWith("http")) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
|
||||
&& this.currentUrl.startsWith("http")) {
|
||||
try {
|
||||
this.currentUrl = downloadPage(this.currentUrl);
|
||||
} catch (final CollectorException e) {
|
||||
|
@ -63,7 +64,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
if (!this.recordQueue.isEmpty()) { return true; }
|
||||
if (!this.recordQueue.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -112,7 +115,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
||||
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, url:" + url);
|
||||
}
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties ---
|
||||
# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos
|
||||
# dhp.hadoop.frontend.user.name=ilias.kanellos
|
||||
# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl
|
||||
# dhp.hadoop.frontend.port.ssh=22
|
||||
# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie
|
||||
# jobTracker=yarnRM
|
||||
# nameNode=hdfs://nameservice1
|
||||
# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log
|
||||
# maven.executable=mvn
|
||||
|
||||
|
||||
# The above is given differently in an example I found online
|
||||
oozie.action.sharelib.for.spark=spark2
|
||||
oozieActionShareLibForSpark2=spark2
|
||||
spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089
|
||||
spark2EventLogDir=/user/spark/spark2ApplicationHistory
|
||||
sparkSqlWarehouseDir=/user/hive/warehouse
|
||||
#hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083
|
||||
# This may avoid the "no library used" error
|
||||
oozie.use.system.libpath=true
|
||||
# Some stuff copied from openaire's jobs
|
||||
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
|
||||
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
|
||||
|
||||
# The following is needed as a property of a workflow
|
||||
wfAppPath=${oozieTopWfApplicationPath}
|
||||
|
||||
resumeFrom=Crossref
|
||||
|
||||
#OpenAlex input/output
|
||||
#resultFolder=/tmp/affro-results/oalex
|
||||
#inputFolder=/user/zeppelin/affiliations/raw_aff_string/2024-08
|
||||
|
||||
#Crossref input/output
|
||||
resultFolder=/tmp/affro-results/crossref
|
||||
inputFolder=/data/doiboost/crossref/crossref_unpack
|
||||
|
||||
#
|
||||
#crossrefInputPath=/data/bip-affiliations/crossref-data.json
|
||||
#pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
||||
#openapcInputPath=/data/bip-affiliations/openapc-data.json
|
||||
#dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
||||
#
|
||||
#outputPath=/tmp/crossref-affiliations-output-v5
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,176 @@
|
|||
<workflow-app name="AffroAffiliations" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resumeFrom"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<decision name="resumeFrom">
|
||||
<switch>
|
||||
<case to="run-affro-on-iisdata">${wf:conf('resumeFrom') eq 'IIS'}</case>
|
||||
<case to="run-affro-on-crossref">${wf:conf('resumeFrom') eq 'Crossref'}</case>
|
||||
<default to="run-affro-on-oalexstrings"/>
|
||||
</switch>
|
||||
</decision>
|
||||
<action name="run-affro-on-iisdata">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>update_records.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/affro_test_example.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/update_records.py#update_records.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<action name="run-affro-on-oalexstrings">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>strings.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${inputFolder}</arg>
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/strings.py#strings.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<action name="run-affro-on-crossref">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Affiliations inference (Affro)</name>
|
||||
<jar>crossref.py</jar>
|
||||
|
||||
<spark-opts>
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=15G
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.sql.shuffle.partitions=20000
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=python3
|
||||
--conf spark.executorEnv.PYSPARK_PYTHON=python3
|
||||
--py-files ${wfAppPath}/affRo/affro_cluster.py,${wfAppPath}/affRo/create_input_cluster.py,${wfAppPath}/affRo/functions_cluster.py,${wfAppPath}/affRo/matching_cluster.py
|
||||
--files ${wfAppPath}/affRo/dictionaries/dix_acad.json,${wfAppPath}/affRo/dictionaries/dix_categ.json,${wfAppPath}/affRo/dictionaries/dix_city.json,${wfAppPath}/affRo/dictionaries/dix_country.json,${wfAppPath}/affRo/dictionaries/dix_mult.json,${wfAppPath}/affRo/dictionaries/dix_status.json,${wfAppPath}/affRo/txt_files/city_names.txt,${wfAppPath}/affRo/txt_files/remove_list.txt,${wfAppPath}/affRo/txt_files/stop_words.txt,${wfAppPath}/affRo/txt_files/university_terms.txt
|
||||
</spark-opts>
|
||||
|
||||
<arg>${inputFolder}</arg>
|
||||
<arg>${resultFolder}</arg>
|
||||
|
||||
<file>${wfAppPath}/affRo/crossref.py#crossref.py</file>
|
||||
</spark>
|
||||
|
||||
<ok to="End" />
|
||||
<error to="Kill" />
|
||||
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -28,13 +28,19 @@
|
|||
"paramLongName": "dataciteInputPath",
|
||||
"paramDescription": "the path to get the input data from Datacite",
|
||||
"paramRequired": true
|
||||
},{
|
||||
},
|
||||
{
|
||||
"paramName": "wip",
|
||||
"paramLongName": "webCrawlInputPath",
|
||||
"paramDescription": "the path to get the input data from Web Crawl",
|
||||
"paramRequired": true
|
||||
}
|
||||
,
|
||||
},
|
||||
{
|
||||
"paramName": "pub",
|
||||
"paramLongName": "publisherInputPath",
|
||||
"paramDescription": "the path to get the input data from publishers",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
|
|
|
@ -16,5 +16,11 @@
|
|||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bp",
|
||||
"paramLongName": "backupPath",
|
||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -24,12 +24,13 @@
|
|||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the hdfs path where the output is stored",
|
||||
"paramRequired": true
|
||||
}, {
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -94,17 +94,7 @@
|
|||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/Original</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extract_correspondence">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
|
||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -332,7 +332,7 @@ case object Crossref2Oaf {
|
|||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
//MAPPING Crossref DOI into PID
|
||||
val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
|
||||
val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String])
|
||||
result.setPid(
|
||||
List(
|
||||
structuredProperty(
|
||||
|
@ -504,6 +504,24 @@ case object Crossref2Oaf {
|
|||
)
|
||||
}
|
||||
|
||||
val is_review = json \ "relation" \ "is-review-of" \ "id"
|
||||
|
||||
if (is_review != JNothing) {
|
||||
instance.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0015",
|
||||
"peerReviewed",
|
||||
ModelConstants.DNET_REVIEW_LEVELS,
|
||||
ModelConstants.DNET_REVIEW_LEVELS
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
|
||||
instance.setHostedby(
|
||||
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
|
||||
)
|
||||
|
||||
instance.setAccessright(
|
||||
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
||||
)
|
||||
|
@ -655,7 +673,7 @@ case object Crossref2Oaf {
|
|||
val doi = input.getString(0)
|
||||
val rorId = input.getString(1)
|
||||
|
||||
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
|
||||
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
|
||||
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
||||
|
||||
val r: Relation = new Relation
|
||||
|
|
|
@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
|
|||
)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(
|
||||
a.setRawAffiliationString(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
|
|
|
@ -28,8 +28,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class PrepareAffiliationRelationsTest {
|
||||
|
||||
|
@ -39,8 +39,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
private static Path workingDir;
|
||||
private static final String ID_PREFIX = "50|doi_________::";
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
|
@ -74,21 +73,34 @@ public class PrepareAffiliationRelationsTest {
|
|||
@Test
|
||||
void testMatch() throws Exception {
|
||||
|
||||
String crossrefAffiliationRelationPath = getClass()
|
||||
String crossrefAffiliationRelationPathNew = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||
.getPath();
|
||||
|
||||
String crossrefAffiliationRelationPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json")
|
||||
.getPath();
|
||||
|
||||
String publisherAffiliationRelationPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
||||
.getPath();
|
||||
|
||||
String publisherAffiliationRelationOldPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
|
||||
.getPath();
|
||||
|
||||
String outputPath = workingDir.toString() + "/actionSet";
|
||||
|
||||
PrepareAffiliationRelations
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-crossrefInputPath", crossrefAffiliationRelationPath,
|
||||
"-crossrefInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||
"-publisherInputPath", publisherAffiliationRelationOldPath,
|
||||
"-outputPath", outputPath
|
||||
});
|
||||
|
||||
|
@ -99,13 +111,8 @@ public class PrepareAffiliationRelationsTest {
|
|||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
// for (Relation r : tmp.collect()) {
|
||||
// System.out.println(
|
||||
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
|
||||
// );
|
||||
// }
|
||||
// count the number of relations
|
||||
assertEquals(120, tmp.count());
|
||||
assertEquals(150, tmp.count());// 18 + 24 * 3 + 30 * 2 = 150
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -116,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have an equal number of relations in each direction
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
75, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -124,26 +131,56 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
75, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
.size());
|
||||
|
||||
// check confidence value of a specific relation
|
||||
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
|
||||
String sourceDOI = "10.1089/10872910260066679";
|
||||
|
||||
final String sourceOpenaireId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.7071067812", execVerification
|
||||
"1.0", execVerification
|
||||
.filter(
|
||||
"source='" + sourceOpenaireId + "'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(4));
|
||||
|
||||
final String publisherid = ID_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679"));
|
||||
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891");
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, execVerification
|
||||
.filter(
|
||||
"source = '" + ID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
|
||||
+ "' and target = '" + "20|ror_________::"
|
||||
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, execVerification
|
||||
.filter(
|
||||
"source = '" + ID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
|
||||
+ "' and target = '" + "20|ror_________::"
|
||||
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
|
||||
.count());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class CreateOpenCitationsASTest {
|
||||
|
||||
|
@ -280,17 +281,17 @@ public class CreateOpenCitationsASTest {
|
|||
@Test
|
||||
void testRelationsSourceTargetCouple() throws Exception {
|
||||
final String doi1 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
final String doi2 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
final String doi3 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
final String doi4 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
final String doi5 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
final String doi6 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
|
|
|
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
|
@ -270,17 +271,17 @@ public class CreateTAActionSetTest {
|
|||
@Test
|
||||
void testRelationsSourceTargetCouple() throws Exception {
|
||||
final String doi1 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
final String doi2 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
final String doi3 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
final String doi4 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
final String doi5 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
final String doi6 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
|
|
|
@ -50,9 +50,10 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
@Test
|
||||
@Disabled
|
||||
void test_one() throws CollectorException {
|
||||
this.plugin.collect(this.api, new AggregatorReport())
|
||||
.limit(1)
|
||||
.forEach(log::info);
|
||||
this.plugin
|
||||
.collect(this.api, new AggregatorReport())
|
||||
.limit(1)
|
||||
.forEach(log::info);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -95,7 +96,8 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
final HttpConnector2 connector = new HttpConnector2();
|
||||
|
||||
try {
|
||||
final String res = connector.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
final String res = connector
|
||||
.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
System.out.println(res);
|
||||
fail();
|
||||
} catch (final Throwable e) {
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
||||
{"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]}
|
||||
{"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}
|
|
@ -0,0 +1,9 @@
|
|||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
|
@ -0,0 +1,6 @@
|
|||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, 
"Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
@ -0,0 +1,6 @@
|
|||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, 
"Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03bea9k73", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04agmb972", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
|
|
@ -70,9 +70,8 @@ public class PrepareRelatedProjectsJob {
|
|||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
||||
.filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
.filter((FilterFunction<Relation>) r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType()))
|
||||
.filter((FilterFunction<Relation>) r -> !BrokerConstants.IS_MERGED_IN_CLASS.equals(r.getRelClass()))
|
||||
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
|||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getPids()
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.filter(s -> !existingSubjects.contains(subjectAsString(s)))
|
||||
.collect(Collectors.toList());
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
|
||||
public class EnrichMoreSubjectTest {
|
||||
|
||||
final EnrichMoreSubject matcher = new EnrichMoreSubject();
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_1() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_2() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
||||
assertEquals(1, list.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_3() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_4() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
||||
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
|
||||
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
}
|
|
@ -313,7 +313,7 @@ case object ConversionUtil {
|
|||
if (f.author.DisplayName.isDefined)
|
||||
a.setFullname(f.author.DisplayName.get)
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
a.setRawAffiliationString(List(f.affiliation).asJava)
|
||||
a.setPid(
|
||||
List(
|
||||
createSP(
|
||||
|
@ -386,7 +386,7 @@ case object ConversionUtil {
|
|||
a.setFullname(f.author.DisplayName.get)
|
||||
|
||||
if (f.affiliation != null)
|
||||
a.setAffiliation(List(asField(f.affiliation)).asJava)
|
||||
a.setRawAffiliationString(List(f.affiliation).asJava)
|
||||
|
||||
a.setPid(
|
||||
List(
|
||||
|
|
|
@ -9,10 +9,7 @@ import java.util.Optional;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -25,8 +22,6 @@ public class GraphHiveTableImporterJob {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -74,7 +69,12 @@ public class GraphHiveTableImporterJob {
|
|||
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
|
||||
Class<T> clazz, int numPartitions) {
|
||||
|
||||
Dataset<String> dataset = spark.read().textFile(inputPath);
|
||||
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
|
||||
|
||||
Dataset<Row> dataset = spark
|
||||
.read()
|
||||
.schema(clazzEncoder.schema())
|
||||
.json(inputPath);
|
||||
|
||||
if (numPartitions > 0) {
|
||||
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
|
||||
|
@ -82,7 +82,6 @@ public class GraphHiveTableImporterJob {
|
|||
}
|
||||
|
||||
dataset
|
||||
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(tableIdentifier(hiveDbName, clazz));
|
||||
|
|
|
@ -55,29 +55,7 @@ import eu.dnetlib.dhp.common.Constants;
|
|||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.EoscIfGuidelines;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.InstanceTypeMapping;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
|
@ -667,22 +645,25 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return this.vocs.getTermAsQualifier(schemeId, classId);
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructPropsWithValidQualifier(
|
||||
protected List<HashableStructuredProperty> prepareListStructPropsWithValidQualifier(
|
||||
final Node node,
|
||||
final String xpath,
|
||||
final String xpathClassId,
|
||||
final String schemeId,
|
||||
final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
final Set<HashableStructuredProperty> res = new HashSet<>();
|
||||
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
final String classId = n.valueOf(xpathClassId).trim();
|
||||
if (this.vocs.termExists(schemeId, classId)) {
|
||||
res.add(structuredProperty(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
|
||||
res
|
||||
.add(
|
||||
HashableStructuredProperty
|
||||
.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
return Lists.newArrayList(res);
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(
|
||||
|
|
|
@ -25,6 +25,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -380,7 +381,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
return prepareListStructPropsWithValidQualifier(
|
||||
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
|
||||
.stream()
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ import eu.dnetlib.dhp.schema.common.RelationInverse;
|
|||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -93,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
|
||||
}
|
||||
|
||||
author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
|
||||
author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
|
||||
author.setPid(preparePids(n, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
|
@ -504,7 +505,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
||||
final Set<StructuredProperty> res = new HashSet<>();
|
||||
final Set<HashableStructuredProperty> res = new HashSet<>();
|
||||
res
|
||||
.addAll(
|
||||
prepareListStructPropsWithValidQualifier(
|
||||
|
@ -524,7 +525,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
return res
|
||||
.stream()
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -73,14 +73,10 @@ public class GraphHiveImporterJobTest {
|
|||
GraphHiveImporterJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
||||
"-hiveMetastoreUris",
|
||||
"",
|
||||
"-hiveDbName",
|
||||
dbName
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
|
||||
"--hiveMetastoreUris", "",
|
||||
"--hiveDbName", dbName
|
||||
});
|
||||
|
||||
ModelSupport.oafTypes
|
||||
|
|
|
@ -388,7 +388,7 @@ public class CleanGraphSparkJobTest {
|
|||
.collect(Collectors.toList());
|
||||
|
||||
assertNotNull(fos_subjects);
|
||||
assertEquals(2, fos_subjects.size());
|
||||
assertEquals(3, fos_subjects.size());
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
|
@ -396,18 +396,10 @@ public class CleanGraphSparkJobTest {
|
|||
.anyMatch(
|
||||
s -> "0101 mathematics".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"sysimport:crosswalk:datasetarchive"
|
||||
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
|
||||
verify_keyword(p, "In Situ Hybridization");
|
||||
verify_keyword(p, "Avicennia");
|
||||
verify_keyword(p, "FOS: Mathematics");
|
||||
verify_keyword(p, "FOS: Computer and information sciences");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -266,7 +266,7 @@ public class GraphCleaningFunctionsTest {
|
|||
.collect(Collectors.toList());
|
||||
|
||||
assertNotNull(fos_subjects);
|
||||
assertEquals(2, fos_subjects.size());
|
||||
assertEquals(3, fos_subjects.size());
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
|
@ -274,18 +274,18 @@ public class GraphCleaningFunctionsTest {
|
|||
.anyMatch(
|
||||
s -> "0101 mathematics".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"sysimport:crosswalk:datasetarchive"
|
||||
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
|
||||
|
||||
verify_keyword(p_cleaned, "In Situ Hybridization");
|
||||
verify_keyword(p_cleaned, "Avicennia");
|
||||
verify_keyword(p_cleaned, "FOS: Computer and information sciences");
|
||||
verify_keyword(p_cleaned, "FOS: Mathematics");
|
||||
|
||||
// TODO add more assertions to verity the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_cleaned));
|
||||
|
|
|
@ -44,7 +44,7 @@ class GenerateEntitiesApplicationTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testMergeResult() throws IOException, DocumentException {
|
||||
void testMergeResult() throws IOException {
|
||||
Result publication = getResult("oaf_record.xml", Publication.class);
|
||||
Result dataset = getResult("odf_dataset.xml", Dataset.class);
|
||||
Result software = getResult("odf_software.xml", Software.class);
|
||||
|
@ -69,15 +69,15 @@ class GenerateEntitiesApplicationTest {
|
|||
verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
|
||||
}
|
||||
|
||||
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
||||
protected <T extends Result> void verifyMerge(Result r1, Result r2, Class<T> clazz,
|
||||
String resultType) {
|
||||
final Result merge = (Result) MergeUtils.merge(publication, dataset);
|
||||
final Result merge = MergeUtils.checkedMerge(r1, r2, true);
|
||||
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
||||
assertEquals(resultType, merge.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz)
|
||||
throws IOException, DocumentException {
|
||||
throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
|
||||
return new OdfToOafMapper(vocs, false, true)
|
||||
.processMdRecord(xml)
|
||||
|
|
|
@ -216,7 +216,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testPublication_PubMed() throws IOException, DocumentException {
|
||||
void testPublication_PubMed() throws IOException {
|
||||
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record_pubmed.xml")));
|
||||
|
@ -264,8 +264,17 @@ class MappersTest {
|
|||
|
||||
assertFalse(p.getSubject().isEmpty());
|
||||
assertFalse(p.getPid().isEmpty());
|
||||
assertEquals("PMC1517292", p.getPid().get(0).getValue());
|
||||
assertEquals("pmc", p.getPid().get(0).getQualifier().getClassid());
|
||||
|
||||
assertTrue(p.getPid().stream().anyMatch(pi -> "pmc".equals(pi.getQualifier().getClassid())));
|
||||
assertEquals(
|
||||
"PMC1517292",
|
||||
p
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pi -> "pmc".equals(pi.getQualifier().getClassid()))
|
||||
.findFirst()
|
||||
.get()
|
||||
.getValue());
|
||||
|
||||
assertNotNull(p.getInstance());
|
||||
assertFalse(p.getInstance().isEmpty());
|
||||
|
@ -292,7 +301,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testPublicationInvisible() throws IOException, DocumentException {
|
||||
void testPublicationInvisible() throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
|
||||
|
||||
|
@ -307,6 +316,25 @@ class MappersTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPublicationInvisible_BASE() throws IOException {
|
||||
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record_base.xml")));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, true, true).processMdRecord(xml);
|
||||
|
||||
assertFalse(list.isEmpty());
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
|
||||
assertTrue(p.getDataInfo().getInvisible());
|
||||
|
||||
System.out.println(new ObjectMapper().writeValueAsString(p));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testOdfFwfEBookLibrary() throws IOException {
|
||||
final String xml = IOUtils
|
||||
|
@ -318,7 +346,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testDataset() throws IOException, DocumentException {
|
||||
void testDataset() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_dataset.xml")));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -332,19 +360,19 @@ class MappersTest {
|
|||
final Relation r1 = (Relation) list.get(1);
|
||||
final Relation r2 = (Relation) list.get(2);
|
||||
|
||||
assertEquals(d.getId(), r1.getSource());
|
||||
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getTarget());
|
||||
assertEquals(d.getId(), r1.getTarget());
|
||||
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getSource());
|
||||
assertEquals(ModelConstants.RESULT_PROJECT, r1.getRelType());
|
||||
assertEquals(ModelConstants.OUTCOME, r1.getSubRelType());
|
||||
assertEquals(ModelConstants.IS_PRODUCED_BY, r1.getRelClass());
|
||||
assertEquals(ModelConstants.PRODUCES, r1.getRelClass());
|
||||
assertTrue(r1.getValidated());
|
||||
assertEquals("2020-01-01", r1.getValidationDate());
|
||||
|
||||
assertEquals(d.getId(), r2.getTarget());
|
||||
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getSource());
|
||||
assertEquals(d.getId(), r2.getSource());
|
||||
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getTarget());
|
||||
assertEquals(ModelConstants.RESULT_PROJECT, r2.getRelType());
|
||||
assertEquals(ModelConstants.OUTCOME, r2.getSubRelType());
|
||||
assertEquals(ModelConstants.PRODUCES, r2.getRelClass());
|
||||
assertEquals(ModelConstants.IS_PRODUCED_BY, r2.getRelClass());
|
||||
assertTrue(r2.getValidated());
|
||||
assertEquals("2020-01-01", r2.getValidationDate());
|
||||
|
||||
|
@ -378,15 +406,15 @@ class MappersTest {
|
|||
assertEquals("Baracchini", author.get().getSurname());
|
||||
assertEquals("Theo", author.get().getName());
|
||||
|
||||
assertEquals(1, author.get().getAffiliation().size());
|
||||
final Optional<Field<String>> opAff = author
|
||||
assertEquals(1, author.get().getRawAffiliationString().size());
|
||||
final Optional<String> opAff = author
|
||||
.get()
|
||||
.getAffiliation()
|
||||
.getRawAffiliationString()
|
||||
.stream()
|
||||
.findFirst();
|
||||
assertTrue(opAff.isPresent());
|
||||
final Field<String> affiliation = opAff.get();
|
||||
assertEquals("ISTI-CNR", affiliation.getValue());
|
||||
final String affiliation = opAff.get();
|
||||
assertEquals("ISTI-CNR", affiliation);
|
||||
|
||||
assertFalse(d.getSubject().isEmpty());
|
||||
assertFalse(d.getInstance().isEmpty());
|
||||
|
@ -450,7 +478,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testOdfBielefeld() throws IOException, DocumentException {
|
||||
void testOdfBielefeld() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_bielefeld.xml")));
|
||||
|
||||
|
@ -501,7 +529,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testOpentrial() throws IOException, DocumentException {
|
||||
void testOpentrial() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_opentrial.xml")));
|
||||
|
||||
|
@ -741,7 +769,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testSoftware() throws IOException, DocumentException {
|
||||
void testSoftware() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_software.xml")));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -763,22 +791,21 @@ class MappersTest {
|
|||
final Relation r1 = (Relation) list.get(1);
|
||||
final Relation r2 = (Relation) list.get(2);
|
||||
|
||||
assertEquals(s.getId(), r1.getSource());
|
||||
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getTarget());
|
||||
assertEquals(s.getId(), r1.getTarget());
|
||||
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getSource());
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r1.getRelType());
|
||||
assertEquals(ModelConstants.RELATIONSHIP, r1.getSubRelType());
|
||||
assertEquals(ModelConstants.IS_REFERENCED_BY, r1.getRelClass());
|
||||
assertEquals(ModelConstants.REFERENCES, r1.getRelClass());
|
||||
|
||||
assertEquals(s.getId(), r2.getTarget());
|
||||
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getSource());
|
||||
assertEquals(s.getId(), r2.getSource());
|
||||
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getTarget());
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r2.getRelType());
|
||||
assertEquals(ModelConstants.RELATIONSHIP, r2.getSubRelType());
|
||||
assertEquals(ModelConstants.REFERENCES, r2.getRelClass());
|
||||
|
||||
assertEquals(ModelConstants.IS_REFERENCED_BY, r2.getRelClass());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testClaimDedup() throws IOException, DocumentException {
|
||||
void testClaimDedup() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_dedup.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -792,7 +819,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testNakala() throws IOException, DocumentException {
|
||||
void testNakala() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_nakala.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -820,7 +847,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testEnermaps() throws IOException, DocumentException {
|
||||
void testEnermaps() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("enermaps.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -845,7 +872,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testClaimFromCrossref() throws IOException, DocumentException {
|
||||
void testClaimFromCrossref() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_crossref.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -862,7 +889,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testODFRecord() throws IOException, DocumentException {
|
||||
void testODFRecord() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
|
@ -882,7 +909,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testTextGrid() throws IOException, DocumentException {
|
||||
void testTextGrid() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("textgrid.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -916,7 +943,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testBologna() throws IOException, DocumentException {
|
||||
void testBologna() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf-bologna.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -933,7 +960,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testJairo() throws IOException, DocumentException {
|
||||
void testJairo() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_jairo.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -971,7 +998,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testZenodo() throws IOException, DocumentException {
|
||||
void testZenodo() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -1016,7 +1043,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testOdfFromHdfs() throws IOException, DocumentException {
|
||||
void testOdfFromHdfs() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_from_hdfs.xml")));
|
||||
|
||||
|
@ -1065,7 +1092,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testXMLEncodedURL() throws IOException, DocumentException {
|
||||
void testXMLEncodedURL() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
|
@ -1081,7 +1108,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testXMLEncodedURL_ODF() throws IOException, DocumentException {
|
||||
void testXMLEncodedURL_ODF() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
@ -1245,7 +1272,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testRiunet() throws IOException, DocumentException {
|
||||
void testRiunet() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("riunet.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
|
@ -1291,7 +1318,7 @@ class MappersTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testIRISPub() throws IOException, DocumentException {
|
||||
void testIRISPub() throws IOException {
|
||||
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("iris-odf.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
System.out.println("***************");
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -794,28 +794,6 @@
|
|||
},
|
||||
"value": "FOS: Computer and information sciences"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
|
@ -831,8 +809,8 @@
|
|||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"classid": "FOS",
|
||||
"classname": "Fields of Science and Technology classification",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
|
@ -910,8 +888,8 @@
|
|||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:actionset",
|
||||
"classname": "Harvested",
|
||||
"classid": "subject:fos",
|
||||
"classname": "subject:fos",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
|
@ -923,7 +901,7 @@
|
|||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Avicennia"
|
||||
"value": "0102 computer and information sciences"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<dri:objIdentifier>base_oa_____::7ecf1ef502253efffe203ca9a22bb9f1</dri:objIdentifier>
|
||||
<identifier>ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</identifier>
|
||||
<datestamp>2020-12-22T10:30:27Z</datestamp>
|
||||
<dr:dateOfTransformation>2024-09-10T17:21:36.972Z</dr:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<datacite:resource>
|
||||
<datacite:identifier identifierType="DOI">https://doi.org/10.1016/j.envint.2014.07.004</datacite:identifier>
|
||||
<datacite:alternateIdentifiers>
|
||||
<datacite:identifier alternateIdentifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</datacite:identifier>
|
||||
<datacite:identifier alternateIdentifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</datacite:identifier>
|
||||
</datacite:alternateIdentifiers>
|
||||
<datacite:relatedIdentifiers/>
|
||||
<datacite:resourceType>Article contribution</datacite:resourceType>
|
||||
<datacite:titles>
|
||||
<datacite:title>The role of environmental factors in the spatial distribution of Japanese encephalitis in mainland China</datacite:title>
|
||||
</datacite:titles>
|
||||
<datacite:creators>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Wang, Liya</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Hu, Wenbiao</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Soares Magalhaes, Ricardo J.</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Bi, Peng</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Ding, Fan</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Sun, Hailong</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Li, Shenlong</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Yin, Wenwu</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Wei, Lan</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Liu, Qiyong</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Haque, Ubydul</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Sun, Yansong</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Huang, Liuyu</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Tong, Shilu</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Clements, Archie C.A.</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Zhang, Wenyi</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>Li, Chengyi</datacite:creatorName>
|
||||
</datacite:creator>
|
||||
</datacite:creators>
|
||||
<datacite:contributors/>
|
||||
<datacite:descriptions>
|
||||
<datacite:description descriptionType="Abstract">Japanese encephalitis (JE) is the most common cause of viral encephalitis and an important public health concern in the Asia-Pacific region, particularly in China where 50% of global cases are notified. To explore the association between environmental factors and human JE cases and identify the high risk areas for JE transmission in China, we used annual notified data on JE cases at the center of administrative township and environmental variables with a pixel resolution of 1. km. ×. 1. km from 2005 to 2011 to construct models using ecological niche modeling (ENM) approaches based on maximum entropy. These models were then validated by overlaying reported human JE case localities from 2006 to 2012 onto each prediction map. ENMs had good discriminatory ability with the area under the curve (AUC) of the receiver operating curve (ROC) of 0.82-0.91, and low extrinsic omission rate of 5.44-7.42%. Resulting maps showed JE being presented extensively throughout southwestern and central China, with local spatial variations in probability influenced by minimum temperatures, human population density, mean temperatures, and elevation, with contribution of 17.94%-38.37%, 15.47%-21.82%, 3.86%-21.22%, and 12.05%-16.02%, respectively. Approximately 60% of JE cases occurred in predicted high risk areas, which covered less than 6% of areas in mainland China. Our findings will help inform optimal geographical allocation of the limited resources available for JE prevention and control in China, find hidden high-risk areas, and increase the effectiveness of public health interventions against JE transmission.</datacite:description>
|
||||
</datacite:descriptions>
|
||||
<datacite:subjects>
|
||||
<datacite:subject>Japanese encephalitis</datacite:subject>
|
||||
<datacite:subject>Ecological niche model</datacite:subject>
|
||||
<datacite:subject>MaxEnt</datacite:subject>
|
||||
<datacite:subject>China</datacite:subject>
|
||||
<datacite:subject>2300 Environmental Science</datacite:subject>
|
||||
<datacite:subject classificationCode="950" subjectScheme="ddc">950</datacite:subject>
|
||||
</datacite:subjects>
|
||||
<datacite:publisher>Pergamon Press</datacite:publisher>
|
||||
<datacite:publicationYear>2014</datacite:publicationYear>
|
||||
<datacite:formats/>
|
||||
<datacite:language>eng</datacite:language>
|
||||
<oaf:accessrights/>
|
||||
</datacite:resource>
|
||||
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||
<oaf:accessrights>UNKNOWN</oaf:accessrights>
|
||||
<oaf:identifier identifierType="doi">10.1163/qwerty</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">0.1163/18763308-90001038</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">https://doi.org/10.1016/j.envint.2014.07.004</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">https://doi.org/10.1080/09672567.2013.792375</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">http://doi.org/10.1080/08673487.2012.812376</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">http://dx.doi.org/10.1090/08673487.2012.812376</oaf:identifier>
|
||||
<oaf:identifier identifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</oaf:identifier>
|
||||
<oaf:identifier identifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</oaf:identifier>
|
||||
<oaf:hostedBy name="The University of Queensland: UQ eSpace" id="opendoar____::575"/>
|
||||
<oaf:collectedFrom name="Bielefeld Academic Search Engine (BASE)"
|
||||
id="openaire____::base_search"/>
|
||||
<oaf:dateAccepted>2014-12-01</oaf:dateAccepted>
|
||||
<oaf:relation relClass="hasAuthorInstitution"
|
||||
relType="resultOrganization"
|
||||
subRelType="affiliation"
|
||||
targetType="organization">ror_________::https://ror.org/00rqy9422</oaf:relation>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.89</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
|
||||
classname="sysimport:crosswalk:aggregator"
|
||||
schemeid="dnet:provenanceActions"
|
||||
schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</metadata>
|
||||
</record>
|
|
@ -130,5 +130,10 @@
|
|||
"value": [
|
||||
"Pippo", "Foo"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "typology",
|
||||
"type": "string",
|
||||
"value": "Government"
|
||||
}
|
||||
]
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -31,5 +31,11 @@ class ORCIDAuthorMatchersTest {
|
|||
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
|
||||
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
|
||||
}
|
||||
@Test def testDocumentationNames(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
|
||||
}
|
||||
|
||||
@Test def testDocumentationNames2(): Unit = {
|
||||
assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones"))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,12 +4,13 @@ import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
|
|||
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.junit.jupiter.api.{Disabled, Test}
|
||||
import org.objenesis.strategy.StdInstantiatorStrategy
|
||||
|
||||
class ScholixGenerationTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
def generateScholix(): Unit = {
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
|
||||
|
|
|
@ -175,7 +175,8 @@ public class XmlSerializationUtils {
|
|||
.append("<")
|
||||
.append(name)
|
||||
.append(" ")
|
||||
.append(attr(measure.getId(), kv.getValue()))
|
||||
.append(attr("id", measure.getId()))
|
||||
.append(attr("score", kv.getValue()))
|
||||
.append(attr("datasource", kv.getKey()))
|
||||
.append(" />");
|
||||
}
|
||||
|
|
|
@ -8,6 +8,26 @@
|
|||
}
|
||||
],
|
||||
"measures": [
|
||||
{
|
||||
"id": "views",
|
||||
"unit": [
|
||||
{
|
||||
"key": "opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO",
|
||||
"value": "5",
|
||||
"dataInfo": null
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "downloads",
|
||||
"unit": [
|
||||
{
|
||||
"key": "opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO",
|
||||
"value": "2",
|
||||
"dataInfo": null
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "influence",
|
||||
"unit": [
|
||||
|
|
|
@ -32,7 +32,7 @@ select distinct * from (
|
|||
from SOURCE.result r
|
||||
join SOURCE.result_projects rp on rp.id=r.id
|
||||
join SOURCE.project p on p.id=rp.project
|
||||
join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
|
||||
join TARGET.irish_funders irf on irf.funder=p.funder
|
||||
union all
|
||||
select r.*
|
||||
from SOURCE.result r
|
||||
|
|
|
@ -1,79 +1,3 @@
|
|||
--drop database if exists TARGET cascade;
|
||||
--create database if not exists TARGET;
|
||||
--
|
||||
--create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
--create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
--create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
--create view if not exists TARGET.country as select * from SOURCE.country;
|
||||
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
|
||||
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
|
||||
--create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
|
||||
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
|
||||
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
|
||||
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
|
||||
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
|
||||
--
|
||||
--create table TARGET.result stored as parquet as
|
||||
-- select distinct * from (
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
|
||||
-- union all
|
||||
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
|
||||
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
|
||||
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
|
||||
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
|
||||
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
|
||||
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
|
||||
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
|
||||
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
|
||||
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
|
||||
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
|
||||
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
|
||||
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
|
||||
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
|
||||
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
|
||||
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
|
||||
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
|
||||
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
|
||||
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
|
||||
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
|
||||
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
|
||||
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
|
||||
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
|
||||
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
|
||||
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
|
||||
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
|
||||
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
|
||||
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
|
||||
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
|
||||
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
|
||||
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
|
||||
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
|
||||
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
|
||||
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
|
||||
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
|
||||
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
|
||||
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
|
||||
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
|
||||
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
|
||||
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
|
||||
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
|
||||
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
|
||||
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
|
||||
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
|
||||
-- ) )) foo;
|
||||
--
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
|
|
|
@ -81,7 +81,17 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
||||
|
|
|
@ -61,7 +61,17 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
))) foo;
|
||||
|
||||
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
|
|
@ -0,0 +1,18 @@
|
|||
# Install the whole "dnet-hadoop" project.
|
||||
|
||||
# Delete this module's previous build-files in order to avoid any conflicts.
|
||||
rm -rf target/ ||
|
||||
|
||||
# Go to the root directory of this project.
|
||||
cd ../../
|
||||
|
||||
# Select the build profile.
|
||||
DEFAULT_PROFILE='' # It's the empty profile.
|
||||
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
|
||||
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
|
||||
|
||||
# Install the project.
|
||||
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
|
||||
|
||||
# We skip tests for all modules, since they take a large amount of time and some of them fail.
|
||||
# Any test added to this module will be executed by the "runOozieWorkflow.sh" script.
|
|
@ -0,0 +1,20 @@
|
|||
# This script deploys and runs the oozie workflow on the cluster, defined in the "~/.dhp/application.properties" file.
|
||||
|
||||
# Select the build profile.
|
||||
DEFAULT_PROFILE='' # It's the empty profile.
|
||||
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
|
||||
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
|
||||
|
||||
# Build and deploy this module.
|
||||
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
|
||||
-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
|
||||
|
||||
# Show the Oozie-job-ID.
|
||||
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
|
||||
cat ./target/extract-and-run-on-remote-host.log
|
||||
|
||||
# Check oozie workflow status
|
||||
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
|
||||
|
||||
# Get the <job-ID> from the previous output and check the logs:
|
||||
# yarn logs -applicationId application_<job-ID>
|
|
@ -1,8 +1,10 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
-- Stats database creation
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
|
||||
DROP database IF EXISTS ${stats_db_name} CASCADE;
|
||||
CREATE database ${stats_db_name};
|
||||
DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
|
||||
CREATE database ${stats_db_name}; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
|
||||
|
@ -5,27 +7,27 @@
|
|||
------------------------------------------------------------------------------------------------
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.fundref;
|
||||
FROM ${external_stats_db_name}.fundref; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.country AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.country;
|
||||
FROM ${external_stats_db_name}.country; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.countrygdp;
|
||||
FROM ${external_stats_db_name}.countrygdp; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.roarmap;
|
||||
FROM ${external_stats_db_name}.roarmap; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.rndexpediture;
|
||||
FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.licenses_normalized;
|
||||
FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
|
@ -33,23 +35,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
|
|||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
create or replace view ${stats_db_name}.usage_stats as
|
||||
select * from openaire_prod_usage_stats.usage_stats;
|
||||
select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.downloads_stats as
|
||||
select * from openaire_prod_usage_stats.downloads_stats;
|
||||
select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.pageviews_stats as
|
||||
select * from openaire_prod_usage_stats.pageviews_stats;
|
||||
select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.views_stats as
|
||||
select * from openaire_prod_usage_stats.views_stats;
|
||||
select * from openaire_prod_usage_stats.views_stats; /*EOS*/
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
-- Creation date of the database
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.creation_date STORED AS PARQUET as
|
||||
select date_format(current_date(), 'dd-MM-yyyy') as date;
|
||||
select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/
|
||||
|
|
|
@ -1,110 +1,11 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
-- Post processing - Updates on main tables
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
|
||||
--Datasource temporary table updates
|
||||
UPDATE ${stats_db_name}.datasource_tmp
|
||||
SET harvested='true'
|
||||
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
|
||||
FROM ${stats_db_name}.datasource_tmp d,
|
||||
${stats_db_name}.result_datasources rd
|
||||
WHERE d.id = rd.datasource);
|
||||
|
||||
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
|
||||
UPDATE ${stats_db_name}.project_tmp
|
||||
SET haspubs='yes'
|
||||
WHERE project_tmp.id IN (SELECT pr.id
|
||||
FROM ${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.result r
|
||||
WHERE pr.result = r.id
|
||||
AND r.type = 'publication');
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project stored as parquet as
|
||||
SELECT p.id,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.funder,
|
||||
p.funding_lvl0,
|
||||
p.funding_lvl1,
|
||||
p.funding_lvl2,
|
||||
p.ec39,
|
||||
p.type,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.start_year,
|
||||
p.end_year,
|
||||
p.duration,
|
||||
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
|
||||
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
|
||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
|
||||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
|
||||
p.callidentifier,
|
||||
p.code,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
p.currency
|
||||
FROM ${stats_db_name}.project_tmp p
|
||||
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
|
||||
FROM ${stats_db_name}.project_results pr
|
||||
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
|
||||
WHERE r.type = 'publication'
|
||||
GROUP BY pr.id) AS prr1 on prr1.id = p.id
|
||||
LEFT JOIN (SELECT pp.id,
|
||||
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
|
||||
count(distinct r.id) AS dp
|
||||
FROM ${stats_db_name}.project_tmp pp,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.result r
|
||||
WHERE pp.id = pr.id
|
||||
AND pr.result = r.id
|
||||
AND r.type = 'publication'
|
||||
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
|
||||
GROUP BY pp.id) AS prr2
|
||||
ON prr2.id = p.id;
|
||||
|
||||
UPDATE ${stats_db_name}.publication_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE publication_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.dataset_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE dataset_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.software_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE software_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
UPDATE ${stats_db_name}.otherresearchproduct_tmp
|
||||
SET delayed = 'yes'
|
||||
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
|
||||
FROM ${stats_db_name}.result r,
|
||||
${stats_db_name}.project_results pr,
|
||||
${stats_db_name}.project_tmp p
|
||||
WHERE r.id = pr.result
|
||||
AND pr.id = p.id
|
||||
AND to_date(r.date) - to_date(p.enddate) > 0);
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
|
||||
SELECT result_projects.id AS result,
|
||||
result_projects.project AS project_results,
|
||||
|
@ -116,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
|
|||
${stats_db_name}.project
|
||||
WHERE result_projects.id = result.id
|
||||
AND result.type = 'publication'
|
||||
AND project.id = result_projects.project;
|
||||
AND project.id = result_projects.project; /*EOS*/
|
|
@ -1,42 +1,4 @@
|
|||
------------------------------------------------------------------------------------------------------
|
||||
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
|
||||
------------------------------------------------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.datasource_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.publication_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.dataset_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.software_tmp;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_tmp;
|
||||
|
||||
DROP TABLE ${stats_db_name}.project_tmp;
|
||||
DROP TABLE ${stats_db_name}.datasource_tmp;
|
||||
DROP TABLE ${stats_db_name}.publication_tmp;
|
||||
DROP TABLE ${stats_db_name}.dataset_tmp;
|
||||
DROP TABLE ${stats_db_name}.software_tmp;
|
||||
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------
|
||||
-- Re-creating views from final parquet tables
|
||||
|
@ -54,4 +16,4 @@ SELECT *, bestlicence AS access_mode
|
|||
FROM ${stats_db_name}.dataset
|
||||
UNION ALL
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.otherresearchproduct;
|
||||
FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
@ -5,10 +7,10 @@
|
|||
-- Sources related tables/views
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -16,12 +18,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -29,12 +31,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -42,12 +44,12 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
|
||||
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
|
||||
|
@ -55,7 +57,7 @@ LEFT OUTER JOIN
|
|||
(
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
|
||||
SELECT * FROM ${stats_db_name}.publication_sources
|
||||
|
@ -64,24 +66,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
|
|||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.software_sources
|
||||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
|
||||
select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
|
||||
select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
|
||||
from (
|
||||
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
|
||||
FROM ${openaire_db_name}.result res
|
||||
LATERAL VIEW explode(author) a as auth
|
||||
LATERAL VIEW explode(auth.pid) ap as auth_pid
|
||||
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
|
||||
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
|
||||
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
|
||||
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
|
||||
select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -91,12 +93,12 @@ where reltype='resultResult'
|
|||
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
|
||||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
|
||||
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -108,12 +110,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
|
|||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(target, 4);
|
||||
group by substr(target, 4); /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
|
||||
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
join ${openaire_db_name}.result r2 on r2.id=rel.target
|
||||
|
@ -125,4 +127,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
|
|||
and r1.resulttype.classname != 'other'
|
||||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(source, 4);
|
||||
group by substr(source, 4); /*EOS*/
|
|
@ -1,4 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
@ -6,33 +7,33 @@ set mapred.job.queue.name=analytics;
|
|||
-- Licences related tables/views
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, licenses.value as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
|
||||
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
|
||||
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
|
||||
SELECT * FROM ${stats_db_name}.publication_licenses
|
||||
|
@ -41,29 +42,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
|
|||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.software_licenses
|
||||
UNION ALL
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
|
||||
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
|
||||
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
|
||||
select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
|
||||
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
|
||||
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
|
||||
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
|
||||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
|
||||
select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||
select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
|
||||
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
|
||||
WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics;
|
|||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -18,15 +18,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -36,15 +36,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -54,15 +54,15 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
|
||||
with peer_reviewed as (
|
||||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
|
@ -72,13 +72,13 @@ non_peer_reviewed as (
|
|||
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
|
||||
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
|
||||
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
|
||||
select distinct *
|
||||
select /*+ COALESCE(100) */ distinct *
|
||||
from (
|
||||
select peer_reviewed.* from peer_reviewed
|
||||
union all
|
||||
select non_peer_reviewed.* from non_peer_reviewed
|
||||
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
|
||||
where peer_reviewed.id is null) pr;
|
||||
where peer_reviewed.id is null) pr; /*EOS*/
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
|
||||
select * from ${stats_db_name}.publication_refereed
|
||||
|
@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
|
|||
union all
|
||||
select * from ${stats_db_name}.software_refereed
|
||||
union all
|
||||
select * from ${stats_db_name}.otherresearchproduct_refereed;
|
||||
select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
|
||||
select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
|
||||
select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
|
||||
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
|
||||
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
|
||||
where measures_ids.id!='views' and measures_ids.id!='downloads';
|
||||
where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
|
||||
select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
|
||||
select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
|
||||
cast(rel.properties[0].value as double) apc_amount,
|
||||
rel.properties[1].value apc_currency
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.organization o on o.id=rel.source
|
||||
join ${openaire_db_name}.result r on r.id=rel.target
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
|
||||
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/
|
||||
|
|
|
@ -1,27 +1,27 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
-------------------------------------------
|
||||
--- Extra tables, mostly used by indicators
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
|
||||
select r.id, count(distinct p.id) as count
|
||||
select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||
group by r.id;
|
||||
group by r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
|
||||
select r.id, count(distinct p.funder) as count
|
||||
select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||
group by r.id;
|
||||
group by r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
|
||||
with rcount as (
|
||||
|
@ -30,39 +30,39 @@ with rcount as (
|
|||
left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
|
||||
left outer join ${stats_db_name}.result r on r.id=rp.id
|
||||
group by r.type, p.id )
|
||||
select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
|
||||
select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
|
||||
sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
|
||||
sum(case when rcount.type='software' then rcount.count else 0 end) as software,
|
||||
sum(case when rcount.type='other' then rcount.count else 0 end) as other
|
||||
from rcount
|
||||
group by rcount.pid;
|
||||
group by rcount.pid; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
|
||||
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
|
||||
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
|
||||
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
|
||||
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
|
||||
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
|
||||
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates;
|
||||
create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
|
||||
create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
|
||||
create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
|
||||
create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
|
||||
create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
|
||||
create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
|
||||
create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_instance stored as parquet as
|
||||
select distinct r.*
|
||||
select /*+ COALESCE(100) */ distinct r.*
|
||||
from (
|
||||
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
|
||||
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
|
||||
join ${stats_db_name}.result res on res.id=r.id;
|
||||
join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
|
||||
select distinct r.id, r.amount, r.currency
|
||||
select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
|
||||
from (
|
||||
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
|
||||
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
|
||||
join ${stats_db_name}.result res on res.id=r.id
|
||||
where r.amount is not null;
|
||||
where r.amount is not null; /*EOS*/
|
||||
|
||||
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
|
||||
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/
|
|
@ -1,7 +1,7 @@
|
|||
-- Sprint 1 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
|
||||
select distinct p.id, coalesce(green_oa, 0) as green_oa
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as green_oa
|
||||
|
@ -12,7 +12,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
|
||||
select distinct p.id, coalesce(grey_lit, 0) as grey_lit
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as grey_lit
|
||||
|
@ -23,7 +23,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
|
||||
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
|
||||
|
@ -33,7 +33,7 @@ left outer join (
|
|||
-- Sprint 2 ----
|
||||
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
|
||||
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
|
||||
select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select r.id, license.type as lic from ${stats_db_name}.result r
|
||||
|
@ -42,7 +42,7 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
|
||||
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
|
||||
select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select r.id, lower(parse_url(license.type, "HOST")) as lic_host
|
||||
|
@ -52,12 +52,12 @@ left outer join (
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
|
||||
select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
|
||||
select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
|
||||
from ${stats_db_name}.publication; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
|
||||
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
|
||||
|
@ -66,7 +66,7 @@ left outer join (
|
|||
---- Sprint 3 ----
|
||||
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
|
||||
select distinct r.result as id, coalesce(fundref, 0) as fundref
|
||||
select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
|
||||
from ${stats_db_name}.project_results r
|
||||
left outer join (
|
||||
select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
|
||||
|
@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
|
|||
SELECT ro.organization organization, ro.id, o.name
|
||||
from ${stats_db_name}.result_organization ro
|
||||
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
|
||||
select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
|
||||
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
|
||||
|
@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
|
|||
from ${stats_db_name}.result_organization ro
|
||||
join ${stats_db_name}.organization o on o.id=ro.organization
|
||||
where country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
|
||||
from tmp as o1 join tmp as o2 on o1.id=o2.id
|
||||
where o1.id=o2.id and o1.country!=o2.country
|
||||
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
|
||||
|
@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
|
|||
select o.id organization, o.name, ro.project as project
|
||||
from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
|
||||
select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.name<>o2.name
|
||||
|
@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
|
|||
select o.id organization, o.name, o.country , ro.project as project
|
||||
from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.project=o2.project
|
||||
where o1.organization<>o2.organization and o1.country<>o2.country
|
||||
|
@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
|
|||
join ${stats_db_name}.organization o on o.id=op.id
|
||||
join ${stats_db_name}.project p on p.id=op.project
|
||||
where country <> 'UNKNOWN')
|
||||
select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
|
||||
select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
|
||||
from tmp as f1
|
||||
join tmp as f2 on f1.project=f2.project
|
||||
where f1.country<>f2.country
|
||||
|
@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
|
|||
select distinct country, ro.id as result from ${stats_db_name}.organization o
|
||||
join ${stats_db_name}.result_organization ro on o.id=ro.organization
|
||||
where country <> 'UNKNOWN' and o.name is not null)
|
||||
select o1.country country1, o2.country country2, count(o1.result) as collaborations
|
||||
select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.country<>o2.country
|
||||
|
@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
|
|||
---- Sprint 4 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
|
||||
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
|
||||
select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
|
||||
from ${stats_db_name}.publication_datasources pd
|
||||
left outer join (
|
||||
select pd.id, 1 as in_diamond_journal
|
||||
|
@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
|
||||
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
|
||||
from ${stats_db_name}.publication pd
|
||||
left outer join (
|
||||
select pd.id, 1 as is_transformative
|
||||
|
@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
|
||||
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
|
||||
select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
|
||||
from ${stats_db_name}.result_instance ri
|
||||
left outer join (
|
||||
select ri.id, 1 as pub_closed_other_open
|
||||
|
@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
|
|||
---- Sprint 5 ----
|
||||
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
|
||||
select id, count(id) as number_of_copies
|
||||
select /*+ COALESCE(100) */ id, count(id) as number_of_copies
|
||||
from ${stats_db_name}.result_instance
|
||||
group by id; /*EOS*/
|
||||
|
||||
---- Sprint 6 ----
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
|
||||
SELECT result_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
|
||||
SELECT result_id, repository_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
|
||||
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats us
|
||||
join ${stats_db_name}.publication on result_id=id where downloads>0
|
||||
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
|
||||
SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
|
||||
SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
|
||||
from openaire_prod_usage_stats.usage_stats us
|
||||
join ${stats_db_name}.publication on result_id=id
|
||||
where downloads>0
|
||||
|
@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
|
|||
UNION ALL
|
||||
select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
|
||||
)
|
||||
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
|
||||
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
|
||||
FROM ${stats_db_name}.publication pd
|
||||
left outer join (
|
||||
select pd.id, 1 as is_gold
|
||||
|
@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
|||
FROM ${stats_db_name}.datasource
|
||||
WHERE issn_online IS NOT NULL ) as issn
|
||||
WHERE LENGTH(issn) > 7)
|
||||
SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
|
||||
FROM ${stats_db_name}.publication_datasources pd
|
||||
LEFT OUTER JOIN (
|
||||
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
|
||||
|
@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
|
||||
select distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as is_hybrid
|
||||
|
@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
|
|||
where cast(year as int)>2003
|
||||
group by ro.organization)
|
||||
--return results_fair/all_results
|
||||
select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
|
||||
|
||||
|
@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
|
||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
from allresults ar
|
||||
join result_fair rf on rf.organization=ar.organization; /*EOS*/
|
||||
|
||||
|
@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
|
||||
|
||||
|
@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
from allresults ar join result_fair rf
|
||||
on rf.organization=ar.organization; /*EOS*/
|
||||
|
||||
|
@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
|
||||
|
||||
|
@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
|
||||
select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
from allresults
|
||||
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
|
||||
|
||||
|
@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
|
|||
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
|
||||
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
|
||||
from allresults
|
||||
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
|
||||
|
||||
|
@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
|
|||
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
|
||||
select allpubsshare.organization,
|
||||
select /*+ COALESCE(100) */ allpubsshare.organization,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
org_openess FROM allpubsshare
|
||||
|
@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
|
|||
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
|
||||
select cast(allpubsshare.year as int) year, allpubsshare.organization,
|
||||
select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
org_openess FROM allpubsshare
|
||||
|
@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
|
||||
select distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
|
||||
from ${stats_db_name}.publication_classifications p
|
||||
left outer join (
|
||||
select p.id, 1 as has_preprint
|
||||
|
@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
|
|||
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
|
||||
select distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join(
|
||||
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
|
||||
|
@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
|
|||
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
||||
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||
from ${stats_db_name}.result p
|
||||
left outer join (
|
||||
select p.id, 1 as result_with_pid
|
||||
|
@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
||||
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||
select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||
as is_interdisciplinary
|
||||
from pub_fos_totals p
|
||||
left outer join (
|
||||
|
@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
|
||||
select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
|
||||
select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select p.id, 1 as is_bronze_oa
|
||||
|
@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
|
||||
select pry.project_id, pry.acronym, pry.result_id,
|
||||
select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
|
||||
coalesce(is_project_result_after, 0) as is_project_result_after
|
||||
from project_year_result_year pry
|
||||
left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
|
||||
|
@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
|
|||
drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
|
||||
select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
|
||||
select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
|
||||
from ${stats_db_name}.funder f
|
||||
left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
|
||||
join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
|
||||
|
@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
|
|||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
where cast(year as int)>2003
|
||||
group by p.funder)
|
||||
select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
|
||||
select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.funder=allresults.funder; /*EOS*/
|
||||
|
||||
|
@ -745,7 +745,7 @@ allresults as
|
|||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
where cast(year as int)>2003
|
||||
group by rc.ri_initiative)
|
||||
select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
|
||||
select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
|
||||
from allresults
|
||||
join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/
|
||||
|
||||
|
@ -817,16 +817,14 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
|
|||
drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
|
||||
select allpubsshare.funder,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
funder_openess FROM allpubsshare
|
||||
left outer join (select funder,d from
|
||||
alldatasetssshare) tmp1
|
||||
on tmp1.funder=allpubsshare.funder
|
||||
left outer join (select funder,s from
|
||||
allsoftwaresshare) tmp2
|
||||
on tmp2.funder=allpubsshare.funder; /*EOS*/
|
||||
select /*+ COALESCE(100) */ allpubsshare.funder,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end)) funder_openess
|
||||
FROM allpubsshare
|
||||
left outer join (select funder,d from alldatasetssshare) tmp1
|
||||
on tmp1.funder=allpubsshare.funder
|
||||
left outer join (select funder,s from allsoftwaresshare) tmp2
|
||||
on tmp2.funder=allpubsshare.funder; /*EOS*/
|
||||
|
||||
DROP VIEW pubs_oa; /*EOS*/
|
||||
DROP VIEW datasets_oa; /*EOS*/
|
||||
|
@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
|
|||
drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
|
||||
select allpubsshare.ri_initiative,
|
||||
select /*+ COALESCE(100) */ allpubsshare.ri_initiative,
|
||||
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
|
||||
+(case when d is null then 0 else 1 end))
|
||||
ris_openess FROM allpubsshare
|
||||
|
@ -943,7 +941,7 @@ with result_findable as
|
|||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
where cast(year as int)>2003
|
||||
group by p.funder)
|
||||
select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
|
||||
select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
|
||||
from allresults
|
||||
join result_findable on result_findable.funder=allresults.funder; /*EOS*/
|
||||
|
||||
|
@ -952,41 +950,43 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/
|
|||
|
||||
create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as
|
||||
with result_contexts as
|
||||
(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
|
||||
join ${stats_db_name}.concept on concept.id=rc.concept
|
||||
join ${stats_db_name}.category on category.id=concept.category
|
||||
join ${stats_db_name}.context on context.id=category.context),
|
||||
result_findable as
|
||||
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
|
||||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
join ${stats_db_name}.result_pids rp on rp.id=r.id
|
||||
where cast(r.year as int)>2003
|
||||
group by rc.ri_initiative),
|
||||
allresults as
|
||||
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
|
||||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
where cast(r.year as int)>2003
|
||||
group by rc.ri_initiative)
|
||||
select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
|
||||
(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
|
||||
join ${stats_db_name}.concept on concept.id=rc.concept
|
||||
join ${stats_db_name}.category on category.id=concept.category
|
||||
join ${stats_db_name}.context on context.id=category.context),
|
||||
result_findable as
|
||||
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
|
||||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
join ${stats_db_name}.result_pids rp on rp.id=r.id
|
||||
where cast(r.year as int)>2003
|
||||
group by rc.ri_initiative),
|
||||
allresults as
|
||||
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
|
||||
join ${stats_db_name}.result r on r.id=rc.id
|
||||
where cast(r.year as int)>2003
|
||||
group by rc.ri_initiative)
|
||||
select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
|
||||
from allresults
|
||||
join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
|
||||
with org_names_pids as
|
||||
(select org.id,name, pid from ${stats_db_name}.organization org
|
||||
join ${stats_db_name}.organization_pids op on org.id=op.id),
|
||||
publicly_funded_orgs as
|
||||
(select distinct name from
|
||||
(select pf.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
|
||||
union all
|
||||
select pf.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join ${stats_db_name}.project p on p.funder=pf.name
|
||||
union all
|
||||
select op.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
|
||||
and pf.publicly_funded='yes') foo)
|
||||
select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
|
||||
(select org.id,name, pid from ${stats_db_name}.organization org
|
||||
join ${stats_db_name}.organization_pids op on org.id=op.id),
|
||||
publicly_funded_orgs as
|
||||
(select distinct name from
|
||||
(select pf.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
|
||||
union all
|
||||
select pf.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join ${stats_db_name}.project p on p.funder=pf.name
|
||||
union all
|
||||
select op.name from stats_ext.insitutions_for_publicly_funded pf
|
||||
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
|
||||
and pf.publicly_funded='yes') foo)
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
|
||||
|
@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
|
||||
create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
|
||||
select distinct p.id, coalesce(green_with_license, 0) as green_with_license
|
||||
select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license
|
||||
from ${stats_db_name}.publication p
|
||||
left outer join (
|
||||
select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
|
||||
|
@ -1006,7 +1006,7 @@ left outer join (
|
|||
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.result_country stored as parquet as
|
||||
select distinct id, country
|
||||
select /*+ COALESCE(100) */ distinct id, country
|
||||
from (
|
||||
select ro.id, o.country
|
||||
from ${stats_db_name}.result_organization ro
|
||||
|
@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
|
||||
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
|
||||
select distinct r.id, coalesce(oa_with_license,0) as oa_with_license
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
|
||||
|
@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open
|
|||
drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/
|
||||
create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as
|
||||
with without_license as
|
||||
(select distinct id from ${stats_db_name}.indi_result_oa_with_license
|
||||
where oa_with_license=0)
|
||||
select distinct r.id, coalesce(oa_without_license,0) as oa_without_license
|
||||
(select distinct id from ${stats_db_name}.indi_result_oa_with_license
|
||||
where oa_with_license=0)
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (select distinct r.id, 1 as oa_without_license
|
||||
from ${stats_db_name}.result r
|
||||
|
@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
|
|||
create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
|
||||
with transformative_dois as (
|
||||
select distinct doi from stats_ext.transformative_facts)
|
||||
select distinct r.id, coalesce(under_transformative,0) as under_transformative
|
||||
select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select distinct rp.id, 1 as under_transformative
|
||||
|
|
|
@ -1,30 +1,30 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------
|
||||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
||||
-- Peer reviewed:
|
||||
drop table if exists ${stats_db_name}.result_peerreviewed purge;
|
||||
drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
|
||||
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
|
||||
|
||||
-- Green OA:
|
||||
drop table if exists ${stats_db_name}.result_greenoa purge;
|
||||
drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
|
||||
select r.id, case when green.green_oa=1 then true else false end as green
|
||||
select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
|
||||
|
||||
-- GOLD OA:
|
||||
drop table if exists ${stats_db_name}.result_gold purge;
|
||||
drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
|
||||
|
||||
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
|
||||
select r.id, case when gold.is_gold=1 then true else false end as gold
|
||||
select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/
|
|
@ -1,58 +1,26 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
||||
-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
|
||||
-- peer reviewed)
|
||||
drop table if exists ${stats_db_name}.result_tmp;
|
||||
|
||||
CREATE TABLE ${stats_db_name}.result_tmp (
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
`date` STRING,
|
||||
`year` INT,
|
||||
bestlicence STRING,
|
||||
access_mode STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING ,
|
||||
peer_reviewed BOOLEAN,
|
||||
green BOOLEAN,
|
||||
gold BOOLEAN)
|
||||
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
|
||||
drop view if exists ${stats_db_name}.result; /*EOS*/
|
||||
drop table if exists ${stats_db_name}.result; /*EOS*/
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.publication r
|
||||
CREATE TABLE ${stats_db_name}.result stored as parquet as
|
||||
SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM (
|
||||
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.publication)
|
||||
UNION ALL
|
||||
(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.dataset)
|
||||
UNION ALL
|
||||
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.software)
|
||||
UNION ALL
|
||||
(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
|
||||
FROM ${stats_db_name}.otherresearchproduct)
|
||||
) r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.dataset r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.software r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
insert into ${stats_db_name}.result_tmp
|
||||
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
|
||||
FROM ${stats_db_name}.otherresearchproduct r
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
|
||||
|
||||
drop table if exists ${stats_db_name}.result;
|
||||
drop view if exists ${stats_db_name}.result;
|
||||
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
|
||||
drop table ${stats_db_name}.result_tmp;
|
||||
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
|
@ -7,65 +7,65 @@ set mapred.job.queue.name=analytics;
|
|||
--------------------------------------------------------------
|
||||
|
||||
-- Publication temporary table
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp purge;
|
||||
CREATE TABLE ${stats_db_name}.publication_tmp
|
||||
(
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
date STRING,
|
||||
year STRING,
|
||||
bestlicence STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING
|
||||
)
|
||||
clustered by (id) into 100 buckets stored as orc tblproperties ('transactional' = 'true');
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication purge; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.publication_tmp
|
||||
SELECT substr(p.id, 4) as id,
|
||||
p.title[0].value as title,
|
||||
p.publisher.value as publisher,
|
||||
p.journal.name as journal,
|
||||
p.dateofacceptance.value as date,
|
||||
date_format(p.dateofacceptance.value, 'yyyy') as year,
|
||||
p.bestaccessright.classname as bestlicence,
|
||||
p.embargoenddate.value as embargo_end_date,
|
||||
false as delayed,
|
||||
size(p.author) as authors,
|
||||
concat_ws('\u003B', p.source.value) as source,
|
||||
case when size(p.description) > 0 then true else false end as abstract,
|
||||
'publication' as type
|
||||
from ${openaire_db_name}.publication p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
CREATE TABLE ${stats_db_name}.publication stored as parquet as
|
||||
with pub_pr as (
|
||||
select pub.id as pub_id, case when (to_date(pub.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||
from ${openaire_db_name}.publication pub
|
||||
join ${openaire_db_name}.relation rel
|
||||
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=pub.id
|
||||
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false
|
||||
),
|
||||
pub_delayed as (
|
||||
select pub_id, max(delayed) as delayed
|
||||
from pub_pr
|
||||
group by pub_id
|
||||
)
|
||||
select /*+ COALESCE(100) */
|
||||
substr(pub.id, 4) as id,
|
||||
pub.title[0].value as title,
|
||||
pub.publisher.value as publisher,
|
||||
pub.journal.name as journal,
|
||||
pub.dateofacceptance.value as date,
|
||||
date_format(pub.dateofacceptance.value, 'yyyy') as year,
|
||||
pub.bestaccessright.classname as bestlicence,
|
||||
pub.embargoenddate.value as embargo_end_date,
|
||||
coalesce(pub_delayed.delayed, false) as delayed, -- It's delayed, when the publication was published after the end of at least one of its projects.
|
||||
size(pub.author) as authors,
|
||||
concat_ws('\u003B', pub.source.value) as source,
|
||||
case when size(pub.description) > 0 then true else false end as abstract,
|
||||
'publication' as type
|
||||
from ${openaire_db_name}.publication pub
|
||||
left outer join pub_delayed on pub.id=pub_delayed.pub_id
|
||||
where pub.datainfo.deletedbyinference = false and pub.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_classifications purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_classifications STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, instancetype.classname as type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, instancetype.classname as type
|
||||
from ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_concepts purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_concepts STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, case
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
|
||||
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
|
||||
from ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.context) contexts as context
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_datasources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_datasources STORED AS PARQUET as
|
||||
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
|
||||
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
|
||||
|
@ -73,44 +73,44 @@ FROM (
|
|||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_languages STORED AS PARQUET AS
|
||||
select substr(p.id, 4) as id, p.language.classname as language
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) as id, p.language.classname as language
|
||||
FROM ${openaire_db_name}.publication p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_pids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.pid) pids AS ppid
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_topics purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_topics STORED AS PARQUET as
|
||||
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
|
||||
FROM ${openaire_db_name}.publication p
|
||||
LATERAL VIEW explode(p.subject) subjects AS subject
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.publication_citations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.publication_citations STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
FROM ${openaire_db_name}.publication p
|
||||
lateral view explode(p.extrainfo) citations AS citation
|
||||
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
|
||||
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
create view if not exists TARGET.concept as select * from SOURCE.concept;
|
||||
create view if not exists TARGET.context as select * from SOURCE.context;
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
@ -81,11 +83,17 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
|
||||
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
|
||||
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
|
||||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
) )) foo;
|
||||
|
||||
create view if not exists TARGET.category as select * from SOURCE.category;
|
||||
|
@ -256,7 +264,6 @@ create table TARGET.indi_pub_interdisciplinarity stored as parquet as select * f
|
|||
|
||||
create table TARGET.result_apc_affiliations stored as parquet as select * from SOURCE.result_apc_affiliations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_instance stored as parquet as select * from SOURCE.result_instance orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.result_orcid stored as parquet as select * from SOURCE.result_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
create table TARGET.indi_pub_publicly_funded stored as parquet as select * from SOURCE.indi_pub_publicly_funded orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
|
||||
create table TARGET.indi_is_project_result_after stored as parquet as select * from SOURCE.indi_is_project_result_after orig where exists (select 1 from TARGET.result r where r.id=orig.result_id);
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
drop database if exists TARGET cascade;
|
||||
create database if not exists TARGET;
|
||||
|
||||
|
@ -65,5 +67,11 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
|
||||
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
|
||||
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d' -- Centre for Research on Ecology and Forestry Applications
|
||||
))) foo;
|
||||
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
|
||||
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
|
||||
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
|
||||
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
|
||||
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
|
||||
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
|
||||
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
|
||||
))) foo;
|
|
@ -1,15 +1,17 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_cc_licence stored as parquet as
|
||||
select r.id, coalesce(rln.count, 0) > 0 as cc_licence
|
||||
select /*+ COALESCE(100) */ r.id, coalesce(rln.count, 0) > 0 as cc_licence
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select rl.id, sum(case when rl.type like 'CC%' then 1 else 0 end) as count
|
||||
from ${stats_db_name}.result_licenses rl
|
||||
group by rl.id
|
||||
) rln on rln.id=r.id;
|
||||
) rln on rln.id=r.id; /*EOS*/
|
||||
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -35,11 +37,11 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
|
||||
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_year stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -65,11 +67,11 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
|
||||
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_year_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -95,11 +97,11 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
|
||||
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_datasource stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -127,10 +129,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_datasource_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -158,10 +160,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_organization stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -187,10 +189,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_organization_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -216,10 +218,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_funder stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -247,10 +249,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_affiliated_funder_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -278,10 +280,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -309,10 +311,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_year stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -340,11 +342,11 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year; /*EOS*/
|
||||
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_year_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -372,10 +374,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, r.year, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -403,10 +405,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -434,10 +436,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_organization stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -465,10 +467,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -496,10 +498,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_funder stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -529,10 +531,10 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder; /*EOS*/
|
||||
|
||||
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
|
||||
select
|
||||
select /*+ COALESCE(100) */
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
|
@ -562,4 +564,4 @@ from ${stats_db_name}.result r
|
|||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Dataset table/view and Dataset related tables/views
|
||||
|
@ -5,75 +7,74 @@
|
|||
------------------------------------------------------
|
||||
|
||||
-- Dataset temporary table supporting updates
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_tmp
|
||||
(
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
date STRING,
|
||||
year STRING,
|
||||
bestlicence STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING
|
||||
CREATE TABLE ${stats_db_name}.dataset stored as parquet as
|
||||
with datast_pr as (
|
||||
select datast.id as datast_id, case when (to_date(datast.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||
from ${openaire_db_name}.dataset datast
|
||||
join ${openaire_db_name}.relation rel
|
||||
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=datast.id
|
||||
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||
where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false
|
||||
),
|
||||
datast_delayed as (
|
||||
select datast_id, max(delayed) as delayed
|
||||
from datast_pr
|
||||
group by datast_id
|
||||
)
|
||||
clustered by (id) into 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
select /*+ COALESCE(100) */
|
||||
substr(datast.id, 4) as id,
|
||||
datast.title[0].value as title,
|
||||
datast.publisher.value as publisher,
|
||||
cast(null as string) as journal,
|
||||
datast.dateofacceptance.value as date,
|
||||
date_format(datast.dateofacceptance.value, 'yyyy') as year,
|
||||
datast.bestaccessright.classname as bestlicence,
|
||||
datast.embargoenddate.value as embargo_end_date,
|
||||
coalesce(datast_delayed.delayed, false) as delayed, -- It's delayed, when the dataset was published after the end of the project.
|
||||
size(datast.author) as authors,
|
||||
concat_ws('\u003B', datast.source.value) as source,
|
||||
case when size(datast.description) > 0 then true else false end as abstract,
|
||||
'dataset' as type
|
||||
from ${openaire_db_name}.dataset datast
|
||||
left outer join datast_delayed on datast.id=datast_delayed.datast_id
|
||||
where datast.datainfo.deletedbyinference = false and datast.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.dataset_tmp
|
||||
SELECT substr(d.id, 4) AS id,
|
||||
d.title[0].value AS title,
|
||||
d.publisher.value AS publisher,
|
||||
cast(null AS string) AS journal,
|
||||
d.dateofacceptance.value as date,
|
||||
date_format(d.dateofacceptance.value, 'yyyy') AS year,
|
||||
d.bestaccessright.classname AS bestlicence,
|
||||
d.embargoenddate.value AS embargo_end_date,
|
||||
false AS delayed,
|
||||
size(d.author) AS authors,
|
||||
concat_ws('\u003B', d.source.value) AS source,
|
||||
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
|
||||
'dataset' AS type
|
||||
FROM ${openaire_db_name}.dataset d
|
||||
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_citations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_citations STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
FROM ${openaire_db_name}.dataset d
|
||||
LATERAL VIEW explode(d.extrainfo) citations AS citation
|
||||
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
|
||||
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
|
||||
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_classifications purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_classifications STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_concepts purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_concepts STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, case
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
|
||||
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
|
||||
from ${openaire_db_name}.dataset p
|
||||
LATERAL VIEW explode(p.context) contexts as context
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_datasources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_datasources STORED AS PARQUET AS
|
||||
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
|
@ -82,35 +83,35 @@ FROM (
|
|||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
FROM ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_languages STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, p.language.classname AS language
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_pids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
LATERAL VIEW explode(p.pid) pids AS ppid
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.dataset_topics purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dataset_topics STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
FROM ${openaire_db_name}.dataset p
|
||||
LATERAL VIEW explode(p.subject) subjects AS subject
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------
|
||||
--------------------------------------------------------
|
||||
-- Software table/view and Software related tables/views
|
||||
|
@ -5,72 +7,74 @@
|
|||
--------------------------------------------------------
|
||||
|
||||
-- Software temporary table supporting updates
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp purge;
|
||||
CREATE TABLE ${stats_db_name}.software_tmp
|
||||
(
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
date STRING,
|
||||
year STRING,
|
||||
bestlicence STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software stored as parquet as
|
||||
with soft_pr as (
|
||||
select soft.id as soft_id, case when (to_date(soft.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
|
||||
from ${openaire_db_name}.software soft
|
||||
join ${openaire_db_name}.relation rel
|
||||
on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=soft.id
|
||||
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||
join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
|
||||
where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false
|
||||
),
|
||||
soft_delayed as (
|
||||
select soft_id, max(delayed) as delayed
|
||||
from soft_pr
|
||||
group by soft_id
|
||||
)
|
||||
clustered by (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
select /*+ COALESCE(100) */
|
||||
substr(soft.id, 4) as id,
|
||||
soft.title[0].value as title,
|
||||
soft.publisher.value as publisher,
|
||||
cast(null as string) as journal,
|
||||
soft.dateofacceptance.value as date,
|
||||
date_format(soft.dateofacceptance.value, 'yyyy') as year,
|
||||
soft.bestaccessright.classname as bestlicence,
|
||||
soft.embargoenddate.value as embargo_end_date,
|
||||
coalesce(soft_delayed.delayed, false) as delayed, -- It's delayed, when the software was published after the end of the project.
|
||||
size(soft.author) as authors,
|
||||
concat_ws('\u003B', soft.source.value) as source,
|
||||
case when size(soft.description) > 0 then true else false end as abstract,
|
||||
'software' as type
|
||||
from ${openaire_db_name}.software soft
|
||||
left outer join soft_delayed on soft.id=soft_delayed.soft_id
|
||||
where soft.datainfo.deletedbyinference = false and soft.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.software_tmp
|
||||
SELECT substr(s.id, 4) as id,
|
||||
s.title[0].value AS title,
|
||||
s.publisher.value AS publisher,
|
||||
CAST(NULL AS string) AS journal,
|
||||
s.dateofacceptance.value AS DATE,
|
||||
date_format(s.dateofacceptance.value, 'yyyy') AS YEAR,
|
||||
s.bestaccessright.classname AS bestlicence,
|
||||
s.embargoenddate.value AS embargo_end_date,
|
||||
FALSE AS delayed,
|
||||
SIZE(s.author) AS authors,
|
||||
concat_ws('\u003B', s.source.value) AS source,
|
||||
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
|
||||
'software' as type
|
||||
from ${openaire_db_name}.software s
|
||||
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_citations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_citations STORED AS PARQUET AS
|
||||
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
SELECT /*+ COALESCE(100) */ substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
FROM ${openaire_db_name}.software s
|
||||
LATERAL VIEW explode(s.extrainfo) citations as citation
|
||||
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
|
||||
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
|
||||
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_classifications purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_classifications STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
FROM ${openaire_db_name}.software p
|
||||
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_concepts purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_concepts STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, case
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
|
||||
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
|
||||
FROM ${openaire_db_name}.software p
|
||||
LATERAL VIEW explode(p.context) contexts AS context
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_datasources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_datasources STORED AS PARQUET AS
|
||||
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
|
||||
FROM (
|
||||
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
|
||||
FROM ${openaire_db_name}.software p
|
||||
|
@ -79,35 +83,35 @@ FROM (
|
|||
LEFT OUTER JOIN (
|
||||
SELECT substr(d.id, 4) id
|
||||
FROM ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_languages STORED AS PARQUET AS
|
||||
select substr(p.id, 4) AS id, p.language.classname AS language
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
|
||||
FROM ${openaire_db_name}.software p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.software p
|
||||
LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_pids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
FROM ${openaire_db_name}.software p
|
||||
LATERAL VIEW explode(p.pid) pids AS ppid
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.software_topics purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.software_topics STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
FROM ${openaire_db_name}.software p
|
||||
LATERAL VIEW explode(p.subject) subjects AS subject
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
--------------------------------------------------------------------------------
|
||||
-- Otherresearchproduct table/view and Otherresearchproduct related tables/views
|
||||
|
@ -5,101 +7,103 @@
|
|||
--------------------------------------------------------------------------------
|
||||
|
||||
-- Otherresearchproduct temporary table supporting updates
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp
|
||||
(
|
||||
id STRING,
|
||||
title STRING,
|
||||
publisher STRING,
|
||||
journal STRING,
|
||||
date STRING,
|
||||
year STRING,
|
||||
bestlicence STRING,
|
||||
embargo_end_date STRING,
|
||||
delayed BOOLEAN,
|
||||
authors INT,
|
||||
source STRING,
|
||||
abstract BOOLEAN,
|
||||
type STRING
|
||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
-- Builds ${stats_db_name}.otherresearchproduct from ${openaire_db_name}.otherresearchproduct,
-- keeping only rows that are neither deleted by inference nor invisible.
--   other_pr      : one row per (record, project) linked through a 'resultProject' / 'isProducedBy'
--                   relation; delayed is true when the record's dateofacceptance is after the project's enddate.
--   other_delayed : collapses other_pr to one row per record; max(delayed) is true if any linked project gives true.
-- Records without a qualifying project have no row in other_delayed, so the left outer join yields NULL
-- and coalesce(..., false) reports them as not delayed.
CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as
with other_pr as (
    select other.id as other_id, case when (to_date(other.dateofacceptance.value) > to_date( pj.enddate.value)) then true else false end as delayed
    from ${openaire_db_name}.otherresearchproduct other
    join ${openaire_db_name}.relation rel
        on reltype = 'resultProject' and relclass = 'isProducedBy' and rel.source=other.id
        and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
    join ${openaire_db_name}.project pj on pj.id=rel.target and pj.datainfo.deletedbyinference = false and pj.datainfo.invisible = false
    where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false
),
other_delayed as (
    select other_id, max(delayed) as delayed
    from other_pr
    group by other_id
)
select /*+ COALESCE(100) */
    substr(other.id, 4) as id,
    other.title[0].value as title,
    other.publisher.value as publisher,
    cast(null as string) as journal,
    other.dateofacceptance.value as date,
    date_format(other.dateofacceptance.value, 'yyyy') as year,
    other.bestaccessright.classname as bestlicence,
    other.embargoenddate.value as embargo_end_date,
    -- Previously hard-coded to false, which ignored the other_delayed join below;
    -- now consistent with the dataset and software statements.
    coalesce(other_delayed.delayed, false) as delayed, -- It's delayed, when the otherresearchproduct was published after the end of the project.
    size(other.author) as authors,
    concat_ws('\u003B', other.source.value) as source,
    case when size(other.description) > 0 then true else false end as abstract,
    'other' as type
from ${openaire_db_name}.otherresearchproduct other
left outer join other_delayed on other.id=other_delayed.other_id
where other.datainfo.deletedbyinference = false and other.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.otherresearchproduct_tmp
|
||||
SELECT substr(o.id, 4) AS id,
|
||||
o.title[0].value AS title,
|
||||
o.publisher.value AS publisher,
|
||||
CAST(NULL AS string) AS journal,
|
||||
o.dateofacceptance.value AS DATE,
|
||||
date_format(o.dateofacceptance.value, 'yyyy') AS year,
|
||||
o.bestaccessright.classname AS bestlicence,
|
||||
o.embargoenddate.value as embargo_end_date,
|
||||
FALSE AS delayed,
|
||||
SIZE(o.author) AS authors,
|
||||
concat_ws('\u003B', o.source.value) AS source,
|
||||
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
|
||||
'other' AS type
|
||||
FROM ${openaire_db_name}.otherresearchproduct o
|
||||
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
|
||||
|
||||
-- Otherresearchproduct_citations
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_citations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations STORED AS PARQUET AS
|
||||
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
SELECT /*+ COALESCE(100) */ substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
|
||||
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
|
||||
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
|
||||
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false;
|
||||
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_classifications purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, instancetype.classname AS type
|
||||
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_concepts purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) as id, case
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, case
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
|
||||
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
|
||||
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
|
||||
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_datasources purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources STORED AS PARQUET AS
|
||||
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
|
||||
SELECT /*+ COALESCE(100) */ p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
|
||||
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
|
||||
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
|
||||
LEFT OUTER JOIN(SELECT substr(d.id, 4) id
|
||||
LEFT OUTER JOIN (SELECT substr(d.id, 4) id
|
||||
from ${openaire_db_name}.datasource d
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
|
||||
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, p.language.classname AS language
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, p.language.classname AS language
|
||||
FROM ${openaire_db_name}.otherresearchproduct p
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_pids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
|
||||
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_topics purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
|
||||
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; /*EOS*/
|
|
@ -1,110 +1,120 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Project table/view and Project related tables/views
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project_oids STORED AS PARQUET AS
|
||||
SELECT substr(p.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(p.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
|
||||
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
|
||||
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
|
||||
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
|
||||
SELECT /*+ COALESCE(100) */ substr(r.source, 4) AS id, substr(r.target, 4) AS organization
|
||||
from ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_results purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
|
||||
SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'resultProject' and r.target like '40|%'
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_classification purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.project_classification STORED AS PARQUET as
|
||||
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
|
||||
select /*+ COALESCE(100) */ substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
|
||||
from ${openaire_db_name}.project p
|
||||
lateral view explode(p.h2020classification) classifs as class
|
||||
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null;
|
||||
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_tmp purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project_tmp
|
||||
(
|
||||
id STRING,
|
||||
acronym STRING,
|
||||
title STRING,
|
||||
funder STRING,
|
||||
funding_lvl0 STRING,
|
||||
funding_lvl1 STRING,
|
||||
funding_lvl2 STRING,
|
||||
ec39 STRING,
|
||||
type STRING,
|
||||
startdate STRING,
|
||||
enddate STRING,
|
||||
start_year INT,
|
||||
end_year INT,
|
||||
duration INT,
|
||||
haspubs STRING,
|
||||
numpubs INT,
|
||||
daysforlastpub INT,
|
||||
delayedpubs INT,
|
||||
callidentifier STRING,
|
||||
code STRING,
|
||||
totalcost FLOAT,
|
||||
fundedamount FLOAT,
|
||||
currency STRING
|
||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
CREATE TABLE ${stats_db_name}.project stored as parquet as
|
||||
with pr_pub as (
|
||||
select pr.id as pr_id, pub.id as pub_id,
|
||||
(case when datediff(pub.dt_dateofacceptance, pr.dt_enddate) > 0 then true else false end) as delayed,
|
||||
max(datediff(pub.dt_dateofacceptance, pr.dt_enddate)) as daysForlastPub
|
||||
from (select id, to_date(dateofacceptance.value) as dt_dateofacceptance from ${openaire_db_name}.publication
|
||||
where datainfo.deletedbyinference = false and datainfo.invisible = false) pub
|
||||
join ${openaire_db_name}.relation rel
|
||||
on rel.reltype = 'resultProject' and rel.relclass = 'isProducedBy' and rel.source=pub.id
|
||||
and rel.datainfo.deletedbyinference = false and rel.datainfo.invisible = false
|
||||
join (select id, to_date(enddate.value) as dt_enddate from ${openaire_db_name}.project
|
||||
where datainfo.deletedbyinference = false and datainfo.invisible = false) pr
|
||||
on pr.id=rel.target
|
||||
group by pr.id, pub.id, pub.dt_dateofacceptance, pr.dt_enddate
|
||||
),
|
||||
num_pubs_pr as (
|
||||
select pr_id, count( distinct pub_id) as num_pubs
|
||||
from pr_pub
|
||||
group by pr_id
|
||||
),
|
||||
pub_delayed as (
|
||||
select pr_id, pub_id, max(delayed) as delayed
|
||||
from pr_pub
|
||||
group by pr_id, pub_id
|
||||
),
|
||||
num_pub_delayed as (
|
||||
select pr_id, count(distinct pub_id) as num_delayed
|
||||
from pub_delayed
|
||||
where delayed
|
||||
group by pr_id
|
||||
)
|
||||
select /*+ COALESCE(100) */
|
||||
substr(p.id, 4) as id,
|
||||
p.acronym.value as acronym,
|
||||
p.title.value as title,
|
||||
xpath_string(p.fundingtree[0].value, '//funder/name') as funder,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') as funding_lvl0,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') as funding_lvl1,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') as funding_lvl2,
|
||||
p.ecsc39.value as ec39,
|
||||
p.contracttype.classname as type,
|
||||
p.startdate.value as startdate,
|
||||
p.enddate.value as enddate,
|
||||
year(p.startdate.value) as start_year,
|
||||
year(p.enddate.value) as end_year,
|
||||
cast(months_between(p.enddate.value, p.startdate.value) as int) as duration,
|
||||
case when pr_pub.pub_id is null then 'no' else 'yes' end as haspubs,
|
||||
num_pubs_pr.num_pubs as numpubs,
|
||||
pr_pub.daysForlastPub as daysForlastPub,
|
||||
npd.num_delayed as delayedpubs,
|
||||
p.callidentifier.value as callidentifier,
|
||||
p.code.value as code,
|
||||
p.totalcost as totalcost,
|
||||
p.fundedamount as fundedamount,
|
||||
p.currency.value as currency
|
||||
from ${openaire_db_name}.project p
|
||||
left outer join pr_pub on pr_pub.pr_id = p.id
|
||||
left outer join num_pubs_pr on num_pubs_pr.pr_id = p.id
|
||||
left outer join num_pub_delayed npd on npd.pr_id=p.id
|
||||
where p.datainfo.deletedbyinference = false and p.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.project_tmp
|
||||
SELECT substr(p.id, 4) AS id,
|
||||
p.acronym.value AS acronym,
|
||||
p.title.value AS title,
|
||||
xpath_string(p.fundingtree[0].value, '//funder/name') AS funder,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_0/name') AS funding_lvl0,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_1/name') AS funding_lvl1,
|
||||
xpath_string(p.fundingtree[0].value, '//funding_level_2/name') AS funding_lvl2,
|
||||
p.ecsc39.value AS ec39,
|
||||
p.contracttype.classname AS type,
|
||||
p.startdate.value AS startdate,
|
||||
p.enddate.value AS enddate,
|
||||
year(p.startdate.value) AS start_year,
|
||||
year(p.enddate.value) AS end_year,
|
||||
CAST(MONTHS_BETWEEN(p.enddate.value, p.startdate.value) AS INT) AS duration,
|
||||
'no' AS haspubs,
|
||||
0 AS numpubs,
|
||||
0 AS daysforlastpub,
|
||||
0 AS delayedpubs,
|
||||
p.callidentifier.value AS callidentifier,
|
||||
p.code.value AS code,
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
p.currency.value AS currency
|
||||
FROM ${openaire_db_name}.project p
|
||||
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.funder purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.funder purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.funder STORED AS PARQUET as
|
||||
select distinct xpath_string(fund, '//funder/id') as id,
|
||||
select /*+ COALESCE(100) */ distinct xpath_string(fund, '//funder/id') as id,
|
||||
xpath_string(fund, '//funder/name') as name,
|
||||
xpath_string(fund, '//funder/shortname') as shortname,
|
||||
xpath_string(fundingtree[0].value, '//funder/jurisdiction') as country
|
||||
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund;
|
||||
from ${openaire_db_name}.project p lateral view explode(p.fundingtree.value) fundingtree as fund; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organization_contribution purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.project_organization_contribution STORED AS PARQUET AS
|
||||
SELECT distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
|
||||
SELECT /*+ COALESCE(100) */ distinct substr(r.source, 4) AS project, substr(r.target, 4) AS organization,
|
||||
properties[0].value contribution, properties[1].value currency
|
||||
from ${openaire_db_name}.relation r
|
||||
LATERAL VIEW explode (r.properties) properties
|
||||
where properties[0].key='contribution' and r.reltype = 'projectOrganization' and r.source like '40|%'
|
||||
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
and properties[0].value>0.0 and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------
|
||||
----------------------------------------------------
|
||||
-- Result table/view and Result related tables/views
|
||||
|
@ -7,16 +9,16 @@
|
|||
-- Views on temporary tables that should be re-created at the end
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result as
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.publication_tmp
|
||||
FROM ${stats_db_name}.publication
|
||||
UNION ALL
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.software_tmp
|
||||
FROM ${stats_db_name}.software
|
||||
UNION ALL
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.dataset_tmp
|
||||
FROM ${stats_db_name}.dataset
|
||||
UNION ALL
|
||||
SELECT *, bestlicence AS access_mode
|
||||
FROM ${stats_db_name}.otherresearchproduct_tmp;
|
||||
FROM ${stats_db_name}.otherresearchproduct; /*EOS*/
|
||||
|
||||
-- Views on final tables
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_datasources AS
|
||||
|
@ -30,7 +32,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_datasources
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_datasources;
|
||||
FROM ${stats_db_name}.otherresearchproduct_datasources; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_citations AS
|
||||
SELECT *
|
||||
|
@ -43,7 +45,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_citations
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_citations;
|
||||
FROM ${stats_db_name}.otherresearchproduct_citations; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_classifications AS
|
||||
SELECT *
|
||||
|
@ -56,7 +58,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_classifications
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_classifications;
|
||||
FROM ${stats_db_name}.otherresearchproduct_classifications; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_concepts AS
|
||||
SELECT *
|
||||
|
@ -69,7 +71,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_concepts
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_concepts;
|
||||
FROM ${stats_db_name}.otherresearchproduct_concepts; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_languages AS
|
||||
SELECT *
|
||||
|
@ -82,7 +84,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_languages
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_languages;
|
||||
FROM ${stats_db_name}.otherresearchproduct_languages; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_oids AS
|
||||
SELECT *
|
||||
|
@ -95,7 +97,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_oids
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_oids;
|
||||
FROM ${stats_db_name}.otherresearchproduct_oids; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS
|
||||
SELECT *
|
||||
|
@ -108,7 +110,7 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_pids
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_pids;
|
||||
FROM ${stats_db_name}.otherresearchproduct_pids; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS
|
||||
SELECT *
|
||||
|
@ -121,37 +123,44 @@ SELECT *
|
|||
FROM ${stats_db_name}.dataset_topics
|
||||
UNION ALL
|
||||
SELECT *
|
||||
FROM ${stats_db_name}.otherresearchproduct_topics;
|
||||
FROM ${stats_db_name}.otherresearchproduct_topics; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.result_fos_base_tmp stored as parquet as
|
||||
select /*+ COALESCE(100) */ id, topic from ${stats_db_name}.result_topics where type='Fields of Science and Technology classification'; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_fos purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.result_fos stored as parquet as
|
||||
with
|
||||
lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'),
|
||||
lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'),
|
||||
lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'),
|
||||
lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification')
|
||||
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
|
||||
lvl1 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '__ %'),
|
||||
lvl2 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '____ %'),
|
||||
lvl3 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '______ %'),
|
||||
lvl4 as (select * from ${stats_db_name}.result_fos_base_tmp where topic like '________ %')
|
||||
select /*+ COALESCE(100) */ lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
|
||||
from lvl1
|
||||
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
||||
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
|
||||
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6);
|
||||
join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2)
|
||||
join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4)
|
||||
join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); /*EOS*/
|
||||
|
||||
DROP TABLE ${stats_db_name}.result_fos_base_tmp purge; /*EOS*/
|
||||
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'resultOrganization'
|
||||
and r.target like '50|%'
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
|
||||
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projects purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS
|
||||
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
|
||||
select /*+ COALESCE(100) */ pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance
|
||||
FROM ${stats_db_name}.result r
|
||||
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
|
||||
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;
|
||||
|
||||
JOIN ${stats_db_name}.project p ON p.id = pr.id; /*EOS*/
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
-- noinspection SqlNoDataSourceInspectionForFile
|
||||
|
||||
------------------------------------------------------------
|
||||
|
@ -5,108 +7,73 @@
|
|||
-- Datasource table/view and Datasource related tables/views
|
||||
------------------------------------------------------------
|
||||
------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_tmp purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge; /*EOS*/
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.harested_datasources purge; /*EOS*/
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.piwik_datasource purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_tmp
|
||||
(
|
||||
`id` string,
|
||||
`name` STRING,
|
||||
`type` STRING,
|
||||
`dateofvalidation` STRING,
|
||||
`yearofvalidation` string,
|
||||
`harvested` BOOLEAN,
|
||||
`piwik_id` INT,
|
||||
`latitude` STRING,
|
||||
`longitude` STRING,
|
||||
`websiteurl` STRING,
|
||||
`compatibility` STRING,
|
||||
issn_printed STRING,
|
||||
issn_online STRING
|
||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
create table ${stats_db_name}.harested_datasources stored as parquet as
|
||||
select distinct inst.hostedby.key as d_id
|
||||
from ${openaire_db_name}.result lateral view outer explode (instance) insts as inst; /*EOS*/
|
||||
|
||||
-- Insert statement that takes into account the piwik_id of the openAIRE graph
|
||||
INSERT INTO ${stats_db_name}.datasource_tmp
|
||||
SELECT substr(d1.id, 4) AS id,
|
||||
officialname.value AS name,
|
||||
datasourcetype.classname AS type,
|
||||
dateofvalidation.value AS dateofvalidation,
|
||||
date_format(d1.dateofvalidation.value, 'yyyy') AS yearofvalidation,
|
||||
FALSE AS harvested,
|
||||
CASE WHEN d2.piwik_id IS NULL THEN 0 ELSE d2.piwik_id END AS piwik_id,
|
||||
d1.latitude.value AS latitude,
|
||||
d1.longitude.value AS longitude,
|
||||
d1.websiteurl.value AS websiteurl,
|
||||
d1.openairecompatibility.classid AS compatibility,
|
||||
d1.journal.issnprinted AS issn_printed,
|
||||
d1.journal.issnonline AS issn_online
|
||||
FROM ${openaire_db_name}.datasource d1
|
||||
LEFT OUTER JOIN
|
||||
(SELECT id, split(originalidd, '\\:')[1] as piwik_id
|
||||
FROM ${openaire_db_name}.datasource
|
||||
LATERAL VIEW EXPLODE(originalid) temp AS originalidd
|
||||
WHERE originalidd like "piwik:%") AS d2
|
||||
ON d1.id = d2.id
|
||||
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
|
||||
create table ${stats_db_name}.piwik_datasource stored as parquet as
|
||||
select id, split(originalidd, '\\:')[1] as piwik_id
|
||||
from ${openaire_db_name}.datasource
|
||||
lateral view explode(originalid) temp as originalidd
|
||||
where originalidd like "piwik:%"; /*EOS*/
|
||||
|
||||
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
|
||||
-- Creating a temporary dual table that will be removed after the following insert
|
||||
CREATE TABLE ${stats_db_name}.datasource stored as parquet as
|
||||
select /*+ COALESCE(100) */
|
||||
substr(dtrce.id, 4) as id,
|
||||
case when dtrce.officialname.value='Unknown Repository' then 'Other' else dtrce.officialname.value end as name,
|
||||
dtrce.datasourcetype.classname as type,
|
||||
dtrce.dateofvalidation.value as dateofvalidation,
|
||||
case when dtrce.dateofvalidation.value='-1' then null else date_format(dtrce.dateofvalidation.value, 'yyyy') end as yearofvalidation,
|
||||
case when res.d_id is null then false else true end as harvested,
|
||||
case when piwik_d.piwik_id is null then 0 else piwik_d.piwik_id end as piwik_id,
|
||||
dtrce.latitude.value as latitude,
|
||||
dtrce.longitude.value as longitude,
|
||||
dtrce.websiteurl.value as websiteurl,
|
||||
dtrce.openairecompatibility.classid as compatibility,
|
||||
dtrce.journal.issnprinted as issn_printed,
|
||||
dtrce.journal.issnonline as issn_online
|
||||
from ${openaire_db_name}.datasource dtrce
|
||||
left outer join ${stats_db_name}.harested_datasources res on res.d_id=dtrce.id
|
||||
left outer join ${stats_db_name}.piwik_datasource piwik_d on piwik_d.id=dtrce.id
|
||||
where dtrce.datainfo.deletedbyinference = false and dtrce.datainfo.invisible = false; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.dual ( dummy CHAR(1));
|
||||
drop table ${stats_db_name}.harested_datasources; /*EOS*/
|
||||
drop table ${stats_db_name}.piwik_datasource; /*EOS*/
|
||||
|
||||
INSERT INTO ${stats_db_name}.dual VALUES ('X');
|
||||
|
||||
INSERT INTO ${stats_db_name}.datasource_tmp (`id`, `name`, `type`, `dateofvalidation`, `yearofvalidation`, `harvested`,
|
||||
`piwik_id`, `latitude`, `longitude`, `websiteurl`, `compatibility`, `issn_printed`, `issn_online`)
|
||||
SELECT 'other',
|
||||
'Other',
|
||||
'Repository',
|
||||
NULL,
|
||||
NULL,
|
||||
false,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
'unknown',
|
||||
null,
|
||||
null
|
||||
FROM ${stats_db_name}.dual
|
||||
WHERE 'other' not in (SELECT id FROM ${stats_db_name}.datasource_tmp WHERE name = 'Unknown Repository');
|
||||
DROP TABLE ${stats_db_name}.dual;
|
||||
|
||||
UPDATE ${stats_db_name}.datasource_tmp SET name='Other' WHERE name = 'Unknown Repository';
|
||||
UPDATE ${stats_db_name}.datasource_tmp SET yearofvalidation=null WHERE yearofvalidation = '-1';
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_languages purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_languages STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, langs.languages AS language
|
||||
SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, langs.languages AS language
|
||||
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_oids STORED AS PARQUET AS
|
||||
SELECT substr(d.id, 4) AS id, oids.ids AS oid
|
||||
SELECT /*+ COALESCE(100) */ substr(d.id, 4) AS id, oids.ids AS oid
|
||||
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
|
||||
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations purge; /*EOS*/
|
||||
|
||||
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
|
||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
SELECT /*+ COALESCE(100) */ substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||
FROM ${openaire_db_name}.relation r
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false;
|
||||
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
-- datasource sources:
|
||||
-- where the datasource info has been collected from.
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_sources purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.datasource_sources STORED AS PARQUET AS
|
||||
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
||||
select /*+ COALESCE(100) */ substr(d.id, 4) as id, substr(cf.key, 4) as datasource
|
||||
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
|
||||
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
|
||||
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
|
||||
SELECT datasource AS id, id AS result
|
||||
FROM ${stats_db_name}.result_datasources;
|
||||
FROM ${stats_db_name}.result_datasources; /*EOS*/
|
||||
|
|
|
@ -1,22 +1,24 @@
|
|||
set mapred.job.queue.name=analytics; /*EOS*/
|
||||
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
-- Organization table/view and Organization related tables/views
|
||||
----------------------------------------------------------------
|
||||
----------------------------------------------------------------
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization purge;
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.organization purge; /*EOS*/
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization STORED AS PARQUET AS
|
||||
SELECT substr(o.id, 4) as id,
|
||||
SELECT /*+ COALESCE(100) */ substr(o.id, 4) as id,
|
||||
o.legalname.value as name,
|
||||
o.legalshortname.value as legalshortname,
|
||||
o.country.classid as country
|
||||
FROM ${openaire_db_name}.organization o
|
||||
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE;
|
||||
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
|
||||
SELECT organization AS id, id AS datasource
|
||||
FROM ${stats_db_name}.datasource_organizations;
|
||||
FROM ${stats_db_name}.datasource_organizations; /*EOS*/
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS
|
||||
SELECT id AS project, organization as id
|
||||
FROM ${stats_db_name}.project_organizations;
|
||||
FROM ${stats_db_name}.project_organizations; /*EOS*/
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue