forked from D-Net/dnet-hadoop
Compare commits
91 Commits
df15a4dc9f
...
2f61054cd1
Author | SHA1 | Date |
---|---|---|
Claudio Atzori | 2f61054cd1 | |
Claudio Atzori | 83c90c7180 | |
Alessia Bardi | c48c43fa9e | |
Alessia Bardi | 8d3b60f446 | |
miconis | 611ca511db | |
miconis | 9646b9fd98 | |
Sandro La Bruzzo | 2557bb41f5 | |
Sandro La Bruzzo | b84e0cabeb | |
Sandro La Bruzzo | f258bbb927 | |
Sandro La Bruzzo | 991b06bd0b | |
Claudio Atzori | cb7efe12ac | |
dimitrispie | 3f25d2efb2 | |
dimitrispie | 13687fd887 | |
Claudio Atzori | 60a6a9a583 | |
Sandro La Bruzzo | 66702b1973 | |
Sandro La Bruzzo | 477cb10715 | |
Sandro La Bruzzo | be79d74e3d | |
Claudio Atzori | 35619b93ee | |
Claudio Atzori | 474117c2e8 | |
Claudio Atzori | a53acfbc06 | |
Alessia Bardi | b924276e18 | |
Antonis Lempesis | a1e1cf32d7 | |
Antonis Lempesis | f358cabb2b | |
Claudio Atzori | 7fa60e166e | |
Antonis Lempesis | 421d55265d | |
miconis | 853333bdde | |
Antonis Lempesis | 8b681dcf1b | |
Antonis Lempesis | 2943287d10 | |
Antonis Lempesis | dd2329849f | |
Antonis Lempesis | de9bf3a161 | |
Antonis Lempesis | 9b1936701c | |
Antonis Lempesis | 8fc89ae822 | |
Antonis Lempesis | 461bf90ca6 | |
Antonis Lempesis | 43852bac0e | |
Antonis Lempesis | f13cca7e83 | |
Antonis Lempesis | c6ada217a1 | |
Antonis Lempesis | 1250ae197f | |
Antonis Lempesis | ccee451dde | |
Sandro La Bruzzo | 370dddb2fa | |
Claudio Atzori | d64a942a76 | |
Claudio Atzori | a45b95ccc1 | |
Sandro La Bruzzo | 74afe43c3a | |
Claudio Atzori | 11e26c020a | |
Claudio Atzori | 5219d56be5 | |
Claudio Atzori | 4f78565c04 | |
Claudio Atzori | a6a38cca9e | |
Miriam Baglioni | 9bc4fd3b69 | |
Miriam Baglioni | 2fc89fc9b5 | |
Claudio Atzori | 081fe92a21 | |
Claudio Atzori | 576693d782 | |
Claudio Atzori | 6e3554a45e | |
Claudio Atzori | e725c88ebb | |
Claudio Atzori | f83dd70e1c | |
Claudio Atzori | 5f7330d407 | |
Claudio Atzori | 1923c1ce21 | |
Claudio Atzori | a9961a1835 | |
Alessia Bardi | 9594343725 | |
Claudio Atzori | d267dce520 | |
Claudio Atzori | 998b66855a | |
Claudio Atzori | 5b6844b969 | |
Claudio Atzori | ffdb2a3ea3 | |
Alessia Bardi | 9069958479 | |
Claudio Atzori | 77e8c6c7f7 | |
Claudio Atzori | 5947cddafc | |
Miriam Baglioni | 13cf444f85 | |
Claudio Atzori | 5e5f65a3c3 | |
Claudio Atzori | 9913b6073c | |
Enrico Ottonello | 2dc50c0999 | |
Enrico Ottonello | 66604bb2b4 | |
Enrico Ottonello | 7840cc6526 | |
Enrico Ottonello | a65667d217 | |
Sandro La Bruzzo | 10068c00ea | |
Miriam Baglioni | 1cdd09cd8e | |
Sandro La Bruzzo | 4cb65bc64a | |
Claudio Atzori | 734de62474 | |
Claudio Atzori | fa720c1da4 | |
Claudio Atzori | 9629569e22 | |
Claudio Atzori | f13e11e3f7 | |
Miriam Baglioni | f5486ffb14 | |
Claudio Atzori | e0061232e9 | |
Claudio Atzori | 28a66af425 | |
Claudio Atzori | 783988af06 | |
Enrico Ottonello | abdd0ade1f | |
Enrico Ottonello | d0945c3c78 | |
Enrico Ottonello | 1265dadc90 | |
Enrico Ottonello | 0821d8e97d | |
Enrico Ottonello | ae7bd24d79 | |
Enrico Ottonello | 4d6c473bf1 | |
Claudio Atzori | ea9b00ce56 | |
Claudio Atzori | 2e70aa43f0 | |
Enrico Ottonello | e13926cdd0 |
|
@ -0,0 +1,53 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.LocalFileSystem
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.{Date, Locale}
|
||||
import scala.io.Source
|
||||
|
||||
/** Spark job that computes the timestamp of the most recently collected Datacite `Result`.
  *
  * It loads the OAF records stored under `workingPath`, keeps the instances of `Result`,
  * takes the maximum `dateofcollection` (string comparison, as computed by Spark's `max`)
  * and converts it to epoch milliseconds.
  */
object SparkDownloadUpdateDatacite {

  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Classpath location of the JSON file describing the accepted command line arguments. */
  private val ParamsResource = "/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json"

  /** Pattern used to parse the `dateofcollection` values. */
  private val DatePattern = "yyyy-MM-dd'T'HH:mm:ssZ"

  /** Reads a classpath resource into a string, always closing the underlying source.
    *
    * @param path absolute classpath location of the resource
    * @throws IllegalArgumentException if the resource is not on the classpath
    */
  private def readResource(path: String): String = {
    val stream = Option(getClass.getResourceAsStream(path))
      .getOrElse(throw new IllegalArgumentException(s"Resource not found on classpath: $path"))
    val source = Source.fromInputStream(stream)
    try source.mkString
    finally source.close()
  }

  /** Job entry point.
    *
    * @param args command line arguments; `master`, `sourcePath`, `workingPath` and `namenode` are read
    * @throws IllegalStateException if no `Result` with a `dateofcollection` is found under `workingPath`
    */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf
    val parser = new ArgumentApplicationParser(readResource(ParamsResource))
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val workingPath = parser.get("workingPath")

    val hdfsuri = parser.get("namenode")
    log.info(s"namenode is $hdfsuri")
    log.info(s"sourcePath is $sourcePath")
    log.info(s"workingPath is $workingPath")

    val spark: SparkSession = SparkSession.builder().config(conf)
      .appName(getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]

    import spark.implicits._

    // max() yields null when there is no non-null value to aggregate, hence the Option wrapper.
    val maxDate: Option[String] = Option(
      spark.read.load(workingPath).as[Oaf]
        .filter(o => o.isInstanceOf[Result])
        .map(o => o.asInstanceOf[Result].getDateofcollection)
        .select(max("value"))
        .first()
        .getString(0)
    )

    val lastCollected: String = maxDate.getOrElse(
      throw new IllegalStateException(s"No Result with a dateofcollection found in $workingPath")
    )

    // SimpleDateFormat is not thread-safe: keep the instance local to this call.
    val ts: Long = new SimpleDateFormat(DatePattern, Locale.US).parse(lastCollected).getTime
    log.info(s"most recent dateofcollection is $lastCollected (epoch millis: $ts)")
  }

}
|
|
@ -4,6 +4,10 @@
|
|||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oafTargetPath</name>
|
||||
<description>the target path where the OAF records are stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endpoint</description>
|
||||
|
@ -13,15 +17,26 @@
|
|||
<value>100</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>exportLinks</name>
|
||||
<value>false</value>
|
||||
<description>instructs the transformation phase to produce the links or not</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ImportDatacite"/>
|
||||
<start to="resume_from"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="TransformDatacite">${wf:conf('resumeFrom') eq 'TransformDatacite'}</case>
|
||||
<default to="ImportDatacite"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
@ -45,12 +60,11 @@
|
|||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
<arg>--blocksize</arg><arg>${blocksize}</arg>
|
||||
</spark>
|
||||
<ok to="TransformJob"/>
|
||||
<ok to="TransformDatacite"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="TransformJob">
|
||||
<action name="TransformDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
|
@ -68,9 +82,9 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_oaf</arg>
|
||||
<arg>--targetPath</arg><arg>${oafTargetPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>false</arg>
|
||||
<arg>--exportLinks</arg><arg>${exportLinks}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkWhitelistSimRels extends AbstractSparkAction {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class);
|
||||
|
||||
private static final String WHITELIST_SEPARATOR = "####";
|
||||
|
||||
public SparkWhitelistSimRels(ArgumentApplicationParser parser, SparkSession spark) {
|
||||
super(parser, spark);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
new SparkWhitelistSimRels(parser, getSparkSession(conf))
|
||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(ISLookUpService isLookUpService)
|
||||
throws DocumentException, IOException, ISLookUpException, SAXException {
|
||||
|
||||
// read oozie parameters
|
||||
final String graphBasePath = parser.get("graphBasePath");
|
||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||
final String actionSetId = parser.get("actionSetId");
|
||||
final String workingPath = parser.get("workingPath");
|
||||
final int numPartitions = Optional
|
||||
.ofNullable(parser.get("numPartitions"))
|
||||
.map(Integer::valueOf)
|
||||
.orElse(NUM_PARTITIONS);
|
||||
final String whiteListPath = parser.get("whiteListPath");
|
||||
|
||||
log.info("numPartitions: '{}'", numPartitions);
|
||||
log.info("graphBasePath: '{}'", graphBasePath);
|
||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||
log.info("actionSetId: '{}'", actionSetId);
|
||||
log.info("workingPath: '{}'", workingPath);
|
||||
log.info("whiteListPath: '{}'", whiteListPath);
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
// file format: source####target
|
||||
Dataset<Tuple2<String, String>> whiteListRels = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(whiteListPath)
|
||||
// check if the line is in the correct format: id1####id2
|
||||
.filter(s -> s.contains(WHITELIST_SEPARATOR) && s.split(WHITELIST_SEPARATOR).length == 2)
|
||||
.map(s -> new Tuple2<>(s.split(WHITELIST_SEPARATOR)[0], s.split(WHITELIST_SEPARATOR)[1]))
|
||||
.rdd(),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||
|
||||
// for each dedup configuration
|
||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||
|
||||
final String entity = dedupConf.getWf().getEntityType();
|
||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||
log.info("Adding whitelist simrels for: '{}'", subEntity);
|
||||
|
||||
final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity);
|
||||
|
||||
Dataset<Tuple2<String, String>> entities = spark
|
||||
.createDataset(
|
||||
sc
|
||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||
.repartition(numPartitions)
|
||||
.mapToPair(
|
||||
(PairFunction<String, String, String>) s -> {
|
||||
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
||||
return new Tuple2<>(d.getIdentifier(), "present");
|
||||
})
|
||||
.rdd(),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||
|
||||
Dataset<Tuple2<String, String>> whiteListRels1 = whiteListRels
|
||||
.joinWith(entities, whiteListRels.col("_1").equalTo(entities.col("_1")), "inner")
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, String>>, Tuple2<String, String>>) Tuple2::_1,
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||
|
||||
Dataset<Tuple2<String, String>> whiteListRels2 = whiteListRels1
|
||||
.joinWith(entities, whiteListRels1.col("_2").equalTo(entities.col("_1")), "inner")
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, String>>, Tuple2<String, String>>) Tuple2::_1,
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||
|
||||
Dataset<Relation> whiteListSimRels = whiteListRels2
|
||||
.map(
|
||||
(MapFunction<Tuple2<String, String>, Relation>) r -> createSimRel(r._1(), r._2(), entity),
|
||||
Encoders.bean(Relation.class));
|
||||
|
||||
saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
|
||||
}
|
||||
}
|
||||
|
||||
private Relation createSimRel(String source, String target, String entity) {
|
||||
final Relation r = new Relation();
|
||||
r.setSource(source);
|
||||
r.setTarget(target);
|
||||
r.setSubRelType("dedupSimilarity");
|
||||
r.setRelClass("isSimilarTo");
|
||||
r.setDataInfo(new DataInfo());
|
||||
|
||||
switch (entity) {
|
||||
case "result":
|
||||
r.setRelType("resultResult");
|
||||
break;
|
||||
case "organization":
|
||||
r.setRelType("organizationOrganization");
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class UpdateOpenorgsJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(UpdateOpenorgsJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateSimRels.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateOpenorgsJob_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String apiUrl = parser.get("apiUrl");
|
||||
final int delay = Integer.parseInt(parser.get("delay"));
|
||||
|
||||
log.info("apiUrl: '{}'", apiUrl);
|
||||
log.info("delay: '{}'", delay);
|
||||
|
||||
APIResponse res = httpCall(apiUrl);
|
||||
while (res != null && res.getStatus().equals(ImportStatus.RUNNING)) {
|
||||
TimeUnit.MINUTES.sleep(delay);
|
||||
res = httpCall(apiUrl + "/status");
|
||||
}
|
||||
|
||||
if (res == null) {
|
||||
log.error("Openorgs Update FAILED: No response");
|
||||
throw new RuntimeException("Openorgs Update FAILED: No response");
|
||||
}
|
||||
|
||||
if (res.getStatus() == null || !res.getStatus().equals(ImportStatus.SUCCESS)) {
|
||||
log.error("Openorgs Update FAILED: '{}' - '{}'", res.getStatus(), res.getMessage());
|
||||
throw new RuntimeException(res.getMessage());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static APIResponse httpCall(final String url) throws Exception {
|
||||
final HttpGet req = new HttpGet(url);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String s = IOUtils.toString(response.getEntity().getContent());
|
||||
return (new ObjectMapper()).readValue(s, APIResponse.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class APIResponse {
|
||||
private String id;
|
||||
private Long dateStart;
|
||||
private Long dateEnd;
|
||||
private ImportStatus status;
|
||||
private String message;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Long getDateStart() {
|
||||
return dateStart;
|
||||
}
|
||||
|
||||
public void setDateStart(Long dateStart) {
|
||||
this.dateStart = dateStart;
|
||||
}
|
||||
|
||||
public Long getDateEnd() {
|
||||
return dateEnd;
|
||||
}
|
||||
|
||||
public void setDateEnd(Long dateEnd) {
|
||||
this.dateEnd = dateEnd;
|
||||
}
|
||||
|
||||
public ImportStatus getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
public void setStatus(ImportStatus status) {
|
||||
this.status = status;
|
||||
}
|
||||
|
||||
public String getMessage() {
|
||||
return message;
|
||||
}
|
||||
|
||||
public void setMessage(String message) {
|
||||
this.message = message;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Values the {@code status} property of an APIResponse can take.
 */
enum ImportStatus {
	SUCCESS,
	FAILED,
	RUNNING,
	NOT_LAUNCHED,
	NOT_YET_STARTED
}
|
|
@ -28,6 +28,11 @@
|
|||
<name>dbPwd</name>
|
||||
<description>password to access the OpenOrgs database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dbConnections</name>
|
||||
<value>10</value>
|
||||
<description>number of connections to the postgres db</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>path for the working directory</description>
|
||||
|
@ -223,7 +228,7 @@
|
|||
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
||||
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
||||
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
||||
<arg>--numConnections</arg><arg>20</arg>
|
||||
<arg>--numConnections</arg><arg>${dbConnections}</arg>
|
||||
</spark>
|
||||
<ok to="PrepareNewOrgs"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -254,19 +259,24 @@
|
|||
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
||||
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
||||
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
||||
<arg>--numConnections</arg><arg>20</arg>
|
||||
<arg>--numConnections</arg><arg>${dbConnections}</arg>
|
||||
</spark>
|
||||
<ok to="update_openorgs"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="update_openorgs">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>/usr/bin/curl</exec>
|
||||
<argument>${apiUrl}</argument>
|
||||
</shell>
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.dedup.UpdateOpenorgsJob</main-class>
|
||||
<arg>--apiUrl</arg><arg>${apiUrl}</arg>
|
||||
<arg>--delay</arg><arg>5</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -20,6 +20,10 @@
|
|||
<name>workingPath</name>
|
||||
<description>path for the working directory</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>whiteListPath</name>
|
||||
<description>path for the whitelist of similarity relations</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dedupGraphPath</name>
|
||||
<description>path for the output graph</description>
|
||||
|
@ -130,6 +134,34 @@
|
|||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
</spark>
|
||||
<ok to="WhitelistSimRels"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="WhitelistSimRels">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Add Whitelist Similarity Relations</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--whiteListPath</arg><arg>${whiteListPath}</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
</spark>
|
||||
<ok to="CreateMergeRel"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "api",
|
||||
"paramLongName": "apiUrl",
|
||||
"paramDescription": "the url of the API",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "delay",
|
||||
"paramDescription": "delay for the HTTP call in minutes",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"paramName": "la",
|
||||
"paramLongName": "isLookUpUrl",
|
||||
"paramDescription": "address for the LookUp",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "asi",
|
||||
"paramLongName": "actionSetId",
|
||||
"paramDescription": "action set identifier (name of the orchestrator)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "graphBasePath",
|
||||
"paramDescription": "the base path of the raw graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "path of the working directory",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "np",
|
||||
"paramLongName": "numPartitions",
|
||||
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "wl",
|
||||
"paramLongName": "whiteListPath",
|
||||
"paramDescription": "whitelist file path for the addition of custom simrels",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -5,13 +5,16 @@ import static java.nio.file.Files.createTempDirectory;
|
|||
|
||||
import static org.apache.spark.sql.functions.count;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -55,6 +58,10 @@ public class SparkDedupTest implements Serializable {
|
|||
private static String testOutputBasePath;
|
||||
private static String testDedupGraphBasePath;
|
||||
private static final String testActionSetId = "test-orchestrator";
|
||||
private static String whitelistPath;
|
||||
private static List<String> whiteList;
|
||||
|
||||
private static String WHITELIST_SEPARATOR = "####";
|
||||
|
||||
@BeforeAll
|
||||
public static void cleanUp() throws IOException, URISyntaxException {
|
||||
|
@ -71,6 +78,12 @@ public class SparkDedupTest implements Serializable {
|
|||
.toAbsolutePath()
|
||||
.toString();
|
||||
|
||||
whitelistPath = Paths
|
||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/whitelist.simrels.txt").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
whiteList = IOUtils.readLines(new FileReader(whitelistPath));
|
||||
|
||||
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
||||
|
||||
|
@ -202,6 +215,84 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
@Test
|
||||
@Order(2)
|
||||
void whitelistSimRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkWhitelistSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
|
||||
|
||||
parser
|
||||
.parseArgument(
|
||||
new String[] {
|
||||
"-i", testGraphBasePath,
|
||||
"-asi", testActionSetId,
|
||||
"-la", "lookupurl",
|
||||
"-w", testOutputBasePath,
|
||||
"-np", "50",
|
||||
"-wl", whitelistPath
|
||||
});
|
||||
|
||||
new SparkWhitelistSimRels(parser, spark).run(isLookUpService);
|
||||
|
||||
long orgs_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
long pubs_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "publication"))
|
||||
.count();
|
||||
|
||||
long ds_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "dataset"))
|
||||
.count();
|
||||
|
||||
long orp_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct"))
|
||||
.count();
|
||||
|
||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||
assertEquals(3082, orgs_simrel);
|
||||
assertEquals(7036, pubs_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6750, orp_simrel);
|
||||
|
||||
// entities simrels to be different from the number of previous step (new simrels in the whitelist)
|
||||
Dataset<Row> sw_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "software"));
|
||||
|
||||
// check if the first relation in the whitelist exists
|
||||
assertTrue(
|
||||
sw_simrel
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.toJavaRDD()
|
||||
.filter(
|
||||
rel -> rel.getSource().equalsIgnoreCase(whiteList.get(0).split(WHITELIST_SEPARATOR)[0])
|
||||
&& rel.getTarget().equalsIgnoreCase(whiteList.get(0).split(WHITELIST_SEPARATOR)[1]))
|
||||
.count() > 0);
|
||||
// check if the second relation in the whitelist exists
|
||||
assertTrue(
|
||||
sw_simrel
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.toJavaRDD()
|
||||
.filter(
|
||||
rel -> rel.getSource().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[0])
|
||||
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
||||
.count() > 0);
|
||||
|
||||
assertEquals(338, sw_simrel.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
void cutMergeRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -297,7 +388,7 @@ public class SparkDedupTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
@Order(4)
|
||||
void createMergeRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -353,7 +444,7 @@ public class SparkDedupTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
@Order(4)
|
||||
@Order(5)
|
||||
void createDedupRecordTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -394,13 +485,13 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
assertEquals(85, orgs_deduprecord);
|
||||
assertEquals(65, pubs_deduprecord);
|
||||
assertEquals(51, sw_deduprecord);
|
||||
assertEquals(49, sw_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
assertEquals(89, orp_deduprecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(5)
|
||||
@Order(6)
|
||||
void updateEntityTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -479,7 +570,7 @@ public class SparkDedupTest implements Serializable {
|
|||
assertEquals(838, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
assertEquals(200, softwares);
|
||||
assertEquals(198, softwares);
|
||||
assertEquals(389, dataset);
|
||||
assertEquals(517, otherresearchproduct);
|
||||
|
||||
|
@ -516,7 +607,7 @@ public class SparkDedupTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
@Order(6)
|
||||
@Order(7)
|
||||
void propagateRelationTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -566,7 +657,7 @@ public class SparkDedupTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
@Order(7)
|
||||
@Order(8)
|
||||
void testRelations() throws Exception {
|
||||
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
|
||||
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
50|r37b0ad08687::f645b9729d1e1025a72c57883f0f2cac####50|r37b0ad08687::4c55b436743b5c49fa32cd582fd9e1aa
|
||||
50|datacite____::a90f49f9fde5393c00633bea6e4e374a####50|datacite____::5f55cdee77303ba8a2bf9996c32a330c
|
|
@ -13,10 +13,30 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString,JArray}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
object SparkGenerateDoiBoost {
|
||||
|
||||
|
||||
/** Extracts the GRID identifiers declared in a JSON-serialised record.
  *
  * @param input a JSON document exposing a string `id` field and a `pid` field whose objects carry
  *              a `value` string and a `qualifier` object with a `classid` string
  * @return one `(id, "unresolved::grid::<lower-cased value>")` pair for every pid whose qualifier
  *         classid equals "GRID"; an empty list when there is none
  */
def extractIdGRID(input: String): List[(String, String)] = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val json: org.json4s.JValue = parse(input)

  val id: String = (json \ "id").extract[String]

  val grids: List[String] = for {
    JObject(pid)                            <- json \ "pid"
    JField("qualifier", JObject(qualifier)) <- pid
    JField("classid", JString(classid))     <- qualifier
    JField("value", JString(vl))            <- pid
    if classid == "GRID"
  } yield vl

  // List.map already yields a List: no need for collection.breakOut (removed in Scala 2.13)
  grids.map(g => (id, s"unresolved::grid::${g.toLowerCase}"))
}
|
||||
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
@ -36,6 +56,7 @@ object SparkGenerateDoiBoost {
|
|||
|
||||
val hostedByMapPath = parser.get("hostedByMapPath")
|
||||
val workingDirPath = parser.get("workingPath")
|
||||
val openaireOrganizationPath = parser.get("openaireOrganizationPath")
|
||||
|
||||
val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable {
|
||||
override def zero: Publication = new Publication
|
||||
|
@ -156,7 +177,7 @@ object SparkGenerateDoiBoost {
|
|||
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
|
||||
val pub:Publication = item._1._2
|
||||
val affiliation = item._2
|
||||
val affId:String = if (affiliation.GridId.isDefined) DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId.get) else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
||||
val affId:String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
|
||||
val r:Relation = new Relation
|
||||
r.setSource(pub.getId)
|
||||
r.setTarget(affId)
|
||||
|
@ -174,9 +195,35 @@ object SparkGenerateDoiBoost {
|
|||
r1.setDataInfo(pub.getDataInfo)
|
||||
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
|
||||
List(r, r1)
|
||||
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
|
||||
|
||||
|
||||
|
||||
|
||||
val unresolvedRels:Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
|
||||
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
(r.getSource, r)
|
||||
else if (r.getTarget.startsWith("unresolved"))
|
||||
(r.getTarget,r)
|
||||
else
|
||||
("resolved", r)
|
||||
})
|
||||
|
||||
val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2)
|
||||
|
||||
unresolvedRels.joinWith(openaireOrganization,unresolvedRels("_1").equalTo(openaireOrganization("_2")))
|
||||
.map { x =>
|
||||
val currentRels = x._1._2
|
||||
val currentOrgs = x._2
|
||||
if (currentOrgs!= null)
|
||||
if(currentRels.getSource.startsWith("unresolved"))
|
||||
currentRels.setSource(currentOrgs._1)
|
||||
else
|
||||
currentRels.setTarget(currentOrgs._1)
|
||||
currentRels
|
||||
}.write.save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||
|
||||
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
|
||||
val affiliation = item._2
|
||||
if (affiliation.GridId.isEmpty) {
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
[
|
||||
{"paramName": "m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true},
|
||||
{"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
|
||||
{"paramName": "oo", "paramLongName":"openaireOrganizationPath", "paramDescription": "the openaire Organization Path", "paramRequired": true},
|
||||
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affiliation Path", "paramRequired": true},
|
||||
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
||||
{"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
{"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
]
|
||||
|
|
|
@ -27,6 +27,12 @@
|
|||
<name>hostedByMapPath</name>
|
||||
<description>the hostedByMap Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>openaireOrganizationPath</name>
|
||||
<description>the OpenAire Organizations Path</description>
|
||||
</property>
|
||||
|
||||
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the Path of the sequence file action set</description>
|
||||
|
@ -214,6 +220,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
|
||||
<arg>--openaireOrganizationPath</arg><arg>${openaireOrganizationPath}</arg>
|
||||
<arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
|
||||
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
|
|
|
@ -69,7 +69,7 @@ public class PropagationConstant {
|
|||
PROPAGATION_DATA_INFO_TYPE,
|
||||
PROPAGATION_COUNTRY_INSTREPO_CLASS_ID,
|
||||
PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
return nc;
|
||||
}
|
||||
|
||||
|
@ -84,7 +84,8 @@ public class PropagationConstant {
|
|||
return di;
|
||||
}
|
||||
|
||||
public static Qualifier getQualifier(String inference_class_id, String inference_class_name, String qualifierSchema) {
|
||||
public static Qualifier getQualifier(String inference_class_id, String inference_class_name,
|
||||
String qualifierSchema) {
|
||||
Qualifier pa = new Qualifier();
|
||||
pa.setClassid(inference_class_id);
|
||||
pa.setClassname(inference_class_name);
|
||||
|
@ -108,7 +109,11 @@ public class PropagationConstant {
|
|||
r.setRelClass(rel_class);
|
||||
r.setRelType(rel_type);
|
||||
r.setSubRelType(subrel_type);
|
||||
r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name, ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
r
|
||||
.setDataInfo(
|
||||
getDataInfo(
|
||||
inference_provenance, inference_class_id, inference_class_name,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
|
@ -173,14 +173,17 @@ public class SparkOrcidToResultFromSemRelJob {
|
|||
if (toaddpid) {
|
||||
StructuredProperty p = new StructuredProperty();
|
||||
p.setValue(autoritative_author.getOrcid());
|
||||
p.setQualifier(getQualifier(ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES));
|
||||
p
|
||||
.setQualifier(
|
||||
getQualifier(
|
||||
ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES));
|
||||
p
|
||||
.setDataInfo(
|
||||
getDataInfo(
|
||||
PROPAGATION_DATA_INFO_TYPE,
|
||||
PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID,
|
||||
PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||
|
||||
Optional<List<StructuredProperty>> authorPid = Optional.ofNullable(author.getPid());
|
||||
if (authorPid.isPresent()) {
|
||||
|
|
|
@ -10,7 +10,6 @@ import java.util.List;
|
|||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
@ -22,6 +21,7 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import scala.Tuple2;
|
||||
|
@ -130,7 +130,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
|||
PROPAGATION_DATA_INFO_TYPE,
|
||||
PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID,
|
||||
PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS)));
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS)));
|
||||
propagatedContexts.add(newContext);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
|||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
@ -20,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -126,7 +126,7 @@ public class SparkResultToCommunityThroughSemRelJob {
|
|||
PROPAGATION_DATA_INFO_TYPE,
|
||||
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID,
|
||||
PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS)));
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS)));
|
||||
return newContext;
|
||||
}
|
||||
return null;
|
||||
|
|
|
@ -6,18 +6,130 @@ import eu.dnetlib.dhp.schema.oaf.Result
|
|||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.HttpGet
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.io.InputStream
|
||||
import scala.io.Source
|
||||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
object SparkCreateBaselineDataFrame {
|
||||
|
||||
|
||||
/**
  * Lists the PubMed update files that are newer than the given one.
  *
  * Downloads the HTML index of https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/ and reads
  * the href of every line starting with an anchor tag.
  *
  * @param maxFile name of the most recent file already available; only names that compare
  *                greater than it (plain string comparison) are returned
  * @return the pairs (file name, download url) of the ".gz" entries found in the index
  */
def requestBaseLineUpdatePage(maxFile:String):List[(String,String)] = {
  val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")

  val hrefPrefix = "<a href=\""

  val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
    val end = l.lastIndexOf("\">")
    val start = l.indexOf(hrefPrefix)

    // substring takes an end INDEX, not a length: the href value spans from the end of the
    // prefix up to the closing quote. The guard also rejects lines where the closing quote
    // precedes the end of the prefix, which would otherwise make substring throw.
    if (start >= 0 && end >= start + hrefPrefix.length)
      l.substring(start + hrefPrefix.length, end)
    else
      ""
  }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList

  result
}
|
||||
|
||||
|
||||
/**
  * Issues an HTTP GET on the given url and hands back the response body as a stream.
  *
  * Connect, connection-request and socket timeouts are all set to 60 seconds. The status
  * code is only printed, it is not checked. The HTTP client is not closed here; the caller
  * is responsible for closing the returned stream.
  *
  * @param url the address of the file to download
  * @return the content stream of the response entity
  */
def downloadBaselinePart(url:String):InputStream = {
  val request = new HttpGet(url)
  val timeoutMillis = 60 * 1000
  val requestConfig = RequestConfig.custom()
    .setConnectTimeout(timeoutMillis)
    .setConnectionRequestTimeout(timeoutMillis)
    .setSocketTimeout(timeoutMillis)
    .build()
  val httpClient = HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).build()
  val httpResponse = httpClient.execute(request)
  val statusCode = httpResponse.getStatusLine.getStatusCode
  println(s"get response with status${statusCode}")
  httpResponse.getEntity.getContent
}
|
||||
|
||||
/**
  * Downloads the page at the given url and returns its body as a string.
  *
  * The request is attempted up to 4 times; an attempt fails when it raises a non-fatal
  * exception or when the server answers with an error status (>= 400). Connect,
  * connection-request and socket timeouts are all set to 60 seconds.
  *
  * @param url the address to request
  * @return the response body, or the empty string when every attempt failed
  */
def requestPage(url:String):String = {
  val request = new HttpGet(url)
  val timeout = 60 // seconds
  val config = RequestConfig.custom()
    .setConnectTimeout(timeout * 1000)
    .setConnectionRequestTimeout(timeout * 1000)
    .setSocketTimeout(timeout * 1000).build()
  val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
  try {
    var tries = 4
    var page: Option[String] = None
    while (tries > 0 && page.isEmpty) {
      println(s"requesting ${request.getURI}")
      try {
        val response = client.execute(request)
        try {
          val status = response.getStatusLine.getStatusCode
          println(s"get response with status${status}")
          // 400 itself is an error status too, hence >= and not >
          if (status >= 400)
            tries -= 1
          else
            page = Some(IOUtils.toString(response.getEntity.getContent))
        } finally {
          // always release the connection, otherwise a retry after an error status
          // keeps the pooled connection of the previous attempt busy
          response.close()
        }
      } catch {
        // fatal errors (OutOfMemoryError, InterruptedException, ...) must not be retried
        case scala.util.control.NonFatal(e) =>
          println(s"Error on requesting ${request.getURI}")
          e.printStackTrace()
          tries -= 1
      }
    }
    page.getOrElse("")
  } finally {
    client.close()
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
  * Downloads into the baseline folder the PubMed update files that are not there yet.
  *
  * The greatest file name found in baselinePath (plain string comparison) is used as
  * watermark: every file returned by requestBaseLineUpdatePage for that name is downloaded
  * and written, overwriting, to baselinePath/<file name>.
  *
  * @param baselinePath  the folder holding the baseline files
  * @param hdfsServerUri the value used as "fs.defaultFS" to obtain the file system
  */
def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
  val conf = new Configuration
  conf.set("fs.defaultFS", hdfsServerUri)
  val fs = FileSystem.get(conf)
  val p = new Path(baselinePath)
  val files = fs.listFiles(p,false)
  var max_file = ""
  while (files.hasNext) {
    val c = files.next()
    val data = c.getPath.toString
    val fileName = data.substring(data.lastIndexOf("/")+1)

    if (fileName > max_file)
      max_file = fileName
  }

  val files_to_download = requestBaseLineUpdatePage(max_file)

  files_to_download.foreach { u =>
    val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
    val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
    try {
      val i = downloadBaselinePart(u._2)
      try {
        val buffer = new Array[Byte](1024)
        // read may fill only part of the buffer: write exactly the bytes received and
        // stop on end of stream (-1), not on a short or empty read
        var read = i.read(buffer)
        while (read != -1) {
          fsDataOutputStream.write(buffer, 0, read)
          read = i.read(buffer)
        }
      } finally {
        i.close()
      }
    } finally {
      fsDataOutputStream.close()
    }
    println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
  }
}
|
||||
|
||||
|
||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||
override def zero: PMArticle = new PMArticle
|
||||
|
||||
|
@ -51,6 +163,10 @@ object SparkCreateBaselineDataFrame {
|
|||
val targetPath = parser.get("targetPath")
|
||||
log.info("targetPath: {}", targetPath)
|
||||
|
||||
val hdfsServerUri = parser.get("hdfsServerUri")
|
||||
log.info("hdfsServerUri: {}", targetPath)
|
||||
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
val spark: SparkSession =
|
||||
|
@ -61,16 +177,15 @@ object SparkCreateBaselineDataFrame {
|
|||
.master(parser.get("master")).getOrCreate()
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val sc = spark.sparkContext
|
||||
|
||||
|
||||
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
|
||||
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
|
@ -87,7 +202,5 @@ object SparkCreateBaselineDataFrame {
|
|||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
|
||||
.filter(p => p!= null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
||||
//s"$workingPath/oaf/baseline_oaf"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,17 +5,15 @@ import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
|||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
|
||||
import org.apache.http.client.methods.HttpGet
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkDownloadEBILinks {
|
||||
|
||||
|
||||
def createEBILinks(pmid:Long):EBILinkItem = {
|
||||
|
||||
val res = requestLinks(pmid)
|
||||
|
@ -24,39 +22,42 @@ object SparkDownloadEBILinks {
|
|||
null
|
||||
}
|
||||
|
||||
/**
  * Downloads the page at the given url and returns its body as a string.
  *
  * The request is attempted up to 4 times; an attempt fails when it raises a non-fatal
  * exception or when the server answers with an error status (>= 400). Connect,
  * connection-request and socket timeouts are all set to 60 seconds.
  *
  * @param url the address to request
  * @return the response body, or the empty string when every attempt failed
  */
def requestPage(url:String):String = {
  val request = new HttpGet(url)
  val timeout = 60 // seconds
  val config = RequestConfig.custom()
    .setConnectTimeout(timeout * 1000)
    .setConnectionRequestTimeout(timeout * 1000)
    .setSocketTimeout(timeout * 1000).build()
  val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
  try {
    var tries = 4
    var page: Option[String] = None
    while (tries > 0 && page.isEmpty) {
      println(s"requesting ${request.getURI}")
      try {
        val response = client.execute(request)
        try {
          val status = response.getStatusLine.getStatusCode
          println(s"get response with status${status}")
          // 400 itself is an error status too, hence >= and not >
          if (status >= 400)
            tries -= 1
          else
            page = Some(IOUtils.toString(response.getEntity.getContent))
        } finally {
          // always release the connection, otherwise a retry after an error status
          // keeps the pooled connection of the previous attempt busy
          response.close()
        }
      } catch {
        // fatal errors (OutOfMemoryError, InterruptedException, ...) must not be retried
        case scala.util.control.NonFatal(e) =>
          println(s"Error on requesting ${request.getURI}")
          e.printStackTrace()
          tries -= 1
      }
    }
    page.getOrElse("")
  } finally {
    client.close()
  }
}
|
||||
|
||||
/**
  * Requests the Europe PMC datalinks, in JSON format, of the article with the given PMID.
  *
  * The HTTP handling (timeouts, retries, resource cleanup) is entirely delegated to
  * requestPage instead of being duplicated here.
  *
  * @param PMID the PubMed identifier of the article
  * @return whatever requestPage returns for the datalinks url
  */
def requestLinks(PMID:Long):String = {
  requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
}
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
|
|
@ -32,14 +32,9 @@ object SparkEBILinksToOaf {
|
|||
import spark.implicits._
|
||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
val ebi_rdd:Dataset[EBILinkItem] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => BioDBToOAF.extractEBILinksFromDump(s))).as[EBILinkItem]
|
||||
|
||||
ebi_rdd.write.mode(SaveMode.Overwrite).save(s"${sourcePath}_dataset")
|
||||
|
||||
val ebLinks:Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links!= null)
|
||||
|
||||
ebLinks.flatMap(j =>BioDBToOAF.parse_ebi_links(j.links))
|
||||
.repartition(4000)
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
DROP VIEW IF EXISTS ${hiveDbName}.result;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.publication p
|
||||
union all
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.dataset d
|
||||
union all
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.software s
|
||||
union all
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o;
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.otherresearchproduct o;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl","paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath","paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequential file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the URI of the HDFS server (name node) used as default file system", "paramRequired": true}
|
||||
]
|
|
@ -1,5 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath","paramDescription": "the working path ", "paramRequired": true}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path ", "paramRequired": true}
|
||||
]
|
|
@ -25,7 +25,6 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="GenerateBaselineDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
|
@ -43,6 +42,7 @@
|
|||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -74,7 +74,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Baselnie DataSet</name>
|
||||
<name>Create Baseline DataSet</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
|
|
|
@ -1,59 +1,67 @@
|
|||
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the Source Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="DownloadEBILinks"/>
|
||||
<start to="DownloadEBILinks"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DownloadEBILinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Incremental Download EBI Links</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
||||
<action name="DownloadEBILinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Incremental Download EBI Links</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="OverrideFolders"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="OverrideFolders">
|
||||
<fs>
|
||||
<delete path="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
|
||||
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
|||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
|
||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
|
||||
import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -50,6 +51,8 @@ class BioScholixTest extends AbstractVocabularyTest{
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testEBIData() = {
|
||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -76,6 +76,39 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
|
||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureTraining() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureAirQualityCopernicus() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureB2SharePlotSw() throws IOException, TransformerException {
|
||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-sw.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureB2SharePlotRelatedORP() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
||||
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
||||
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>r37b0ad08687::a8df7db30ae0e4e0b875a098df7b652f</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2021-10-07T01:56:56Z</dri:dateOfCollection>
|
||||
<dri:status>under curation</dri:status>
|
||||
<counters/>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">Using CAMS European air quality analysis from Copernicus
|
||||
Atmosphere Monitoring with RELIANCE services
|
||||
</title>
|
||||
<creator rank="1" name="Simone" surname="Mantovani" orcid_pending="0000-0003-3979-3645">Simone Mantovani</creator>
|
||||
<dateofacceptance>2021-10-07</dateofacceptance>
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies"
|
||||
schemename="dnet:result_typologies"/>
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages"/>
|
||||
<description>
|
||||
This notebook shows how to discover and access the Copernicus Atmosphere Monitoring products available in the RELIANCE datacube resources.
|
||||
The process is structured in 6 steps, including example of data analysis and visualization with the Python libraries installed in the Jupyter environment
|
||||
</description>
|
||||
<country classid="" classname="" schemeid="" schemename=""/>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">EOSC Jupyter Notebook
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">RELIANCE
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">Copernicus
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">Air quality
|
||||
</subject>
|
||||
|
||||
<relevantdate classid="" classname="" schemeid="" schemename=""/>
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate/>
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol=""/>
|
||||
<source/>
|
||||
<fulltext/>
|
||||
<format/>
|
||||
<storagedate/>
|
||||
<resourcetype classid="" classname="" schemeid="" schemename=""/>
|
||||
<device/>
|
||||
<size/>
|
||||
<version/>
|
||||
<lastmetadataupdate/>
|
||||
<metadataversionnumber/>
|
||||
<documentationUrl/>
|
||||
<codeRepositoryUrl/>
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename=""/>
|
||||
<contactperson/>
|
||||
<contactgroup/>
|
||||
<tool/>
|
||||
<originalId>oai:zenodo.org:5554786</originalId>
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6"/>
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types">oai:zenodo.org:5554786
|
||||
</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types">10.5281/zenodo.5554786
|
||||
</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance/>
|
||||
<provenanceaction classid="user:insert" classname="user:insert"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">
|
||||
corda__h2020::8771f523c34e38902d4921037d545ef8
|
||||
</to>
|
||||
<title>REsearch LIfecycle mAnagemeNt for Earth Science Communities and CopErnicus users in EOSC</title>
|
||||
<code>101017501</code>
|
||||
<acronym>RELIANCE</acronym>
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="EU" />
|
||||
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
|
||||
<funding_level_1 name="RIA">ec__________::EC::H2020::RIA</funding_level_1>
|
||||
</funding>
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="r37b0ad08687::a8df7db30ae0e4e0b875a098df7b652f">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6"/>
|
||||
<hostedby name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6"/>
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<dateofacceptance/>
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/5554786</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,288 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_dedup___::44fd8a9b5b79adb0783ac245b21e3127</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2019-09-19T07:43:31+0000</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>2019-09-19T07:43:31+0000</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
|
||||
<oaf:result>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<originalId>10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6</originalId>
|
||||
<originalId>10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906</originalId>
|
||||
<originalId>6a93c069-a167-44cb-bfe8-74c275637347</originalId>
|
||||
<originalId>50|r3730f562f9e::9b434fedc00d568b8e00611a7fa19f41</originalId>
|
||||
<originalId>10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016</originalId>
|
||||
<originalId>ada23067-496a-494f-bd82-6ffe3cf4f0fb</originalId>
|
||||
<originalId>50|r3730f562f9e::b9cd774e8126b6902d56f9a4aa03e1dc</originalId>
|
||||
<originalId>f3bd1041-422c-439d-8e68-c1d0711d130d</originalId>
|
||||
<originalId>50|r3730f562f9e::b847821a0ca5365b0d971dd89dea6bf1</originalId>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</pid>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">HCG16 L-band VLA C+D array final data
|
||||
</title>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<creator rank="1" name="Michael G." surname="Jones">Jones, Michael G.</creator>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<description>These are the reduced final data associated with the paper Jones et al. 2019 submitted
|
||||
to Astronomy &amp; Astrophysics. They are used by a mybinder (https://gke.mybinder.org/)
|
||||
executable environment to generate the final plots of that paper. The link for this environment
|
||||
is https://mybinder.org/v2/gh/AMIGA-IAA/hcg-16/master. The raw VLA D and C array data of HCG 16
|
||||
were collected by the Very Large Array (http://www.vla.nrao.edu/) in 1989 and 1999, under PI
|
||||
projects of Barbara Williams. The project numbers are AW234 and AW500 respectively. The file
|
||||
also includes a grz colour image and r-band image from DECaLS DR8
|
||||
(http://legacysurvey.org/decamls/), a GBT HI spectrum published in Borthakur et al. 2010 (ApJ
|
||||
710, 385), an HI data cube from HIPASS (https://www.atnf.csiro.au/research/multibeam/release/),
|
||||
and a source mask (and associated parameters file) for the HIPASS cube generated using SoFiA
|
||||
(https://github.com/SoFiA-Admin/SoFiA-2).
|
||||
</description>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">3.5.2.1.1 → Observational astronomy →
|
||||
Radio astronomy
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">HI
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">VLA
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">HCG16
|
||||
</subject>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">Various
|
||||
</subject>
|
||||
<language classid="und" classname="Undetermined" schemeid="dnet:languages"
|
||||
schemename="dnet:languages"/>
|
||||
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
|
||||
schemename="dnet:dataCite_date">2019-01-01
|
||||
</relevantdate>
|
||||
<publisher>https://b2share.eudat.eu</publisher>
|
||||
<resulttype classid="other" classname="other" schemeid="dnet:result_typologies"
|
||||
schemename="dnet:result_typologies"/>
|
||||
<resourcetype classid="UNKNOWN" classname="Unknown" schemeid="dnet:dataCite_resource"
|
||||
schemename="dnet:dataCite_resource"/>
|
||||
<datainfo>
|
||||
<inferred>true</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.8</trust>
|
||||
<inferenceprovenance>dedup-similarity-result-decisiontree-v2</inferenceprovenance>
|
||||
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="IsRelatedTo" scheme="dnet:result_result_relations" type="software">userclaim___::ee29372a239b79db3ac4c5debe44d6e6</to>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Plot scripts for HCG-16 Project</title>
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<result objidentifier="r3730f562f9e::9b434fedc00d568b8e00611a7fa19f41">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">HCG16 L-band VLA C+D
|
||||
array final data
|
||||
</title>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<publisher>B2SHARE</publisher>
|
||||
</result>
|
||||
<result objidentifier="doi_________::929c1e415f4bb04797679d1af7cb706f">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</pid>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">HCG16 L-band VLA C+D array final data
|
||||
</title>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<publisher>https://b2share.eudat.eu</publisher>
|
||||
</result>
|
||||
<result objidentifier="doi_________::e24d8a6399c5d8df9a78aed032573b81">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">HCG16 L-band VLA C+D array final data
|
||||
</title>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</pid>
|
||||
<publisher>https://b2share.eudat.eu</publisher>
|
||||
</result>
|
||||
<result objidentifier="r3730f562f9e::b9cd774e8126b6902d56f9a4aa03e1dc">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">HCG16 L-band VLA C+D
|
||||
array final data
|
||||
</title>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<publisher>B2SHARE</publisher>
|
||||
</result>
|
||||
<result objidentifier="doi_________::44fd8a9b5b79adb0783ac245b21e3127">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">HCG16 L-band VLA C+D array final data
|
||||
</title>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<publisher>https://b2share.eudat.eu</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</pid>
|
||||
</result>
|
||||
<result objidentifier="r3730f562f9e::b847821a0ca5365b0d971dd89dea6bf1">
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">HCG16 L-band VLA C+D
|
||||
array final data
|
||||
</title>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<publisher>B2SHARE</publisher>
|
||||
</result>
|
||||
<instance id="openaire____::55045bd2a65019fd8e6741a755395c8c">
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<hostedby name="Unknown Repository" id="openaire____::55045bd2a65019fd8e6741a755395c8c"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</pid>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>https://dx.doi.org/10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance id="openaire____::55045bd2a65019fd8e6741a755395c8c">
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<hostedby name="Unknown Repository" id="openaire____::55045bd2a65019fd8e6741a755395c8c"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</pid>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>https://dx.doi.org/10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance id="re3data_____::730f562f9efe8a3b3742d2da510d4335">
|
||||
<accessright classid="UNKNOWN" classname="not available" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<hostedby name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0000" classname="Unknown" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<alternateidentifier classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">
|
||||
https://doi.org10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</alternateidentifier>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>
|
||||
http://dx.doi.org/https://doi.org/10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance id="openaire____::55045bd2a65019fd8e6741a755395c8c">
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<hostedby name="Unknown Repository" id="openaire____::55045bd2a65019fd8e6741a755395c8c"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</pid>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>https://dx.doi.org/10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance id="re3data_____::730f562f9efe8a3b3742d2da510d4335">
|
||||
<accessright classid="UNKNOWN" classname="not available" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<hostedby name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0000" classname="Unknown" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<alternateidentifier classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">
|
||||
https://doi.org10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</alternateidentifier>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>
|
||||
http://dx.doi.org/https://doi.org/10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance id="re3data_____::730f562f9efe8a3b3742d2da510d4335">
|
||||
<accessright classid="UNKNOWN" classname="not available" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<hostedby name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<instancetype classid="0000" classname="Unknown" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<alternateidentifier classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">
|
||||
https://doi.org10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</alternateidentifier>
|
||||
<refereed classid="0000" classname="Unknown" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<webresource>
|
||||
<url>
|
||||
http://dx.doi.org/https://doi.org/10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,112 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>userclaim___::ee29372a239b79db3ac4c5debe44d6e6</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2021-10-07T12:42:54Z</dri:dateOfCollection>
|
||||
<counters/>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title">Plot scripts for HCG-16 Project
|
||||
</title>
|
||||
<creator rank="1" name="Michael G." surname="Jones">Jones, Michael G.</creator>
|
||||
<creator rank="2" name="Sebastián" surname="Luna-Valero">Luna-Valero, Sebastián</creator>
|
||||
<dateofacceptance>2021-09-30</dateofacceptance>
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies"
|
||||
schemename="dnet:result_typologies"/>
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages"/>
|
||||
<description>These are the notebooks to generate the final data plots of the paper Jones et al. 2019
|
||||
submitted to Astronomy &amp; Astrophysics. They can be used in a notebooks environment (like
|
||||
https://notebooks.egi.eu/) with the proper libraries installed. A mybinder
|
||||
(https://mybinder.org/)
|
||||
ready version can be started from https://mybinder.org/v2/gh/AMIGA-IAA/hcg-16/master. Data to
|
||||
generate plots is also available from B2SHARE:
|
||||
https://b2share.eudat.eu/records/a69a7b2dcc22449e8734552dde4d3906
|
||||
</description>
|
||||
<country classid="" classname="" schemeid="" schemename=""/>
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies">EOSC Jupyter Notebook
|
||||
</subject>
|
||||
<relevantdate classid="" classname="" schemeid="" schemename=""/>
|
||||
<publisher>B2SHARE</publisher>
|
||||
<embargoenddate/>
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol=""/>
|
||||
<source/>
|
||||
<fulltext/>
|
||||
<format/>
|
||||
<storagedate/>
|
||||
<resourcetype classid="" classname="" schemeid="" schemename=""/>
|
||||
<device/>
|
||||
<size/>
|
||||
<version/>
|
||||
<lastmetadataupdate/>
|
||||
<metadataversionnumber/>
|
||||
<documentationUrl/>
|
||||
<codeRepositoryUrl/>
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename=""/>
|
||||
<contactperson/>
|
||||
<contactgroup/>
|
||||
<tool/>
|
||||
<originalId>userclaim___::ee29372a239b79db3ac4c5debe44d6e6</originalId>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types">10.23728/b2share.adf6e2e942b04561a8640c449b48c14a
|
||||
</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance/>
|
||||
<provenanceaction classid="user:insert" classname="user:insert"
|
||||
schemeid="dnet:provenanceActions"
|
||||
schemename="dnet:provenanceActions"/>
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="IsRelatedTo" scheme="dnet:result_result_relations" type="otherresearchproduct">doi_dedup___::44fd8a9b5b79adb0783ac245b21e3127</to>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">HCG16 L-band VLA C+D array final data</title>
|
||||
<dateofacceptance>2019-01-01</dateofacceptance>
|
||||
<publisher>https://b2share.eudat.eu</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
|
||||
</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
|
||||
</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset"
|
||||
trust="0.9">10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
|
||||
</pid>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="userclaim___::ee29372a239b79db3ac4c5debe44d6e6">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<collectedfrom name="B2FIND" id="re3data_____::730f562f9efe8a3b3742d2da510d4335"/>
|
||||
<hostedby name="B2SHARE" id="re3data_____::ad3609c351bd520edf6f10f5e0d9b877"/>
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<dateofacceptance>2021-09-30</dateofacceptance>
|
||||
<webresource>
|
||||
<url>http://dx.doi.org/10.23728/b2share.adf6e2e942b04561a8640c449b48c14a</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,71 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_dedup___::ab57f086011a9ae23d1165211dc6e04b</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2020-11-03T05:39:50+0000</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>2020-11-03T05:39:50+0000</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">EGI-Foundation/data-transfer-pilot: Include libraries in environment.yml</title>
|
||||
<creator rank="1" name="Giuseppe" surname="La Rocca">Giuseppe La Rocca</creator>
|
||||
<creator rank="2" name="Enol" surname="Fernández">Enol Fernández</creator>
|
||||
<creator rank="3" name="Andrea" surname="Manzi">Andrea Manzi</creator>
|
||||
<dateofacceptance>2020-11-03</dateofacceptance>
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
|
||||
<description>This notebook is used to demonstrate how a scientist from one of the PaNOSC RIs can use the resources provided by EGI to perform analysis on the data sets obtained during an experiment.</description>
|
||||
<country classid="" classname="" schemeid="" schemename="" />
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">EOSC Jupyter Notebook</subject>
|
||||
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date" schemename="dnet:dataCite_date">2020-11-03</relevantdate>
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate />
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
|
||||
<source />
|
||||
<fulltext />
|
||||
<format />
|
||||
<storagedate />
|
||||
<resourcetype classid="" classname="" schemeid="" schemename="" />
|
||||
<device />
|
||||
<size />
|
||||
<version />
|
||||
<lastmetadataupdate />
|
||||
<metadataversionnumber />
|
||||
<documentationUrl />
|
||||
<codeRepositoryUrl />
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
|
||||
<contactperson />
|
||||
<contactgroup />
|
||||
<tool />
|
||||
<originalId>oai:zenodo.org:4218562</originalId>
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:4218562</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.4195418</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.4218562</pid><bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance />
|
||||
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
|
||||
</datainfo>
|
||||
<rels></rels>
|
||||
<children>
|
||||
<instance id="r37b0ad08687::dec0d8520e726f2adda9a51280ac7299">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<hostedby name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<dateofacceptance>2020-11-03</dateofacceptance>
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/4218562</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,72 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_dedup___::8539a8de8996e01350f0de8ca4899b7f</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2021-09-22T08:53:13Z</dri:dateOfCollection>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">EGI-Foundation/training-notebooks-seadatanet: Version 0.4</title>
|
||||
<creator rank="1" name="" surname="">Enol Fernández</creator>
|
||||
<dateofacceptance>2019-12-04</dateofacceptance>
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
|
||||
<description>A sample notebook using SeaDataNet data to plot a map that shows surface temperature of Black Sea, Arctic Sea and Baltic Sea. The data is available at EGI DataHub with PID http://hdl.handle.net/21.T15999/3Byz9Cw (run at EGI Notebooks service for easy access to data). This release uses the correct path of the data share from the EGI DataHub.</description>
|
||||
<country classid="" classname="" schemeid="" schemename="" />
|
||||
<subject classid="keyword" classname="keyword" schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">EOSC Jupyter Notebook</subject>
|
||||
<relevantdate classid="" classname="" schemeid="" schemename="" />
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate />
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
|
||||
<source />
|
||||
<fulltext />
|
||||
<format />
|
||||
<storagedate />
|
||||
<resourcetype classid="" classname="" schemeid="" schemename="" />
|
||||
<device />
|
||||
<size />
|
||||
<version />
|
||||
<lastmetadataupdate />
|
||||
<metadataversionnumber />
|
||||
<documentationUrl />
|
||||
<codeRepositoryUrl />
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
|
||||
<contactperson />
|
||||
<contactgroup />
|
||||
<tool />
|
||||
<originalId>oai:zenodo.org:3561323</originalId>
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:3561323</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.3561323</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.3443996</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.3475539</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false" provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.3475785</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance />
|
||||
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
|
||||
</datainfo>
|
||||
<rels></rels>
|
||||
<children>
|
||||
<instance id="r37b0ad08687::eb430fb7438e1533ba95d6aa50a477eb">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
|
||||
<collectedfrom name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<hostedby name="Zenodo" id="re3data_____::7b0ad08687b2c960d5aeef06f811d5e6" />
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<dateofacceptance />
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/3561323</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -15,7 +15,13 @@
|
|||
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classid"/>
|
||||
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classname"/>
|
||||
<FIELD indexable="true" multivalued="true" name="datasourcesubject" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='datasource']/subjects"/>
|
||||
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/><!-- ORGANIZATION FIELDS -->
|
||||
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/>
|
||||
<!-- datasource fields for EOSC -->
|
||||
<FIELD indexable="true" name="datasourcejurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/jurisdiction"/>
|
||||
<FIELD indexable="true" name="datasourcethematic" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/thematic"/>
|
||||
<FIELD indexable="true" name="datasourceknowledge_graph" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/knowledgegraph"/>
|
||||
<FIELD indexable="true" name="datasourcecontentpolicy" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/contentpolicy"/>
|
||||
<!-- ORGANIZATION FIELDS -->
|
||||
<FIELD indexable="true" name="organizationlegalshortname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalshortname)"/>
|
||||
<FIELD indexable="true" name="organizationlegalname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalname)"/>
|
||||
<FIELD indexable="true" name="organizationalternativenames" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//alternativeNames)"/>
|
||||
|
@ -28,7 +34,8 @@
|
|||
<FIELD indexable="true" name="organizationecenterprise" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecenterprise"/>
|
||||
<FIELD indexable="true" name="organizationecsmevalidated" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecsmevalidated"/>
|
||||
<FIELD indexable="true" name="organizationecnutscode" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecnutscode"/>
|
||||
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/><!-- PROJECT FIELDS -->
|
||||
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/>
|
||||
<!-- PROJECT FIELDS -->
|
||||
<FIELD indexable="true" name="projectcode" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
|
||||
<FIELD indexable="true" name="projectcode_nt" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
|
||||
<FIELD indexable="true" name="projectacronym" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/acronym"/>
|
||||
|
@ -79,6 +86,7 @@
|
|||
<FIELD indexable="true" multivalued="true" name="resultauthor_nt" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
|
||||
<FIELD indexable="true" multivalued="true" name="authorid" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']"/>
|
||||
<FIELD indexable="true" multivalued="true" name="authoridtype" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']/local-name()"/>
|
||||
<FIELD indexable="true" multivalued="true" name="orcidtypevalue" result="false" stat="false" type="string_ci" value="string-join((./@*[local-name() = 'orcid' or local-name() = 'orcid_pending'], ./@*[local-name() = 'orcid' or local-name() = 'orcid_pending']/local-name()), '||' )" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
|
||||
<FIELD indexable="true" name="resulthostingdatasource" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='hostedby']"/>
|
||||
<FIELD indexable="true" name="resulthostingdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='hostedby']/@id)"/>
|
||||
<FIELD indexable="true" name="resulthostingdatasourcename" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='hostedby']/@name)"/>
|
||||
|
@ -105,7 +113,7 @@
|
|||
<FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
|
||||
<FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
|
||||
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
|
||||
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
|
||||
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
|
||||
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
|
||||
<FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
|
||||
<FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
|
||||
|
@ -130,7 +138,7 @@
|
|||
<FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
|
||||
<FIELD indexable="true" name="collectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@name | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@name)"/>
|
||||
<FIELD indexable="true" name="originalid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/*[local-name()='originalId']"/>
|
||||
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/pid/text()"/>
|
||||
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*/pid/text()|//*[local-name()='instance']/*[local-name()='alternateidentifier']/text())"/>
|
||||
<FIELD indexable="true" name="pidclassid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classid)"/>
|
||||
<FIELD indexable="true" name="pidclassname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classname)"/>
|
||||
<FIELD indexable="true" name="inferred" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//datainfo/inferred"/>
|
||||
|
|
|
@ -16,7 +16,7 @@ curl -L ${CONTEXT_API}/contexts/?type=ri,community -H "accept: application/json"
|
|||
cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
|
||||
cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
|
||||
cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
|
||||
cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
|
||||
cat categories.csv | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
|
||||
|
||||
echo "uploading context data to hdfs"
|
||||
hdfs dfs -mkdir ${TMP}
|
||||
|
|
|
@ -15,5 +15,5 @@ hdfs dfs -copyToLocal $SCRIPT_PATH
|
|||
echo "Creating indicators"
|
||||
impala-shell -q "invalidate metadata"
|
||||
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -c -f -
|
||||
cat step16_7-createIndicatorsTables.sql | impala-shell -d $TARGET -f -
|
||||
cat step16-createIndicatorsTables.sql | impala-shell -d $TARGET -f -
|
||||
echo "Indicators created"
|
|
@ -9,16 +9,9 @@ fi
|
|||
export SOURCE=$1
|
||||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
export SCRIPT_PATH=$4
|
||||
|
||||
echo "Getting file from " $4
|
||||
hdfs dfs -copyToLocal $4
|
||||
|
||||
echo "Creating observatory database"
|
||||
impala-shell -q "drop database if exists ${TARGET} cascade"
|
||||
impala-shell -q "create database if not exists ${TARGET}"
|
||||
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
|
||||
cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g1 | impala-shell -f -
|
||||
impala-shell -q "invalidate metadata;"
|
||||
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/compute stats ${TARGET}.\1;/" | impala-shell -f -
|
||||
echo "Impala shell finished"
|
||||
|
||||
echo "Updating shadow observatory database"
|
|
@ -0,0 +1,16 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
export SOURCE=$1
|
||||
export TARGET=$2
|
||||
export SHADOW=$3
|
||||
|
||||
echo "Creating observatory database"
|
||||
impala-shell -q "drop database if exists ${TARGET} cascade"
|
||||
impala-shell -q "create database if not exists ${TARGET}"
|
||||
impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
|
|
@ -23,6 +23,11 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
|
|||
SELECT *
|
||||
FROM ${external_stats_db_name}.rndexpediture;
|
||||
|
||||
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
|
||||
SELECT *
|
||||
FROM ${external_stats_db_name}.licenses_normalized;
|
||||
|
||||
|
||||
------------------------------------------------------------------------------------------------
|
||||
------------------------------------------------------------------------------------------------
|
||||
-- Creation date of the database
|
||||
|
|
|
@ -233,4 +233,50 @@ on p.id= tmp.id;
|
|||
|
||||
create table indi_pub_has_abstract stored as parquet as
|
||||
select distinct publication.id, coalesce(abstract, 1) has_abstract
|
||||
from publication;
|
||||
from publication;
|
||||
|
||||
create table indi_with_orcid stored as parquet as
|
||||
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
|
||||
from result r
|
||||
left outer join (select id, 1 as has_orcid from result_orcid) tmp
|
||||
on r.id= tmp.id
|
||||
|
||||
create table indi_funded_result_with_fundref stored as parquet as
|
||||
select distinct r.id, coalesce(fundref, 0) as fundref
|
||||
from project_results r
|
||||
left outer join (select distinct id, 1 as fundref from project_results
|
||||
where provenance='Harvested') tmp
|
||||
on r.id= tmp.id
|
||||
|
||||
create table indi_result_org_country_collab stored as parquet as
|
||||
with tmp as
|
||||
(select o.id as id, o.country , ro.id as result,r.type from organization o
|
||||
join result_organization ro on o.id=ro.organization
|
||||
join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
||||
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.id<>o2.id and o1.country<>o2.country
|
||||
group by o1.id, o1.type,o2.country
|
||||
|
||||
create table indi_result_org_collab stored as parquet as
|
||||
with tmp as
|
||||
(select o.id, ro.id as result,r.type from organization o
|
||||
join result_organization ro on o.id=ro.organization
|
||||
join result r on r.id=ro.id)
|
||||
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.id<>o2.id
|
||||
group by o1.id, o2.id, o1.type
|
||||
|
||||
create table indi_result_org_country_collab stored as parquet as
|
||||
with tmp as
|
||||
(select o.id as id, o.country , ro.id as result,r.type from organization o
|
||||
join result_organization ro on o.id=ro.organization
|
||||
join result r on r.id=ro.id where o.country <> 'UNKNOWN')
|
||||
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
|
||||
from tmp as o1
|
||||
join tmp as o2 on o1.result=o2.result
|
||||
where o1.id<>o2.id and o1.country<>o2.country
|
||||
group by o1.id, o1.type,o2.country
|
|
@ -1,62 +0,0 @@
|
|||
----------------------------------------------------
|
||||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
||||
-- Peer reviewed:
|
||||
-- Results that have been collected from Crossref
|
||||
create table ${stats_db_name}.result_peerreviewed as
|
||||
with peer_reviewed as (
|
||||
select distinct r.id as id
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_sources rs on rs.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rs.datasource
|
||||
where d.name='Crossref')
|
||||
select distinct peer_reviewed.id as id, true as peer_reviewed
|
||||
from peer_reviewed
|
||||
union all
|
||||
select distinct r.id as id, false as peer_reviewed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join peer_reviewed pr on pr.id=r.id
|
||||
where pr.id is null;
|
||||
|
||||
-- Green OA:
|
||||
-- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal.
|
||||
create table ${stats_db_name}.result_greenoa as
|
||||
with result_green as (
|
||||
select distinct r.id as id
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource
|
||||
left outer join (
|
||||
select rd.id from ${stats_db_name}.result_datasources rd
|
||||
join ${stats_db_name}.datasource d on rd.datasource=d.id
|
||||
join ${stats_db_name}.datasource_sources sds on sds.id=d.id
|
||||
join ${stats_db_name}.datasource sd on sd.id=sds.datasource
|
||||
where sd.name='DOAJ-ARTICLES'
|
||||
) as doaj on doaj.id=r.id
|
||||
where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null)
|
||||
select distinct result_green.id, true as green
|
||||
from result_green
|
||||
union all
|
||||
select distinct r.id as id, false as green
|
||||
from ${stats_db_name}.result r
|
||||
left outer join result_green rg on rg.id=r.id
|
||||
where rg.id is null;
|
||||
|
||||
-- GOLD OA:
|
||||
-- OA results that have been harvested from a DOAJ journal.
|
||||
create table ${stats_db_name}.result_gold as
|
||||
with result_gold as (
|
||||
select distinct r.id as id
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource
|
||||
join ${stats_db_name}.datasource_sources sds on sds.id=d.id
|
||||
join ${stats_db_name}.datasource sd on sd.id=sds.datasource
|
||||
where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles')
|
||||
select distinct result_gold.id, true as gold
|
||||
from result_gold
|
||||
union all
|
||||
select distinct r.id, false as gold
|
||||
from ${stats_db_name}.result r
|
||||
where r.id not in (select id from result_gold);
|
|
@ -0,0 +1,22 @@
|
|||
----------------------------------------------------
|
||||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
||||
-- Peer reviewed:
|
||||
create table ${stats_db_name}.result_peerreviewed as
|
||||
select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
|
||||
left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
|
||||
|
||||
-- Green OA:
|
||||
create table ${stats_db_name}.result_greenoa as
|
||||
select r.id, case when green.green_oa=1 then true else false end as green
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
|
||||
|
||||
-- GOLD OA:
|
||||
create table ${stats_db_name}.result_gold as
|
||||
select r.id, case when gold.gold_oa=1 then true else false end as gold
|
||||
from ${stats_db_name}.result r
|
||||
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
|
|
@ -104,25 +104,42 @@ create table TARGET.project_results as select id as result, project as id from T
|
|||
compute stats TARGET.project_results;
|
||||
|
||||
-- indicators
|
||||
create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_green_oa;
|
||||
|
||||
create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_grey_lit;
|
||||
|
||||
create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_doi_from_crossref;
|
||||
|
||||
create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_gold_oa;
|
||||
|
||||
create view TARGET.indi_dataset_avg_year_content_oa as select * from SOURCE.indi_dataset_avg_year_content_oa orig;
|
||||
create view TARGET.indi_dataset_avg_year_context_oa as select * from SOURCE.indi_dataset_avg_year_context_oa orig;
|
||||
create view TARGET.indi_dataset_avg_year_country_oa as select * from SOURCE.indi_dataset_avg_year_country_oa orig;
|
||||
|
||||
create view TARGET.indi_other_avg_year_content_oa as select * from SOURCE.indi_other_avg_year_content_oa orig;
|
||||
create view TARGET.indi_other_avg_year_context_oa as select * from SOURCE.indi_other_avg_year_context_oa orig;
|
||||
create view TARGET.indi_other_avg_year_country_oa as select * from SOURCE.indi_other_avg_year_country_oa orig;
|
||||
|
||||
create view TARGET.indi_project_datasets_count as select * from SOURCE.indi_project_datasets_count orig;
|
||||
create view TARGET.indi_project_otherresearch_count as select * from SOURCE.indi_project_otherresearch_count orig;
|
||||
create view TARGET.indi_project_pubs_count as select * from SOURCE.indi_project_pubs_count orig;
|
||||
create view TARGET.indi_project_software_count as select * from SOURCE.indi_project_software_count orig;
|
||||
|
||||
create view TARGET.indi_pub_avg_year_content_oa as select * from SOURCE.indi_pub_avg_year_content_oa orig;
|
||||
create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig;
|
||||
create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig;
|
||||
|
||||
create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_green_oa;
|
||||
create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_grey_lit;
|
||||
create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_doi_from_crossref;
|
||||
create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_gold_oa;
|
||||
create table TARGET.indi_pub_has_abstract as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_has_abstract;
|
||||
create table TARGET.indi_pub_has_cc_licence as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_has_cc_licence;
|
||||
create table TARGET.indi_pub_has_cc_licence_url as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||
compute stats TARGET.indi_pub_has_cc_licence_url;
|
||||
|
||||
create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig;
|
||||
create view TARGET.indi_software_avg_year_context_oa as select * from SOURCE.indi_software_avg_year_context_oa orig;
|
||||
create view TARGET.indi_software_avg_year_country_oa as select * from SOURCE.indi_software_avg_year_country_oa orig;
|
||||
|
||||
--denorm
|
||||
alter table TARGET.result rename to TARGET.res_tmp;
|
||||
|
||||
|
|
|
@ -1,259 +1,561 @@
|
|||
create table TARGET.result_affiliated_country stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed, r.type, c.code as ccode, c.name as cname
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_organization ro on ro.id=r.id
|
||||
join SOURCE.organization o on o.id=ro.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
|
||||
create table ${observatory_db_name}.result_cc_licence stored as parquet as
|
||||
select r.id, coalesce(rln.count, 0) > 0 as cc_licence
|
||||
from ${stats_db_name}.result r
|
||||
left outer join (
|
||||
select rl.id, sum(case when lower(rln.normalized) like 'cc-%' then 1 else 0 end) as count
|
||||
from ${stats_db_name}.result_licenses rl
|
||||
left outer join ${stats_db_name}.licenses_normalized rln on rl.type=rln.license
|
||||
group by rl.id
|
||||
) rln on rln.id=r.id;
|
||||
|
||||
create table TARGET.result_affiliated_year stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_organization ro on ro.id=r.id
|
||||
join SOURCE.organization o on o.id=ro.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
|
||||
-- Distinct-result counts per country for results linked through
-- result_organization to organizations in countries with continent_name Europe,
-- split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per year and country for results linked through
-- result_organization to organizations in countries with continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_year_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    r.year,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
|
||||
-- Distinct-result counts per year for results linked through result_organization
-- to organizations in countries with continent_name Europe, split by the
-- quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_year STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    r.year
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    r.year;
|
||||
|
||||
-- Distinct-result counts per datasource name for results linked through
-- result_organization to organizations in countries with continent_name Europe.
-- Datasources are outer-joined, so dname is NULL for results without one.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_datasource STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    d.name AS dname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_datasources rd ON rd.id = r.id
LEFT JOIN SOURCE.datasource d ON d.id = rd.datasource
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
|
||||
-- Distinct-result counts per year and country for results linked through
-- result_organization to organizations in countries with continent_name Europe,
-- split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_year_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    r.year,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    r.year,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per datasource name and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe. Datasources are outer-joined, so dname may be NULL.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_datasource_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    d.name AS dname,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_datasources rd ON rd.id = r.id
LEFT JOIN SOURCE.datasource d ON d.id = rd.datasource
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
|
||||
-- Distinct-result counts per datasource name for results linked through
-- result_organization to organizations in countries with continent_name Europe,
-- split by the quality flags selected below.
-- Datasources are outer-joined, so dname may be NULL.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_datasource STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    d.name AS dname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_datasources rd ON rd.id = r.id
LEFT JOIN ${stats_db_name}.datasource d ON d.id = rd.datasource
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    d.name;
|
||||
|
||||
-- Distinct-result counts per organization name for results linked through
-- result_organization to organizations in countries with continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_organization STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    o.name AS oname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
|
||||
-- Distinct-result counts per datasource name and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe, split by the quality flags selected below.
-- Datasources are outer-joined, so dname may be NULL.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_datasource_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    d.name AS dname,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_datasources rd ON rd.id = r.id
LEFT JOIN ${stats_db_name}.datasource d ON d.id = rd.datasource
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    d.name,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per organization name and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_organization_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    o.name AS oname,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
|
||||
-- Distinct-result counts per organization name for results linked through
-- result_organization to organizations in countries with continent_name Europe,
-- split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_organization STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    o.name AS oname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    o.name;
|
||||
|
||||
-- Distinct-result counts per project funder for results linked through
-- result_organization to organizations in countries with continent_name Europe.
-- Projects are inner-joined, so results without a project row are excluded.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_funder STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    p.funder AS pfunder
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
INNER JOIN SOURCE.result_projects rp ON rp.id = r.id
INNER JOIN SOURCE.project p ON p.id = rp.project
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
|
||||
-- Distinct-result counts per organization name and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe, split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_organization_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    o.name AS oname,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    o.name,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per project funder and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe. Projects are inner-joined, so results without a project row are excluded.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_affiliated_funder_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    p.funder AS pfunder,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_organization ro ON ro.id = r.id
INNER JOIN SOURCE.organization o ON o.id = ro.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
INNER JOIN SOURCE.result_projects rp ON rp.id = r.id
INNER JOIN SOURCE.project p ON p.id = rp.project
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
|
||||
-- Distinct-result counts per project funder for results linked through
-- result_organization to organizations in countries with continent_name Europe,
-- split by the quality flags selected below.
-- Projects are inner-joined, so results without a project row are excluded.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_funder STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    p.funder AS pfunder
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
INNER JOIN ${stats_db_name}.result_projects rp ON rp.id = r.id
INNER JOIN ${stats_db_name}.project p ON p.id = rp.project
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    p.funder;
|
||||
|
||||
-- Distinct-result counts per country for results found in repository-type
-- datasources (the four types listed in the join) whose organization, via
-- datasource_organizations, is in a country with continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_deposited_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_datasources rd ON rd.id = r.id
INNER JOIN SOURCE.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN SOURCE.datasource_organizations dor ON dor.id = d.id
INNER JOIN SOURCE.organization o ON o.id = dor.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
|
||||
-- Distinct-result counts per project funder and country for results linked
-- through result_organization to organizations in countries with continent_name
-- Europe, split by the quality flags selected below.
-- Projects are inner-joined, so results without a project row are excluded.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_affiliated_funder_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    p.funder AS pfunder,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_organization ro ON ro.id = r.id
INNER JOIN ${stats_db_name}.organization o ON o.id = ro.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
INNER JOIN ${stats_db_name}.result_projects rp ON rp.id = r.id
INNER JOIN ${stats_db_name}.project p ON p.id = rp.project
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    p.funder,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per year for results found in repository-type
-- datasources (the four types listed in the join) whose organization, via
-- datasource_organizations, is in a country with continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_deposited_year STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    r.year
FROM SOURCE.result r
INNER JOIN SOURCE.result_datasources rd ON rd.id = r.id
INNER JOIN SOURCE.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN SOURCE.datasource_organizations dor ON dor.id = d.id
INNER JOIN SOURCE.organization o ON o.id = dor.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
|
||||
-- Distinct-result counts per country for results found in repository-type
-- datasources (the four types listed in the join) whose organization, via
-- datasource_organizations, is in a country with continent_name Europe,
-- split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_deposited_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_datasources rd ON rd.id = r.id
INNER JOIN ${stats_db_name}.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN ${stats_db_name}.datasource_organizations dor ON dor.id = d.id
INNER JOIN ${stats_db_name}.organization o ON o.id = dor.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per year and country for results found in
-- repository-type datasources (the four types listed in the join) whose
-- organization, via datasource_organizations, is in a country with
-- continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_deposited_year_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    r.year,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_datasources rd ON rd.id = r.id
INNER JOIN SOURCE.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN SOURCE.datasource_organizations dor ON dor.id = d.id
INNER JOIN SOURCE.organization o ON o.id = dor.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
|
||||
-- Distinct-result counts per year for results found in repository-type
-- datasources (the four types listed in the join) whose organization, via
-- datasource_organizations, is in a country with continent_name Europe,
-- split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_deposited_year STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    r.year
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_datasources rd ON rd.id = r.id
INNER JOIN ${stats_db_name}.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN ${stats_db_name}.datasource_organizations dor ON dor.id = d.id
INNER JOIN ${stats_db_name}.organization o ON o.id = dor.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    r.year;
|
||||
|
||||
-- Distinct-result counts per datasource name for results found in
-- repository-type datasources (the four types listed in the join) whose
-- organization, via datasource_organizations, is in a country with
-- continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_deposited_datasource STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    d.name AS dname
FROM SOURCE.result r
INNER JOIN SOURCE.result_datasources rd ON rd.id = r.id
INNER JOIN SOURCE.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN SOURCE.datasource_organizations dor ON dor.id = d.id
INNER JOIN SOURCE.organization o ON o.id = dor.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
|
||||
-- Distinct-result counts per year and country for results found in
-- repository-type datasources (the four types listed in the join) whose
-- organization, via datasource_organizations, is in a country with
-- continent_name Europe, split by the quality flags selected below.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE ${observatory_db_name}.result_deposited_year_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    rln.cc_licence,
    r.abstract AS abstract,
    r.authors > 1 AS multiple_authors,
    rpc.count > 1 AS multiple_projects,
    rfc.count > 1 AS multiple_funders,
    r.type,
    r.year,
    c.code AS ccode,
    c.name AS cname
FROM ${stats_db_name}.result r
INNER JOIN ${stats_db_name}.result_datasources rd ON rd.id = r.id
INNER JOIN ${stats_db_name}.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN ${stats_db_name}.datasource_organizations dor ON dor.id = d.id
INNER JOIN ${stats_db_name}.organization o ON o.id = dor.organization
INNER JOIN ${stats_db_name}.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN ${stats_db_name}.result_licenses rl ON rl.id = r.id
LEFT JOIN ${stats_db_name}.result_pids pids ON pids.id = r.id
LEFT JOIN ${observatory_db_name}.result_cc_licence rln ON rln.id = r.id
LEFT JOIN ${stats_db_name}.result_projectcount rpc ON rpc.id = r.id
LEFT JOIN ${stats_db_name}.result_fundercount rfc ON rfc.id = r.id
GROUP BY
    r.green,
    r.gold,
    rl.type IS NOT NULL,
    pids.pid IS NOT NULL,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END,
    r.peer_reviewed,
    r.type,
    r.abstract,
    rln.cc_licence,
    r.authors > 1,
    rpc.count > 1,
    rfc.count > 1,
    r.year,
    c.code,
    c.name;
|
||||
|
||||
-- Distinct-result counts per datasource name and country for results found in
-- repository-type datasources (the four types listed in the join) whose
-- organization, via datasource_organizations, is in a country with
-- continent_name Europe.
-- The oa flag keeps its CASE form so a NULL access_mode yields false, not NULL.
CREATE TABLE TARGET.result_deposited_datasource_country STORED AS PARQUET AS
SELECT
    COUNT(DISTINCT r.id) AS total,
    r.green,
    r.gold,
    rl.type IS NOT NULL AS licence,
    pids.pid IS NOT NULL AS pid,
    CASE WHEN r.access_mode IN ('Open Access', 'Open Source') THEN TRUE ELSE FALSE END AS oa,
    r.peer_reviewed,
    r.type,
    d.name AS dname,
    c.code AS ccode,
    c.name AS cname
FROM SOURCE.result r
INNER JOIN SOURCE.result_datasources rd ON rd.id = r.id
INNER JOIN SOURCE.datasource d
    ON d.id = rd.datasource
    AND d.type IN ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
INNER JOIN SOURCE.datasource_organizations dor ON dor.id = d.id
INNER JOIN SOURCE.organization o ON o.id = dor.organization
INNER JOIN SOURCE.country c ON c.code = o.country AND c.continent_name = 'Europe'
LEFT JOIN SOURCE.result_licenses rl ON rl.id = r.id
LEFT JOIN SOURCE.result_pids pids ON pids.id = r.id
GROUP BY r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
|
||||
create table ${observatory_db_name}.result_deposited_datasource stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
d.name as dname
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name;
|
||||
|
||||
create table TARGET.result_deposited_organization stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_datasources rd on rd.id=r.id
|
||||
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||
join SOURCE.organization o on o.id=dor.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
|
||||
create table ${observatory_db_name}.result_deposited_datasource_country stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
d.name as dname, c.code as ccode, c.name as cname
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, d.name, c.code, c.name;
|
||||
|
||||
create table TARGET.result_deposited_organization_country stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_datasources rd on rd.id=r.id
|
||||
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||
join SOURCE.organization o on o.id=dor.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
|
||||
create table ${observatory_db_name}.result_deposited_organization stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
o.name as oname
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name;
|
||||
|
||||
create table TARGET.result_deposited_funder stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed, r.type, p.funder as pfunder
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_datasources rd on rd.id=r.id
|
||||
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||
join SOURCE.organization o on o.id=dor.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
join SOURCE.result_projects rp on rp.id=r.id
|
||||
join SOURCE.project p on p.id=rp.project
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
|
||||
create table ${observatory_db_name}.result_deposited_organization_country stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
o.name as oname, c.code as ccode, c.name as cname
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, o.name, c.code, c.name;
|
||||
|
||||
create table TARGET.result_deposited_funder_country stored as parquet as
|
||||
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
|
||||
from SOURCE.result r
|
||||
join SOURCE.result_datasources rd on rd.id=r.id
|
||||
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join SOURCE.datasource_organizations dor on dor.id=d.id
|
||||
join SOURCE.organization o on o.id=dor.organization
|
||||
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
|
||||
join SOURCE.result_projects rp on rp.id=r.id
|
||||
join SOURCE.project p on p.id=rp.project
|
||||
left outer join SOURCE.result_licenses rl on rl.id=r.id
|
||||
left outer join SOURCE.result_pids pids on pids.id=r.id
|
||||
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
|
||||
create table ${observatory_db_name}.result_deposited_funder stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
p.funder as pfunder
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder;
|
||||
|
||||
compute stats TARGET.result_affiliated_country;
|
||||
compute stats TARGET.result_affiliated_year;
|
||||
compute stats TARGET.result_affiliated_year_country;
|
||||
compute stats TARGET.result_affiliated_datasource;
|
||||
compute stats TARGET.result_affiliated_datasource_country;
|
||||
compute stats TARGET.result_affiliated_organization;
|
||||
compute stats TARGET.result_affiliated_organization_country;
|
||||
compute stats TARGET.result_affiliated_funder;
|
||||
compute stats TARGET.result_affiliated_funder_country;
|
||||
compute stats TARGET.result_deposited_country;
|
||||
compute stats TARGET.result_deposited_year;
|
||||
compute stats TARGET.result_deposited_year_country;
|
||||
compute stats TARGET.result_deposited_datasource;
|
||||
compute stats TARGET.result_deposited_datasource_country;
|
||||
compute stats TARGET.result_deposited_organization;
|
||||
compute stats TARGET.result_deposited_organization_country;
|
||||
compute stats TARGET.result_deposited_funder;
|
||||
compute stats TARGET.result_deposited_funder_country;
|
||||
create table ${observatory_db_name}.result_deposited_funder_country stored as parquet as
|
||||
select
|
||||
count(distinct r.id) as total,
|
||||
r.green,
|
||||
r.gold,
|
||||
case when rl.type is not null then true else false end as licence,
|
||||
case when pids.pid is not null then true else false end as pid,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
|
||||
r.peer_reviewed,
|
||||
rln.cc_licence,
|
||||
r.abstract as abstract,
|
||||
r.authors > 1 as multiple_authors,
|
||||
rpc.count > 1 as multiple_projects,
|
||||
rfc.count > 1 as multiple_funders,
|
||||
r.type,
|
||||
p.funder as pfunder, c.code as ccode, c.name as cname
|
||||
from ${stats_db_name}.result r
|
||||
join ${stats_db_name}.result_datasources rd on rd.id=r.id
|
||||
join ${stats_db_name}.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
|
||||
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
|
||||
join ${stats_db_name}.organization o on o.id=dor.organization
|
||||
join ${stats_db_name}.country c on c.code=o.country and c.continent_name='Europe'
|
||||
join ${stats_db_name}.result_projects rp on rp.id=r.id
|
||||
join ${stats_db_name}.project p on p.id=rp.project
|
||||
left outer join ${stats_db_name}.result_licenses rl on rl.id=r.id
|
||||
left outer join ${stats_db_name}.result_pids pids on pids.id=r.id
|
||||
left outer join ${observatory_db_name}.result_cc_licence rln on rln.id=r.id
|
||||
left outer join ${stats_db_name}.result_projectcount rpc on rpc.id=r.id
|
||||
left outer join ${stats_db_name}.result_fundercount rfc on rfc.id=r.id
|
||||
group by r.green, r.gold, case when rl.type is not null then true else false end, case when pids.pid is not null then true else false end,
|
||||
case when r.access_mode in ('Open Access', 'Open Source') then true else false end, r.peer_reviewed, r.type, abstract,
|
||||
cc_licence, r.authors > 1, rpc.count > 1, rfc.count > 1, p.funder, c.code, c.name;
|
|
@ -239,14 +239,51 @@
|
|||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step16"/>
|
||||
<ok to="Step15_5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step16">
|
||||
<action name="Step15_5">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step16.sql</script>
|
||||
<script>scripts/step15_5.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Contexts"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Contexts">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>contexts.sh</exec>
|
||||
<argument>${context_api_url}</argument>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<file>contexts.sh</file>
|
||||
</shell>
|
||||
<ok to="Step16-createIndicatorsTables"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step16-createIndicatorsTables">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>indicators.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/step16-createIndicatorsTables.sql</argument>
|
||||
<file>indicators.sh</file>
|
||||
</shell>
|
||||
<ok to="Step16_1-definitions"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step16_1-definitions">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step16_1-definitions.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
|
@ -261,48 +298,11 @@
|
|||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step16_6"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step16_6">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step16_6.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>openaire_db_name=${openaire_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="Step16_7-createIndicatorsTables"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step16_7-createIndicatorsTables">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>indicators.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/step16_7-createIndicatorsTables.sql</argument>
|
||||
<file>indicators.sh</file>
|
||||
</shell>
|
||||
<ok to="Step17"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step17">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>contexts.sh</exec>
|
||||
<argument>${context_api_url}</argument>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<file>contexts.sh</file>
|
||||
</shell>
|
||||
<ok to="Step19"/>
|
||||
<ok to="Step19-finalize"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step19">
|
||||
<action name="Step19-finalize">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
|
@ -326,20 +326,44 @@
|
|||
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
|
||||
<file>monitor.sh</file>
|
||||
</shell>
|
||||
<ok to="step21-createObservatoryDB-pre"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="step21-createObservatoryDB-pre">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>observatory-pre.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<file>observatory-pre.sh</file>
|
||||
</shell>
|
||||
<ok to="step21-createObservatoryDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="step21-createObservatoryDB">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step21-createObservatoryDB.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>observatory_db_name=${observatory_db_name}</param>
|
||||
</hive2>
|
||||
<ok to="step21-createObservatoryDB-post"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="step21-createObservatoryDB-post">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>observatory.sh</exec>
|
||||
<exec>observatory-post.sh</exec>
|
||||
<argument>${stats_db_name}</argument>
|
||||
<argument>${observatory_db_name}</argument>
|
||||
<argument>${observatory_db_shadow_name}</argument>
|
||||
<argument>${wf:appPath()}/scripts/step21-createObservatoryDB.sql</argument>
|
||||
<file>observatory.sh</file>
|
||||
<file>observatory-post.sh</file>
|
||||
</shell>
|
||||
<ok to="Step22"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue