forked from D-Net/dnet-hadoop
Compare commits: master...deduptesti (2 commits)
Author | SHA1 | Date
---|---|---
miconis | d47352cbc7 |
miconis | b260fee787 |
ISLookupClientFactory.java
@@ -1,22 +1,15 @@

package eu.dnetlib.dhp.utils;

import java.util.Map;

import javax.xml.ws.BindingProvider;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class ISLookupClientFactory {

	private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);

	private static int requestTimeout = 60000 * 10;
	private static int connectTimeout = 60000 * 10;
	private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);

	public static ISLookUpService getLookUpService(final String isLookupUrl) {
		return getServiceStub(ISLookUpService.class, isLookupUrl);

@@ -28,25 +21,6 @@ public class ISLookupClientFactory {
		final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
		jaxWsProxyFactory.setServiceClass(clazz);
		jaxWsProxyFactory.setAddress(endpoint);

		final T service = (T) jaxWsProxyFactory.create();

		if (service instanceof BindingProvider) {
			log
				.info(
					"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
					BindingProvider.class.getName(), requestTimeout, connectTimeout);

			Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();

			requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
			requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
			requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
			requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
			requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
			requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
		}

		return service;
		return (T) jaxWsProxyFactory.create();
	}
}
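For reference, client code obtains the proxy through the factory in a single call; a minimal sketch (the endpoint URL is hypothetical):

	// hypothetical IS lookup endpoint
	final String isLookupUrl = "http://localhost:8280/is/services/isLookUp";

	// builds a CXF JAX-WS client proxy for the remote ISLookUp service
	final ISLookUpService lookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);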
pom.xml
@@ -14,37 +14,6 @@

	<description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description>

	<build>
		<plugins>
			<plugin>
				<groupId>net.alchim31.maven</groupId>
				<artifactId>scala-maven-plugin</artifactId>
				<version>4.0.1</version>
				<executions>
					<execution>
						<id>scala-compile-first</id>
						<phase>initialize</phase>
						<goals>
							<goal>add-source</goal>
							<goal>compile</goal>
						</goals>
					</execution>
					<execution>
						<id>scala-test-compile</id>
						<phase>process-test-resources</phase>
						<goals>
							<goal>testCompile</goal>
						</goals>
					</execution>
				</executions>
				<configuration>
					<scalaVersion>${scala.version}</scalaVersion>
				</configuration>
			</plugin>

		</plugins>
	</build>

	<dependencies>

		<dependency>
@@ -1,6 +1,8 @@

package eu.dnetlib.dhp.schema.common;

import java.security.Key;

import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
OafUtils.scala (file deleted)
@@ -1,90 +0,0 @@
package eu.dnetlib.dhp.schema.scholexplorer

import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}

object OafUtils {

	def generateKeyValue(key: String, value: String): KeyValue = {
		val kv: KeyValue = new KeyValue()
		kv.setKey(key)
		kv.setValue(value)
		kv.setDataInfo(generateDataInfo("0.9"))
		kv
	}

	def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
		val di = new DataInfo
		di.setDeletedbyinference(false)
		di.setInferred(false)
		di.setInvisible(false)
		di.setTrust(trust)
		di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
		di
	}

	def createQualifier(cls: String, sch: String): Qualifier = {
		createQualifier(cls, cls, sch, sch)
	}

	def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
		val q: Qualifier = new Qualifier
		q.setClassid(classId)
		q.setClassname(className)
		q.setSchemeid(schemeId)
		q.setSchemename(schemeName)
		q
	}

	def asField[T](value: T): Field[T] = {
		val tmp = new Field[T]
		tmp.setValue(value)
		tmp
	}

	def createSP(value: String, classId: String, className: String, schemeId: String, schemeName: String): StructuredProperty = {
		val sp = new StructuredProperty
		sp.setQualifier(createQualifier(classId, className, schemeId, schemeName))
		sp.setValue(value)
		sp
	}

	def createSP(value: String, classId: String, className: String, schemeId: String, schemeName: String, dataInfo: DataInfo): StructuredProperty = {
		val sp = new StructuredProperty
		sp.setQualifier(createQualifier(classId, className, schemeId, schemeName))
		sp.setValue(value)
		sp.setDataInfo(dataInfo)
		sp
	}

	def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
		val sp = new StructuredProperty
		sp.setQualifier(createQualifier(classId, schemeId))
		sp.setValue(value)
		sp
	}

	def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
		val sp = new StructuredProperty
		sp.setQualifier(createQualifier(classId, schemeId))
		sp.setValue(value)
		sp.setDataInfo(dataInfo)
		sp
	}

}
EventFactory.java
@@ -34,10 +34,7 @@ public class EventFactory {
		final MappedFields map = createMapFromResult(updateInfo);

		final String eventId = calculateEventId(
			updateInfo.getTopicPath(), updateInfo.getTargetDs().getOpenaireId(), updateInfo
				.getTarget()
				.getOpenaireId(),
			updateInfo.getHighlightValueAsString());
			updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());

		res.setEventId(eventId);
		res.setProducerId(PRODUCER_ID);

@@ -96,13 +93,11 @@ public class EventFactory {
		return map;
	}

	private static String calculateEventId(final String topic, final String dsId, final String publicationId,
		final String value) {
	private static String calculateEventId(final String topic, final String publicationId, final String value) {
		return "event-"
			+ DigestUtils.md5Hex(topic).substring(0, 4) + "-"
			+ DigestUtils.md5Hex(dsId).substring(0, 4) + "-"
			+ DigestUtils.md5Hex(publicationId).substring(0, 7) + "-"
			+ DigestUtils.md5Hex(value).substring(0, 5);
			+ DigestUtils.md5Hex(topic).substring(0, 6) + "-"
			+ DigestUtils.md5Hex(publicationId).substring(0, 8) + "-"
			+ DigestUtils.md5Hex(value).substring(0, 8);
	}

	private static long calculateExpiryDate(final long now) {
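The id scheme change is easiest to see in isolation: the datasource id component is dropped and the hash prefixes widen from 4/7/5 to 6/8/8 hex chars. A minimal, runnable sketch of the new composition (all inputs hypothetical), using the same commons-codec DigestUtils as the job:

	import org.apache.commons.codec.digest.DigestUtils;

	public class EventIdSketch {
		public static void main(String[] args) {
			final String topic = "ENRICH/MISSING/ABSTRACT";      // hypothetical topic path
			final String publicationId = "50|doi_________::abc"; // hypothetical OpenAIRE id
			final String value = "some highlight value";         // hypothetical highlight

			// same composition as the updated calculateEventId: 6 + 8 + 8 hex chars
			final String eventId = "event-"
				+ DigestUtils.md5Hex(topic).substring(0, 6) + "-"
				+ DigestUtils.md5Hex(publicationId).substring(0, 8) + "-"
				+ DigestUtils.md5Hex(value).substring(0, 8);

			System.out.println(eventId); // e.g. event-xxxxxx-xxxxxxxx-xxxxxxxx
		}
	}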
IndexOnESJob.java
@@ -48,13 +48,12 @@ public class IndexOnESJob {

		final JavaRDD<String> inputRdd = ClusterUtils
			.readPath(spark, eventsPath, Event.class)
			// .limit(10000) // TODO REMOVE
			.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
			.javaRDD();

		final Map<String, String> esCfg = new HashMap<>();
		// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");

		esCfg.put("es.index.auto.create", "false");
		esCfg.put("es.nodes", indexHost);
		esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
		esCfg.put("es.batch.write.retry.count", "8");
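Two of these settings matter operationally: per elasticsearch-hadoop semantics, es.mapping.id makes the eventId field the Elasticsearch document _id, so re-running the job overwrites events instead of duplicating them, while es.batch.write.retry.count bounds how many times a failed bulk batch is retried.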
workflow.xml (dhp-broker-events)
@@ -64,11 +64,182 @@
		</configuration>
	</global>

	<start to="index_es"/>
	<start to="join_entities_step0"/>

	<kill name="Kill">
		<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>

	<action name="join_entities_step0">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>JoinStep0</name>
			<class>eu.dnetlib.dhp.broker.oa.JoinStep0Job</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="join_entities_step1"/>
		<error to="Kill"/>
	</action>

	<action name="join_entities_step1">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>JoinStep1</name>
			<class>eu.dnetlib.dhp.broker.oa.JoinStep1Job</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="join_entities_step2"/>
		<error to="Kill"/>
	</action>

	<action name="join_entities_step2">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>JoinStep2</name>
			<class>eu.dnetlib.dhp.broker.oa.JoinStep2Job</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="join_entities_step3"/>
		<error to="Kill"/>
	</action>

	<action name="join_entities_step3">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>JoinStep3</name>
			<class>eu.dnetlib.dhp.broker.oa.JoinStep3Job</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="join_entities_step4"/>
		<error to="Kill"/>
	</action>

	<action name="join_entities_step4">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>JoinStep4</name>
			<class>eu.dnetlib.dhp.broker.oa.JoinStep4Job</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="prepare_groups"/>
		<error to="Kill"/>
	</action>

	<action name="prepare_groups">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>PrepareGroupsJob</name>
			<class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="generate_events"/>
		<error to="Kill"/>
	</action>

	<action name="generate_events">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>GenerateEventsJob</name>
			<class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
			<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
			<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
			<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
		</spark>
		<ok to="index_es"/>
		<error to="Kill"/>
	</action>

	<action name="index_es">
		<spark xmlns="uri:oozie:spark-action:0.2">

@@ -91,10 +262,34 @@
			<arg>--index</arg><arg>${esIndexName}</arg>
			<arg>--esHost</arg><arg>${esIndexHost}</arg>
		</spark>
		<ok to="stats"/>
		<error to="Kill"/>
	</action>

	<action name="stats">
		<spark xmlns="uri:oozie:spark-action:0.2">
			<master>yarn</master>
			<mode>cluster</mode>
			<name>GenerateStatsJob</name>
			<class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
			<jar>dhp-broker-events-${projectVersion}.jar</jar>
			<spark-opts>
				--executor-cores=${sparkExecutorCores}
				--executor-memory=${sparkExecutorMemory}
				--driver-memory=${sparkDriverMemory}
				--conf spark.extraListeners=${spark2ExtraListeners}
				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
				--conf spark.sql.shuffle.partitions=3840
			</spark-opts>
			<arg>--graphPath</arg><arg>${graphInputPath}</arg>
			<arg>--workingPath</arg><arg>${workingPath}</arg>
		</spark>
		<ok to="End"/>
		<error to="Kill"/>
	</action>

	<end name="End"/>

</workflow-app>
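With this change the workflow no longer starts at indexing: the start node moves to join_entities_step0 and the actions chain as join_entities_step0 → join_entities_step1 → join_entities_step2 → join_entities_step3 → join_entities_step4 → prepare_groups → generate_events → index_es → stats → End, with every action routing failures to the Kill node.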
DatePicker.java
@@ -18,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.Field;

public class DatePicker {

	private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
	public static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
	private static final String DATE_DEFAULT_SUFFIX = "01-01";
	private static final int YEAR_LB = 1300;
	private static final int YEAR_UB = Year.now().getValue() + 5;

@@ -114,7 +114,7 @@ public class DatePicker {
		}
	}

	private static boolean inRange(final String date) {
	public static boolean inRange(final String date) {
		final int year = Integer.parseInt(substringBefore(date, "-"));
		return year >= YEAR_LB && year <= YEAR_UB;
	}
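Both members go from private to public so the new IdGenerator.isWellformed (introduced later in this changeset) can reuse them; a minimal sketch of that reuse, with a hypothetical date string:

	// the validation IdGenerator.isWellformed performs on a date of acceptance
	String value = "2017-05-03"; // hypothetical value
	boolean wellformed = value.matches(DatePicker.DATE_PATTERN) // yyyy-MM-dd shape
		&& DatePicker.inRange(value); // year within [1300, current year + 5]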
DedupRecordFactory.java
@@ -1,11 +1,13 @@

package eu.dnetlib.dhp.oa.dedup;

import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;

@@ -13,15 +15,12 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;

import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class DedupRecordFactory {

	private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class);

@@ -81,11 +80,16 @@ public class DedupRecordFactory {

		final Collection<String> dates = Lists.newArrayList();
		final List<List<Author>> authors = Lists.newArrayList();
		final List<Identifier> bestPids = Lists.newArrayList(); // best pids list

		entities
			.forEachRemaining(
				t -> {
					T duplicate = t._2();

					// prepare the list of pids to use for the id generation
					bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));

					entity.mergeFrom(duplicate);
					if (ModelSupport.isSubClass(duplicate, Result.class)) {
						Result r1 = (Result) duplicate;

@@ -94,6 +98,7 @@ public class DedupRecordFactory {
						if (r1.getDateofacceptance() != null)
							dates.add(r1.getDateofacceptance().getValue());
					}

				});

		// set authors and date

@@ -102,10 +107,13 @@ public class DedupRecordFactory {
			((Result) entity).setAuthor(AuthorMerger.merge(authors));
		}

		entity.setId(id);
		entity.setId(IdGenerator.generate(bestPids, id));

		entity.setLastupdatetimestamp(ts);
		entity.setDataInfo(dataInfo);

		return entity;
	}

}
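The substantive change to entityMerger: while folding each duplicate into the dedup record it now also collects the duplicate's best pid via IdGenerator.bestPidtoIdentifier, and the final record id is no longer the caller-supplied dedup id but IdGenerator.generate(bestPids, id). The IdGenerator added below defines how that pid is chosen.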
IdGenerator.java (new file)
@@ -0,0 +1,90 @@
package eu.dnetlib.dhp.oa.dedup;

import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang.NullArgumentException;
import org.apache.commons.lang.StringUtils;

import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

public class IdGenerator implements Serializable {

	private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
	public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
	public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";

	// pick the best pid from the list (considering date and pid type)
	public static String generate(List<Identifier> pids, String defaultID) {
		if (pids == null || pids.size() == 0)
			return defaultID;

		Optional<Identifier> bp = pids.stream()
			.max(Identifier::compareTo);

		if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
			return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" + DedupUtility.md5(bp.get().getOriginalID());
		} else {
			return bp.get().getOriginalID().split("\\|")[0] + "|" + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(bp.get().getPid().getValue());
		}

	}

	// pick the best pid from the entity; returns a singleton list to save time in the call
	public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {

		if (entity.getPid() == null || entity.getPid().size() == 0)
			return Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()));

		Optional<StructuredProperty> bp = entity.getPid().stream()
			.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
			.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));

		return bp.map(structuredProperty ->
			Lists.newArrayList(new Identifier(structuredProperty, extractDate(entity, sdf), PidType.classidValueOf(structuredProperty.getQualifier().getClassid()), entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()))
		).orElseGet(() -> Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));

	}

	// create the prefix (length = 12): "dedup_" + pidType, padded with underscores
	public static String createPrefix(String pidType) {

		StringBuilder prefix = new StringBuilder("dedup_" + pidType);

		while (prefix.length() < 12) {
			prefix.append("_");
		}
		return prefix.toString().substring(0, 12);

	}

	// extracts the date from the record; if the date is missing or malformed it falls back to a base date (2000-01-01)
	public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {

		String date = "2000-01-01";
		if (ModelSupport.isSubClass(duplicate, Result.class)) {
			Result result = (Result) duplicate;
			if (isWellformed(result.getDateofacceptance())) {
				date = result.getDateofacceptance().getValue();
			}
		}

		try {
			return sdf.parse(date);
		} catch (ParseException e) {
			return new Date();
		}

	}

	public static boolean isWellformed(Field<String> date) {
		return date != null && StringUtils.isNotBlank(date.getValue()) && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
	}
}
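A minimal usage sketch of the new generator, with hypothetical ids and pid values (the setters are the ones used elsewhere in this changeset; Lists is Guava):

	import java.util.Date;
	import com.google.common.collect.Lists;
	import eu.dnetlib.dhp.schema.common.EntityType;
	import eu.dnetlib.dhp.schema.oaf.Qualifier;
	import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

	class IdGeneratorSketch {
		public static void main(String[] args) {
			// hypothetical: a publication whose best pid is a DOI
			Qualifier q = new Qualifier();
			q.setClassid("doi");

			StructuredProperty pid = new StructuredProperty();
			pid.setQualifier(q);
			pid.setValue("10.1000/xyz123");

			Identifier best = new Identifier(pid, new Date(), PidType.doi,
				Lists.newArrayList(), EntityType.publication, "50|some_ds____::deadbeef");

			// createPrefix("doi") pads to 12 chars -> "dedup_doi___", so the result is
			// "50|dedup_doi___::" + md5 of the pid value
			String dedupId = IdGenerator.generate(Lists.newArrayList(best), "50|dedup_wf_001::default");
			System.out.println(dedupId);
		}
	}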
Identifier.java (new file)
@@ -0,0 +1,132 @@
package eu.dnetlib.dhp.oa.dedup;

import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

import java.io.Serializable;
import java.util.Date;
import java.util.List;

public class Identifier implements Serializable, Comparable<Identifier> {

	StructuredProperty pid;
	Date date;
	PidType type;
	List<KeyValue> collectedFrom;
	EntityType entityType;
	String originalID;

	boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original ID

	public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, EntityType entityType, String originalID) {
		this.pid = pid;
		this.date = date;
		this.type = type;
		this.collectedFrom = collectedFrom;
		this.entityType = entityType;
		this.originalID = originalID;
	}

	public StructuredProperty getPid() {
		return pid;
	}

	public void setPid(StructuredProperty pidValue) {
		this.pid = pidValue;
	}

	public Date getDate() {
		return date;
	}

	public void setDate(Date date) {
		this.date = date;
	}

	public PidType getType() {
		return type;
	}

	public void setType(PidType type) {
		this.type = type;
	}

	public List<KeyValue> getCollectedFrom() {
		return collectedFrom;
	}

	public void setCollectedFrom(List<KeyValue> collectedFrom) {
		this.collectedFrom = collectedFrom;
	}

	public EntityType getEntityType() {
		return entityType;
	}

	public void setEntityType(EntityType entityType) {
		this.entityType = entityType;
	}

	public String getOriginalID() {
		return originalID;
	}

	public void setOriginalID(String originalID) {
		this.originalID = originalID;
	}

	public boolean isUseOriginal() {
		return useOriginal;
	}

	public void setUseOriginal(boolean useOriginal) {
		this.useOriginal = useOriginal;
	}

	@Override
	public int compareTo(Identifier i) {
		// priority in comparisons: 1) pid type, 2) collectedfrom (depending on the entity type), 3) date, 4) alphabetical order of the originalID
		if (this.getType().compareTo(i.getType()) == 0) { // same type
			if (entityType == EntityType.publication) {
				if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
					return 1;
				if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
					return -1;
			}
			if (entityType == EntityType.dataset) {
				if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
					return 1;
				if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
					return -1;
			}

			if (this.getDate().compareTo(i.getDate()) == 0) { // same date

				if (this.originalID.compareTo(i.originalID) > 0)
					this.useOriginal = true;
				else
					i.setUseOriginal(true);

				// the minus because we need to take the alphabetically lower id
				return -this.originalID.compareTo(i.originalID);
			} else
				// the minus is because we need to take the elder date
				return -this.getDate().compareTo(i.getDate());
		} else {
			return this.getType().compareTo(i.getType());
		}

	}

	public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {

		for (KeyValue cf : collectedFrom) {
			if (cf.getKey().equals(dsId))
				return true;
		}
		return false;
	}
}
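How the comparator drives id selection: IdGenerator.generate takes the maximum Identifier, and compareTo ranks by pid type first (using the PidType order below), then prefers records collected from Crossref for publications and DataCite for datasets, then the elder date, and finally the alphabetically lower originalID; useOriginal records that the original-id tie-break decided a comparison, which makes generate fall back to the dedup_wf_001 prefix instead of a pid-based one.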
PidType.java (new file)
@@ -0,0 +1,25 @@
package eu.dnetlib.dhp.oa.dedup;

public enum PidType {

	// ordered from the least to the most important
	undefined,
	original,
	orcid,
	ror,
	grid,
	pdb,
	arXiv,
	pmid,
	doi;

	public static PidType classidValueOf(String s) {
		try {
			return PidType.valueOf(s);
		} catch (Exception e) {
			return PidType.undefined;
		}
	}

}
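Java enum constants compare by declaration order, so this listing is itself the priority scale used by the Comparator in IdGenerator.bestPidtoIdentifier; a quick illustration (the "handle" classid is hypothetical):

	// doi is declared last, hence highest priority
	assert PidType.doi.compareTo(PidType.orcid) > 0;
	// known classids map to their constant, anything else degrades to undefined
	assert PidType.classidValueOf("doi") == PidType.doi;
	assert PidType.classidValueOf("handle") == PidType.undefined; // hypothetical classid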
EntityMergerTest.java
@@ -22,10 +22,13 @@ public class EntityMergerTest implements Serializable {

	List<Tuple2<String, Publication>> publications;
	List<Tuple2<String, Publication>> publications2;
	List<Tuple2<String, Publication>> publications3;
	List<Tuple2<String, Publication>> publications4;
	List<Tuple2<String, Publication>> publications5;

	String testEntityBasePath;
	DataInfo dataInfo;
	String dedupId = "dedup_id";
	String dedupId = "00|dedup_id::1";
	Publication pub_top;

	@BeforeEach

@@ -38,6 +41,9 @@ public class EntityMergerTest implements Serializable {

		publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
		publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
		publications3 = readSample(testEntityBasePath + "/publication_merge3.json", Publication.class);
		publications4 = readSample(testEntityBasePath + "/publication_merge4.json", Publication.class);
		publications5 = readSample(testEntityBasePath + "/publication_merge5.json", Publication.class);

		pub_top = getTopPub(publications);

@@ -54,6 +60,9 @@ public class EntityMergerTest implements Serializable {
			.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);

		assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");

		assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");

	}

	@Test

@@ -62,7 +71,8 @@ public class EntityMergerTest implements Serializable {
		Publication pub_merged = DedupRecordFactory
			.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);

		assertEquals(dedupId, pub_merged.getId());
		// verify id
		assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");

		assertEquals(pub_merged.getJournal(), pub_top.getJournal());
		assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());

@@ -117,8 +127,43 @@ public class EntityMergerTest implements Serializable {
		Publication pub_merged = DedupRecordFactory
			.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);

		// verify id
		assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());

		assertEquals(pub_merged.getAuthor().size(), 27);
		// insert assertions here

	}

	@Test
	public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {

		Publication pub_merged = DedupRecordFactory
			.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);

		// verify id
		assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());

	}

	@Test
	public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {

		Publication pub_merged = DedupRecordFactory
			.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);

		// verify id
		assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());

	}

	@Test
	public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {

		Publication pub_merged = DedupRecordFactory
			.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);

		// verify id
		assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());

	}
SparkDedupTest.java
@@ -1,22 +1,12 @@

package eu.dnetlib.dhp.oa.dedup;

import static java.nio.file.Files.createTempDirectory;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;

@@ -35,16 +25,19 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;

import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;

@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkDedupTest implements Serializable {
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
dhp-graph-mapper/pom.xml
@@ -9,37 +9,6 @@

	<artifactId>dhp-graph-mapper</artifactId>

	<build>
		<plugins>
			<plugin>
				<groupId>net.alchim31.maven</groupId>
				<artifactId>scala-maven-plugin</artifactId>
				<version>4.0.1</version>
				<executions>
					<execution>
						<id>scala-compile-first</id>
						<phase>initialize</phase>
						<goals>
							<goal>add-source</goal>
							<goal>compile</goal>
						</goals>
					</execution>
					<execution>
						<id>scala-test-compile</id>
						<phase>process-test-resources</phase>
						<goals>
							<goal>testCompile</goal>
						</goals>
					</execution>
				</executions>
				<configuration>
					<scalaVersion>${scala.version}</scalaVersion>
				</configuration>
			</plugin>
		</plugins>

	</build>

	<dependencies>

		<dependency>

@@ -92,13 +61,6 @@
			<groupId>org.postgresql</groupId>
			<artifactId>postgresql</artifactId>
		</dependency>
		<dependency>
			<groupId>org.json4s</groupId>
			<artifactId>json4s-jackson_2.11</artifactId>
			<version>3.5.3</version>
		</dependency>

	</dependencies>
GraphHiveTableImporterJob.java
@@ -9,7 +9,6 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

@@ -43,12 +42,6 @@ public class GraphHiveTableImporterJob {
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		int numPartitions = Optional
			.ofNullable(parser.get("numPartitions"))
			.map(Integer::valueOf)
			.orElse(-1);
		log.info("numPartitions: {}", numPartitions);

		String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

@@ -67,21 +60,16 @@ public class GraphHiveTableImporterJob {
		conf.set("hive.metastore.uris", hiveMetastoreUris);

		runWithSparkHiveSession(
			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz, numPartitions));
			conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz));
	}

	// protected for testing
	private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
		Class<T> clazz, int numPartitions) {
		Class<T> clazz) {

		Dataset<String> dataset = spark.read().textFile(inputPath);

		if (numPartitions > 0) {
			log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
			dataset = dataset.repartition(numPartitions);
		}

		dataset
		spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
			.write()
			.mode(SaveMode.Overwrite)
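Note the numPartitions knob is removed outright: the optional repartition before the write is gone, the CLI argument is no longer read, and the import now relies on whatever partitioning spark.read().textFile(inputPath) yields.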
MergeGraphSparkJob.java (file deleted)
@@ -1,162 +0,0 @@

package eu.dnetlib.dhp.oa.graph.merge;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;

/**
 * Combines the content from two aggregator graph tables of the same type: entities (or relationships) with the same ids
 * are picked preferring those from the BETA aggregator rather than from PROD. The identity of a relationship is defined
 * by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
 */
public class MergeGraphSparkJob {

	private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				CleanGraphSparkJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		String priority = Optional
			.ofNullable(parser.get("priority"))
			.orElse(PRIORITY_DEFAULT);
		log.info("priority: {}", priority);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String betaInputPath = parser.get("betaInputPath");
		log.info("betaInputPath: {}", betaInputPath);

		String prodInputPath = parser.get("prodInputPath");
		log.info("prodInputPath: {}", prodInputPath);

		String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		String graphTableClassName = parser.get("graphTableClassName");
		log.info("graphTableClassName: {}", graphTableClassName);

		Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);

		SparkConf conf = new SparkConf();
		conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
		conf.registerKryoClasses(ModelSupport.getOafModelClasses());

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeOutputDir(spark, outputPath);
				mergeGraphTable(spark, priority, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath);
			});
	}

	private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
		SparkSession spark,
		String priority,
		String betaInputPath,
		String prodInputPath,
		Class<P> p_clazz,
		Class<B> b_clazz,
		String outputPath) {

		Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
		Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);

		prod
			.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
			.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
				Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
				Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
				switch (priority) {
					default:
					case "BETA":
						return mergeWithPriorityToBETA(p, b);
					case "PROD":
						return mergeWithPriorityToPROD(p, b);
				}
			}, Encoders.bean(p_clazz))
			.filter((FilterFunction<P>) Objects::nonNull)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
		if (b.isPresent() & !p.isPresent()) {
			return (P) b.get();
		}
		if (p.isPresent()) {
			return p.get();
		}
		return null;
	}

	private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToBETA(Optional<P> p, Optional<B> b) {
		if (p.isPresent() & !b.isPresent()) {
			return p.get();
		}
		if (b.isPresent()) {
			return (P) b.get();
		}
		return null;
	}

	private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
		SparkSession spark, String inputEntityPath, Class<T> clazz) {

		log.info("Reading Graph table from: {}", inputEntityPath);
		return spark
			.read()
			.textFile(inputEntityPath)
			.map(
				(MapFunction<String, Tuple2<String, T>>) value -> {
					final T t = OBJECT_MAPPER.readValue(value, clazz);
					final String id = ModelSupport.idFn().apply(t);
					return new Tuple2<>(id, t);
				},
				Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
	}

	private static void removeOutputDir(SparkSession spark, String path) {
		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
	}

}
AbstractMdRecordToOafMapper.java
@@ -1,10 +1,36 @@

package eu.dnetlib.dhp.oa.graph.raw;

import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.NOT_AVAILABLE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;

@@ -14,8 +40,24 @@ import org.dom4j.Node;

import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

public abstract class AbstractMdRecordToOafMapper {

@@ -57,6 +99,7 @@ public abstract class AbstractMdRecordToOafMapper {
			final Document doc = DocumentHelper
				.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));

			final String type = doc.valueOf("//dr:CobjCategory/@type");
			final KeyValue collectedFrom = getProvenanceDatasource(
				doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");

@@ -75,39 +118,12 @@ public abstract class AbstractMdRecordToOafMapper {
			final DataInfo info = prepareDataInfo(doc, invisible);
			final long lastUpdateTimestamp = new Date().getTime();

			final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);

			final String type = getResultType(doc, instances);

			return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
			return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
		} catch (final Exception e) {
			throw new RuntimeException(e);
		}
	}

	protected String getResultType(final Document doc, final List<Instance> instances) {
		String type = doc.valueOf("//dr:CobjCategory/@type");

		if (StringUtils.isBlank(type) & vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
			String instanceType = instances
				.stream()
				.map(i -> i.getInstancetype().getClassid())
				.findFirst()
				.map(s -> UNKNOWN.equalsIgnoreCase(s) ? "0000" : s)
				.orElse("0000"); // Unknown
			return Optional
				.ofNullable(vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType))
				.map(q -> q.getClassid())
				.orElse("0000");
			/*
			 * .orElseThrow( () -> new IllegalArgumentException( String.format("'%s' not mapped in %s", instanceType,
			 * DNET_RESULT_TYPOLOGIES)));
			 */
		}

		return type;
	}

	private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) {
		final String dsId = doc.valueOf(xpathId);
		final String dsName = doc.valueOf(xpathName);

@@ -122,8 +138,8 @@ public abstract class AbstractMdRecordToOafMapper {
	protected List<Oaf> createOafs(
		final Document doc,
		final String type,
		final List<Instance> instances,
		final KeyValue collectedFrom,
		final KeyValue hostedBy,
		final DataInfo info,
		final long lastUpdateTimestamp) {

@@ -132,14 +148,14 @@ public abstract class AbstractMdRecordToOafMapper {
		switch (type.toLowerCase()) {
			case "publication":
				final Publication p = new Publication();
				populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
				populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
				p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
				p.setJournal(prepareJournal(doc, info));
				oafs.add(p);
				break;
			case "dataset":
				final Dataset d = new Dataset();
				populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
				populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
				d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
				d.setStoragedate(prepareDatasetStorageDate(doc, info));
				d.setDevice(prepareDatasetDevice(doc, info));

@@ -152,7 +168,7 @@ public abstract class AbstractMdRecordToOafMapper {
				break;
			case "software":
				final Software s = new Software();
				populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
				populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
				s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
				s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
				s.setLicense(prepareSoftwareLicenses(doc, info));

@@ -164,7 +180,7 @@ public abstract class AbstractMdRecordToOafMapper {
			case "otherresearchproducts":
			default:
				final OtherResearchProduct o = new OtherResearchProduct();
				populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
				populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
				o.setResulttype(ORP_DEFAULT_RESULTTYPE);
				o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
				o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));

@@ -243,16 +259,14 @@ public abstract class AbstractMdRecordToOafMapper {
	private void populateResultFields(
		final Result r,
		final Document doc,
		final List<Instance> instances,
		final KeyValue collectedFrom,
		final KeyValue hostedBy,
		final DataInfo info,
		final long lastUpdateTimestamp) {
		r.setDataInfo(info);
		r.setLastupdatetimestamp(lastUpdateTimestamp);
		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));

		r.setOriginalId(Arrays.asList(findOriginalId(doc)));

		r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
		r.setCollectedfrom(Arrays.asList(collectedFrom));
		r.setPid(prepareResultPids(doc, info));
		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));

@@ -277,7 +291,7 @@ public abstract class AbstractMdRecordToOafMapper {
		r.setCoverage(prepareCoverages(doc, info));
		r.setContext(prepareContexts(doc, info));
		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES

		final List<Instance> instances = prepareInstances(doc, info, collectedFrom, hostedBy);
		r.setInstance(instances);
		r.setBestaccessright(getBestAccessRights(instances));
	}

@@ -415,18 +429,6 @@ public abstract class AbstractMdRecordToOafMapper {
		return null;
	}

	private String findOriginalId(final Document doc) {
		final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
		if (n != null) {
			final String id = n.valueOf("./*[local-name()='identifier']");
			if (StringUtils.isNotBlank(id)) {
				return id;
			}
		}
		return doc.valueOf("//*[local-name()='header']/*[local-name()='identifier']");

	}

	protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId) {
		return prepareQualifier(node.valueOf(xpath).trim(), schemeId);
	}
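Net effect of these hunks: the wildcard imports are expanded into explicit ones, the result type is read straight from //dr:CobjCategory/@type again (the instance-driven getResultType fallback is removed), createOafs and populateResultFields receive collectedFrom and hostedBy instead of a pre-built instance list (instances are now prepared inside populateResultFields), and originalId reverts from findOriginalId(doc) to the plain //dri:objIdentifier value.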
OafMapperUtils.java
@@ -4,11 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;

@@ -61,7 +57,6 @@ public class OafMapperUtils {
			.stream(values)
			.map(v -> field(v, info))
			.filter(Objects::nonNull)
			.filter(distinctByKey(f -> f.getValue()))
			.collect(Collectors.toList());
	}

@@ -70,7 +65,6 @@ public class OafMapperUtils {
			.stream()
			.map(v -> field(v, info))
			.filter(Objects::nonNull)
			.filter(distinctByKey(f -> f.getValue()))
			.collect(Collectors.toList());
	}

@@ -243,10 +237,4 @@ public class OafMapperUtils {
	public static String asString(final Object o) {
		return o == null ? "" : o.toString();
	}

	public static <T> Predicate<T> distinctByKey(
		final Function<? super T, ?> keyExtractor) {
		final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
		return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
	}
}
@ -1,89 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

object EBIAggregator {

  def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset] {

    override def zero: OafDataset = new OafDataset()

    override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
      b.mergeFrom(a._2)
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
      wx.mergeFrom(wy)
      if (wx.getId == null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: OafDataset): OafDataset = reduction

    override def bufferEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])

    override def outputEncoder: Encoder[OafDataset] =
      Encoders.kryo(classOf[OafDataset])
  }

  def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication] {

    override def zero: Publication = new Publication()

    override def reduce(b: Publication, a: (String, Publication)): Publication = {
      b.mergeFrom(a._2)
      if (b.getId == null)
        b.setId(a._2.getId)
      b
    }

    override def merge(wx: Publication, wy: Publication): Publication = {
      wx.mergeFrom(wy)
      if (wx.getId == null && wy.getId.nonEmpty)
        wx.setId(wy.getId)
      wx
    }

    override def finish(reduction: Publication): Publication = reduction

    override def bufferEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])

    override def outputEncoder: Encoder[Publication] =
      Encoders.kryo(classOf[Publication])
  }

  def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation] {

    override def zero: Relation = new Relation()

    override def reduce(b: Relation, a: (String, Relation)): Relation = {
      a._2
    }

    override def merge(a: Relation, b: Relation): Relation = {
      if (b != null) b else a
    }

    override def finish(reduction: Relation): Relation = reduction

    override def bufferEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])

    override def outputEncoder: Encoder[Relation] =
      Encoders.kryo(classOf[Relation])
  }
}

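The EBIAggregator removed above packages Spark SQL typed aggregation: records grouped by identifier are merged pairwise through mergeFrom, so each group collapses into a single entity. A minimal usage sketch follows, assuming an existing Dataset[Publication] named pubs (the name is illustrative) and the same Kryo encoders used above; it mirrors the groupByKey/agg calls that SparkCreateEBIDataFrame (removed further down) performs.

    import org.apache.spark.sql.{Dataset, Encoder, Encoders}
    import eu.dnetlib.dhp.schema.oaf.Publication

    // pubs: Dataset[Publication] is assumed to exist; each record carries an OpenAIRE id.
    implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])

    val merged: Dataset[Publication] = pubs
      .map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, pubEncoder))
      .groupByKey(_._1)(Encoders.STRING)                      // one group per publication id
      .agg(EBIAggregator.getPublicationAggregator().toColumn) // merge duplicates via mergeFrom
      .map(_._2)                                              // drop the grouping key
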
@ -1,138 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

import scala.collection.JavaConverters._

object SparkAddLinkUpdates {

  val relationMapper = RelationMapper.load

  case class EBILinks(relation: String, pubdate: String, tpid: String, tpidType: String, turl: String, title: String, publisher: String) {}

  def generatePubmedDLICollectedFrom(): KeyValue = {
    OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
  }

  def ebiLinksToOaf(input: (String, String)): List[Oaf] = {
    val pmid: String = input._1
    val input_json: String = input._2
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input_json)

    val targets: List[EBILinks] = for {
      JObject(link) <- json \\ "Category" \\ "Link"
      JField("PublicationDate", JString(pubdate)) <- link
      JField("RelationshipType", JObject(relationshipType)) <- link
      JField("Name", JString(relname)) <- relationshipType
      JField("Target", JObject(target)) <- link
      JField("Identifier", JObject(identifier)) <- target
      JField("ID", JString(tpid)) <- identifier
      JField("IDScheme", JString(tpidtype)) <- identifier
      JField("IDURL", JString(turl)) <- identifier
      JField("Title", JString(title)) <- target
      JField("Publisher", JObject(pub)) <- target
      JField("Name", JString(publisher)) <- pub
    } yield EBILinks(relname, pubdate, tpid, tpidtype, turl, title, publisher)

    val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"

    targets.flatMap(l => {
      val relation = new DLIRelation
      val inverseRelation = new DLIRelation
      val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
      val relInfo = relationMapper.get(l.relation.toLowerCase)
      val relationSemantic = relInfo.getOriginal
      val inverseRelationSemantic = relInfo.getInverse

      relation.setSource(dnetPublicationId)
      relation.setTarget(targetDnetId)
      relation.setRelClass("datacite")
      relation.setRelType(relationSemantic)
      relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)

      inverseRelation.setSource(targetDnetId)
      inverseRelation.setTarget(dnetPublicationId)
      inverseRelation.setRelClass("datacite")
      inverseRelation.setRelType(inverseRelationSemantic)
      inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)

      val d = new DLIDataset
      d.setId(targetDnetId)
      d.setDataInfo(OafUtils.generateDataInfo())
      d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava)
      d.setCompletionStatus("complete")
      val pi = new ProvenaceInfo
      pi.setId("dli_________::europe_pmc__")
      pi.setName("Europe PMC")
      pi.setCompletionStatus("complete")
      pi.setCollectionMode("collected")
      d.setDlicollectedfrom(List(pi).asJava)
      d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
      d.setPublisher(OafUtils.asField(l.publisher))
      d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
      d.setDateofacceptance(OafUtils.asField(l.pubdate))
      val i = new Instance
      i.setCollectedfrom(generatePubmedDLICollectedFrom())
      i.setDateofacceptance(d.getDateofacceptance)
      i.setUrl(List(l.turl).asJava)
      i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
      d.setInstance(List(i).asJava)
      List(relation, inverseRelation, d)
    })
  }

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val workingPath = parser.get("workingPath")
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])

    val ds: Dataset[(String, String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String, String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))

    ds.flatMap(l => ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")

    val oDataset: Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]

    oDataset.filter(p => p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
    oDataset.filter(p => p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
  }
}

@ -1,49 +0,0 @@
package eu.dnetlib.dhp.sx.ebi

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}

import scala.io.Source
import scala.xml.pull.XMLEventReader

object SparkCreateBaselineDataFrame {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sc = spark.sparkContext

    val workingPath = parser.get("workingPath")

    implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
    implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
    implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)

    val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
      val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
      new PMParser(xml)
    }))

    ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
  }
}

@ -1,87 +0,0 @@
package eu.dnetlib.dhp.sx.ebi

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._

object SparkCreateEBIDataFrame {

  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sc = spark.sparkContext

    val workingPath = parser.get("workingPath")
    val relationMapper = RelationMapper.load

    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
    implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])

    logger.info("Extract Publication and relation from publication_xml")
    val oafPubsRDD: RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s => {
      new ObjectMapper().readValue(s, classOf[String])
    }).flatMap(s => {
      val d = new PublicationScholexplorerParser
      d.parseObject(s, relationMapper).asScala.iterator
    })

    val mapper = new ObjectMapper()
    mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
    spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")

    logger.info("Extract Publication and relation from dataset_xml")
    val oafDatsRDD: RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s => {
      new ObjectMapper().readValue(s, classOf[String])
    }).flatMap(s => {
      val d = new DatasetScholexplorerParser
      d.parseObject(s, relationMapper).asScala.iterator
    })

    spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
    val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
    val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
    val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
    publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getPublicationAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")

    dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getDatasetAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")

    relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getRelationAggregator().toColumn)
      .map(p => p._2)
      .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")

    relations.map(r => (r.getSource, r.getTarget))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
  }
}

@ -1,64 +0,0 @@

package eu.dnetlib.dhp.sx.ebi.model;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public class PMArticle implements Serializable {

    private String pmid;
    private String date;
    private PMJournal journal;
    private String title;
    private String description;
    private List<PMAuthor> authors = new ArrayList<>();

    public String getPmid() {
        return pmid;
    }

    public void setPmid(String pmid) {
        this.pmid = pmid;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public PMJournal getJournal() {
        return journal;
    }

    public void setJournal(PMJournal journal) {
        this.journal = journal;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public List<PMAuthor> getAuthors() {
        return authors;
    }

    public void setAuthors(List<PMAuthor> authors) {
        this.authors = authors;
    }
}

@ -1,31 +0,0 @@

package eu.dnetlib.dhp.sx.ebi.model;

import java.io.Serializable;

public class PMAuthor implements Serializable {

    private String lastName;
    private String foreName;

    public String getLastName() {
        return lastName;
    }

    public void setLastName(String lastName) {
        this.lastName = lastName;
    }

    public String getForeName() {
        return foreName;
    }

    public void setForeName(String foreName) {
        this.foreName = foreName;
    }

    public String getFullName() {
        return String.format("%s, %s", this.foreName, this.lastName);
    }

}

@ -1,53 +0,0 @@

package eu.dnetlib.dhp.sx.ebi.model;

import java.io.Serializable;

public class PMJournal implements Serializable {

    private String issn;
    private String volume;
    private String issue;
    private String date;
    private String title;

    public String getIssn() {
        return issn;
    }

    public void setIssn(String issn) {
        this.issn = issn;
    }

    public String getVolume() {
        return volume;
    }

    public void setVolume(String volume) {
        this.volume = volume;
    }

    public String getIssue() {
        return issue;
    }

    public void setIssue(String issue) {
        this.issue = issue;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }
}

@ -1,92 +0,0 @@
package eu.dnetlib.dhp.sx.ebi.model
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {

  var currentArticle: PMArticle = generateNextArticle()

  override def hasNext: Boolean = currentArticle != null

  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }

  def generateNextArticle(): PMArticle = {

    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"

    while (xml.hasNext) {
      xml.next match {
        case EvElemStart(_, label, _, _) =>
          currNode = label
          label match {
            case "PubmedArticle" => currentArticle = new PMArticle
            case "Author" => currentAuthor = new PMAuthor
            case "Journal" => currentJournal = new PMJournal
            case _ =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => return currentArticle
            case "Author" => currentArticle.getAuthors.add(currentAuthor)
            case "Journal" => currentArticle.setJournal(currentJournal)
            case "DateCompleted" => currentArticle.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case _ =>
          }
        case EvText(text) =>
          if (currNode != null && text.trim.nonEmpty)
            currNode match {
              case "ArticleTitle" => {
                if (currentArticle.getTitle == null)
                  currentArticle.setTitle(text.trim)
                else
                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
              }
              case "AbstractText" => {
                if (currentArticle.getDescription == null)
                  currentArticle.setDescription(text.trim)
                else
                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
              }
              case "PMID" => currentArticle.setPmid(text.trim)
              case "ISSN" => currentJournal.setIssn(text.trim)
              case "Year" => currentYear = text.trim
              case "Month" => currentMonth = text.trim
              case "Day" => currentDay = text.trim
              case "Volume" => currentJournal.setVolume(text.trim)
              case "Issue" => currentJournal.setIssue(text.trim)
              case "LastName" => {
                if (currentAuthor != null)
                  currentAuthor.setLastName(text.trim)
              }
              case "ForeName" => if (currentAuthor != null)
                currentAuthor.setForeName(text.trim)
              case "Title" =>
                if (currentJournal.getTitle == null)
                  currentJournal.setTitle(text.trim)
                else
                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
              case _ =>
            }
        case _ =>
      }
    }
    null
  }
}

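The PMParser removed above drives a pull-based XML reader and yields one PMArticle per closing PubmedArticle element. A minimal smoke-test sketch, assuming the same Scala 2.x scala.xml.pull API used above; the inline record is illustrative, not taken from the source (real baseline inputs are gzipped PubMed XML dumps read via wholeTextFiles, as in SparkCreateBaselineDataFrame earlier).

    import scala.io.Source
    import scala.xml.pull.XMLEventReader
    import eu.dnetlib.dhp.sx.ebi.model.PMParser

    // A tiny, hypothetical PubMed-like record.
    val record =
      """<PubmedArticleSet>
        |  <PubmedArticle>
        |    <PMID>12345</PMID>
        |    <Journal><Title>Some Journal</Title></Journal>
        |    <ArticleTitle>A sample article</ArticleTitle>
        |  </PubmedArticle>
        |</PubmedArticleSet>""".stripMargin

    // The parser is an Iterator[PMArticle]; draining it yields one article here.
    val articles = new PMParser(new XMLEventReader(Source.fromString(record))).toList
    articles.foreach(a => println(s"${a.getPmid} -> ${a.getTitle}"))
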
@ -150,17 +150,6 @@ public abstract class AbstractScholexplorerParser {
    return uk;
}

protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
    final String schemeName) {
    final Qualifier q = new Qualifier();
    q.setClassid(classId);
    q.setClassname(className);
    q.setSchemeid(schemeId);
    q.setSchemename(schemeName);
    return q;
}

protected void generateRelations(
    RelationMapper relationMapper,
    Result parsedObject,

@ -64,6 +64,7 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
    currentDate.setQualifier(dateQualifier);
    parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}

final String completionStatus = VtdUtilityParser
    .getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");

@ -148,37 +149,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));

String resolvedURL = null;

switch (currentPid.getQualifier().getClassname().toLowerCase()) {
    case "uniprot":
        resolvedURL = "https://www.uniprot.org/uniprot/" + currentPid.getValue();
        break;
    case "ena":
        if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
            resolvedURL = "https://www.ebi.ac.uk/ena/data/view/" + currentPid.getValue().substring(0, 8);
        break;
    case "chembl":
        resolvedURL = "https://www.ebi.ac.uk/chembl/compound_report_card/" + currentPid.getValue();
        break;
    case "ncbi-n":
        resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
        break;
    case "ncbi-p":
        resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
        break;
    case "genbank":
        resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
        break;
    case "pdb":
        resolvedURL = "https://www.ncbi.nlm.nih.gov/nuccore/" + currentPid.getValue();
        break;
    case "url":
        resolvedURL = currentPid.getValue();
        break;
}

final String sourceId = generateId(
    currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId);

@ -281,11 +251,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
t -> {
    final StructuredProperty st = new StructuredProperty();
    st.setValue(t);
    st
        .setQualifier(
            generateQualifier(
                "main title", "main title", "dnet:dataCite_title",
                "dnet:dataCite_title"));
    return st;
})
.collect(Collectors.toList()));

@ -317,13 +282,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
    .collect(Collectors.toList()));
}

if (StringUtils.isNotBlank(resolvedURL)) {
    Instance i = new Instance();
    i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
    i.setUrl(Collections.singletonList(resolvedURL));
    parsedObject.setInstance(Collections.singletonList(i));
}

result.add(parsedObject);
return result;
} catch (Throwable e) {

@ -202,11 +202,6 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
t -> {
    final StructuredProperty st = new StructuredProperty();
    st.setValue(t);
    st
        .setQualifier(
            generateQualifier(
                "main title", "main title", "dnet:dataCite_title",
                "dnet:dataCite_title"));
    return st;
})
.collect(Collectors.toList()));

@ -282,7 +282,6 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>100</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>

@ -5,12 +5,6 @@
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
},
{
    "paramName": "np",
    "paramLongName": "numPartitions",
    "paramDescription": "number of dataset partitions",
    "paramRequired": false
},
{
    "paramName": "in",
    "paramLongName": "inputPath",

@ -1,18 +0,0 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>

@ -1,293 +0,0 @@
<workflow-app name="merge graphs" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>betaInputGgraphPath</name>
            <description>the beta graph root path</description>
        </property>
        <property>
            <name>prodInputGgraphPath</name>
            <description>the production graph root path</description>
        </property>
        <property>
            <name>graphOutputPath</name>
            <description>the output merged graph root path</description>
        </property>
        <property>
            <name>priority</name>
            <description>decides from which infrastructure the content must win in case of ID clash</description>
        </property>

        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <start to="fork_merge_graph"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <fork name="fork_merge_graph">
        <path start="merge_publication"/>
        <path start="merge_dataset"/>
        <path start="merge_otherresearchproduct"/>
        <path start="merge_software"/>
        <path start="merge_datasource"/>
        <path start="merge_organization"/>
        <path start="merge_project"/>
        <path start="merge_relation"/>
    </fork>

    <action name="merge_publication">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge publications</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/publication</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/publication</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_dataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasets</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/dataset</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_otherresearchproduct">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge otherresearchproducts</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/otherresearchproduct</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_software">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge software</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/software</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/software</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_datasource">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge datasources</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/datasource</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/datasource</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_organization">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge organizations</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/organization</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/organization</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_project">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge projects</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/project</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/project</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <action name="merge_relation">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Merge relations</name>
            <class>eu.dnetlib.dhp.oa.graph.merge.MergeGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=${sparkExecutorCores}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
            </spark-opts>
            <arg>--betaInputPath</arg><arg>${betaInputGgraphPath}/relation</arg>
            <arg>--prodInputPath</arg><arg>${prodInputGgraphPath}/relation</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
            <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
            <arg>--priority</arg><arg>${priority}</arg>
        </spark>
        <ok to="wait_merge"/>
        <error to="Kill"/>
    </action>

    <join name="wait_merge" to="End"/>

    <end name="End"/>
</workflow-app>

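The workflow removed above takes all of its inputs from the <parameters> block, matching the parameter file that follows. For reference, a hypothetical job.properties for submitting it with the standard Oozie CLI; every path, memory figure, and the priority value below is illustrative, not taken from the source.

    # job.properties (illustrative values only)
    oozie.wf.application.path=hdfs://nameservice1/user/me/merge_graphs
    betaInputGgraphPath=/graphs/beta
    prodInputGgraphPath=/graphs/prod
    graphOutputPath=/graphs/merged
    priority=BETA
    sparkDriverMemory=4G
    sparkExecutorMemory=8G
    sparkExecutorCores=4
    oozieActionShareLibForSpark2=spark2
    spark2YarnHistoryServerAddress=http://historyserver:18089
    spark2EventLogDir=/user/spark/spark2ApplicationHistory

    $ oozie job -config job.properties -run
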
@ -1,38 +0,0 @@
[
    {
        "paramName": "issm",
        "paramLongName": "isSparkSessionManaged",
        "paramDescription": "when true will stop SparkSession after job execution",
        "paramRequired": false
    },
    {
        "paramName": "bin",
        "paramLongName": "betaInputPath",
        "paramDescription": "the beta graph root path",
        "paramRequired": true
    },
    {
        "paramName": "pin",
        "paramLongName": "prodInputPath",
        "paramDescription": "the production graph root path",
        "paramRequired": true
    },
    {
        "paramName": "out",
        "paramLongName": "outputPath",
        "paramDescription": "the output merged graph root path",
        "paramRequired": true
    },
    {
        "paramName": "class",
        "paramLongName": "graphTableClassName",
        "paramDescription": "class name modelling the graph table",
        "paramRequired": true
    },
    {
        "paramName": "pr",
        "paramLongName": "priority",
        "paramDescription": "decides from which infrastructure the content must win in case of ID clash",
        "paramRequired": false
    }
]

@ -1,4 +0,0 @@
[
    {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
    {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequential file to read", "paramRequired": true}
]

@ -1,68 +0,0 @@
<configuration>

    <!-- OCEAN -->
    <!--
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    -->

    <!-- GARR -->

    <property>
        <name>jobTracker</name>
        <value>yarn</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
    </property>

    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>

    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
    </property>
</configuration>

@ -1,97 +0,0 @@
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>workingPath</name>
            <description>the Working Path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
    </parameters>

    <start to="GenerateUpdates"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="GenerateBaselineDataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create Baseline DataSet</name>

            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=1
                --driver-memory=${sparkDriverMemory}
                --executor-cores=${sparkExecutorCores}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="GenerateUpdates">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create Baseline DataSet</name>

            <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=1
                --driver-memory=${sparkDriverMemory}
                --executor-cores=${sparkExecutorCores}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="CreateEBIDataSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create EBI DataSet</name>

            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=1000
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>

@ -5,7 +5,8 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.lenient;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.List;

@ -19,9 +20,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;

@ -32,25 +31,24 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ExtendWith(MockitoExtension.class)
public class MappersTest {

    @Mock
    private ISLookUpService isLookUpService;

    @Mock
    private VocabularyGroup vocs;

    @BeforeEach
    public void setUp() throws Exception {
        lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
        lenient()
            .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
            .thenReturn(synonyms());
        when(vocs.getTermAsQualifier(anyString(), anyString()))
            .thenAnswer(
                invocation -> OafMapperUtils
                    .qualifier(
                        invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
                        invocation.getArgument(0)));

        when(vocs.termExists(anyString(), anyString())).thenReturn(true);

        vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
    }

    @Test

@ -70,14 +68,9 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2);

assertValidId(p.getId());

assertTrue(p.getOriginalId().size() == 1);
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));

assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertFalse(p.getDataInfo().getInvisible());
assertTrue(p.getSource().size() == 1);

assertTrue(p.getAuthor().size() > 0);
final Optional<Author> author = p

@ -86,7 +79,6 @@ public class MappersTest {
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.findFirst();
assertTrue(author.isPresent());

final StructuredProperty pid = author
    .get()
    .getPid()

@ -177,8 +169,6 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2);

assertValidId(d.getId());
assertTrue(d.getOriginalId().size() == 1);
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
assertTrue(d.getAuthor().size() > 0);

@ -265,32 +255,10 @@ public class MappersTest {
    assertTrue(s.getInstance().size() > 0);
}

// @Test
void testDataset_2() throws IOException {
    final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset_2.xml"));

    final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);

    System.out.println("***************");
    System.out.println(new ObjectMapper().writeValueAsString(list));
    System.out.println("***************");
}

private void assertValidId(final String id) {
    assertEquals(49, id.length());
    assertEquals('|', id.charAt(2));
    assertEquals(':', id.charAt(15));
    assertEquals(':', id.charAt(16));
}

private List<String> vocs() throws IOException {
    return IOUtils
        .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
}

private List<String> synonyms() throws IOException {
    return IOUtils
        .readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}

}

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.sx.ebi

import org.junit.jupiter.api.Test

class TestEBI {

  // @Test
  def testEBIData() = {
    SparkAddLinkUpdates.main("-mt local[*] -w /home/sandro/Downloads".split(" "))
  }
}

@ -34,8 +34,6 @@
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:subject>Ecosystem Services hotspots</dc:subject>
<dc:subject>Natura 2000</dc:subject>
<dc:subject>Quiet Protected Areas</dc:subject>

@ -49,8 +47,7 @@
<dc:subject>regulating services</dc:subject>
<dc:subject>supporting services</dc:subject>
<dc:type>Research Article</dc:type>
<!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
<dr:CobjCategory>0001</dr:CobjCategory>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:accessrights>OPEN</oaf:accessrights>

@ -82,8 +82,7 @@
<p>All files are in MATLAB .mat format.</p></description>
</descriptions>
</resource>
<!--<dr:CobjCategory type="dataset">0021</dr:CobjCategory>-->
<dr:CobjCategory>0021</dr:CobjCategory>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:dateAccepted>2019-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>und</oaf:language>

@ -1,75 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
    xmlns:dri="http://www.driver-repository.eu/namespace/dri"
    xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
    <oai:header>
        <dri:objIdentifier>opentrials__::0000bf8e63d3d7e6b88421eabafae3f6</dri:objIdentifier>
        <dri:recordIdentifier>feabb67c-1fd1-423b-aec6-606d04ce53c6</dri:recordIdentifier>
        <dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
        <oaf:datasourceprefix>opentrials__</oaf:datasourceprefix>
        <dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
    </oai:header>
    <oai:metadata>
        <resource xmlns="http://datacite.org/schema/kernel-3"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
            <identifier identifierType="URL">https://clinicaltrials.gov/ct2/show/NCT02321059</identifier>
            <alternateIdentifiers>
                <alternateIdentifier alternateIdentifierType="URL">http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059</alternateIdentifier>
                <alternateIdentifier alternateIdentifierType="nct">NCT02321059</alternateIdentifier>
            </alternateIdentifiers>
            <creators>
                <creator>
                    <creatorName>Jensen, Kristian K</creatorName>
                </creator>
            </creators>
            <titles>
                <title>Validation of the Goodstrength System for Assessment of Abdominal Wall Strength in Patients With Incisional Hernia</title>
            </titles>
            <publisher>nct</publisher>
            <geoLocations>
                <geoLocationPlace>Denmark</geoLocationPlace>
            </geoLocations>
            <resourceType resourceTypeGeneral="clinicalTrial">0037</resourceType>
            <descriptions>
                <description descriptionType="Abstract">Patients with an incisional hernia in the midline and controls with an intact abdominal wall are examined twice with one week apart, in order to establish the test-retest reliability and internal and external validity of the Goodstrength trunk dynamometer.</description>
            </descriptions>
        </resource>
        <oaf:accessrights>OPEN</oaf:accessrights>
        <dr:CobjCategory type="dataset">0037</dr:CobjCategory>
        <oaf:dateAccepted>2014-11-11</oaf:dateAccepted>
        <oaf:hostedBy id="openaire____::opentrials" name="OpenTrials"/>
        <oaf:collectedFrom id="openaire____::opentrials" name="OpenTrials"/>
        <oaf:about>
            <oaf:datainfo>
                <oaf:inferred>false</oaf:inferred>
                <oaf:deletedbyinference>false</oaf:deletedbyinference>
                <oaf:trust>0.9</oaf:trust>
                <oaf:inferenceprovenance/>
                <oaf:provenanceaction
                    classid="sysimport:crosswalk:datasetarchive"
                    classname="sysimport:crosswalk:datasetarchive"
                    schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
            </oaf:datainfo>
        </oaf:about>
    </oai:metadata>
    <about xmlns:dc="http://purl.org/dc/elements/1.1/"
        xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
            <originDescription altered="true" harvestDate="2019-03-27T15:15:22.22Z">
                <baseURL>file:///var/lib/dnet/data/opentrials/opentrials.csv</baseURL>
                <identifier/>
                <datestamp/>
                <metadataNamespace/>
            </originDescription>
        </provenance>
        <oaf:datainfo>
            <oaf:inferred>false</oaf:inferred>
            <oaf:deletedbyinference>false</oaf:deletedbyinference>
            <oaf:trust>0.9</oaf:trust>
            <oaf:inferenceprovenance/>
            <oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
                classname="sysimport:crosswalk:datasetarchive"
                schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
        </oaf:datainfo>
    </about>
</oai:record>

@@ -52,8 +52,7 @@
subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3534">Protein binding sites</datacite:subject>
</datacite:subjects>
</datacite:resource>
<!--<dr:CobjCategory type="software">0029</dr:CobjCategory>-->
<dr:CobjCategory>0029</dr:CobjCategory>
<dr:CobjCategory type="software">0029</dr:CobjCategory>
<oaf:hostedBy id="rest________::bioTools" name="bio.tools"/>
<oaf:collectedFrom id="rest________::bioTools" name="bio.tools"/>
<oaf:dateAccepted>2018-06-06</oaf:dateAccepted>
@@ -1,55 +0,0 @@
{
  "Category": [
    {
      "Section": [
        {
          "Linklist": {
            "Link": [
              {
                "LinkProvider": {
                  "Name": "Europe PMC"
                },
                "Target": {
                  "Publisher": {
                    "Name": "Altmetric"
                  },
                  "ImageURL": "https://api.altmetric.com/v1/donut/58578459_64.png",
                  "Identifier": {
                    "ID": "https://www.altmetric.com/details/58578459",
                    "IDScheme": "URL",
                    "IDURL": "https://www.altmetric.com/details/58578459"
                  },
                  "Type": {
                    "Name": "dataset"
                  },
                  "Title": "Optical clumped isotope thermometry of carbon dioxide"
                },
                "Source": {
                  "Identifier": {
                    "ID": "30886173",
                    "IDScheme": "PMID"
                  },
                  "Type": {
                    "Name": "literature"
                  }
                },
                "PublicationDate": "06-04-2019",
                "RelationshipType": {
                  "Name": "IsReferencedBy"
                },
                "ObtainedBy": "ext_links"
              }
            ]
          },
          "ObtainedBy": "ext_links",
          "SectionLinkCount": 1,
          "Tags": [
            "altmetrics"
          ]
        }
      ],
      "CategoryLinkCount": 1,
      "Name": "Altmetric"
    }
  ]
}
@@ -1,191 +0,0 @@
{
  "version": "6.3",
  "hitCount": 4,
  "request": {
    "id": "28818901",
    "source": "MED"
  },
  "dataLinkList": {
    "Category": [
      {
        "Name": "Nucleotide Sequences",
        "CategoryLinkCount": 3,
        "Section": [
          {
            "ObtainedBy": "tm_accession",
            "Tags": [
              "supporting_data"
            ],
            "SectionLinkCount": 1,
            "Linklist": {
              "Link": [
                {
                  "ObtainedBy": "tm_accession",
                  "PublicationDate": "27-02-2020",
                  "LinkProvider": {
                    "Name": "Europe PMC"
                  },
                  "RelationshipType": {
                    "Name": "References"
                  },
                  "Source": {
                    "Type": {
                      "Name": "literature"
                    },
                    "Identifier": {
                      "ID": "28818901",
                      "IDScheme": "MED"
                    }
                  },
                  "Target": {
                    "Type": {
                      "Name": "dataset"
                    },
                    "Identifier": {
                      "ID": "AP008937",
                      "IDScheme": "ENA",
                      "IDURL": "http://identifiers.org/ena.embl/AP008937"
                    },
                    "Title": "AP008937",
                    "Publisher": {
                      "Name": "Europe PMC"
                    }
                  },
                  "Frequency": 1
                }
              ]
            }
          },
          {
            "ObtainedBy": "submission",
            "Tags": [
              "related_data"
            ],
            "SectionLinkCount": 2,
            "CollectionURL": "http://www.ebi.ac.uk/ena/data/search?query=28818901",
            "Linklist": {
              "Link": [
                {
                  "ObtainedBy": "submission",
                  "PublicationDate": "25-06-2018",
                  "LinkProvider": {
                    "Name": "Europe PMC"
                  },
                  "RelationshipType": {
                    "Name": "IsReferencedBy"
                  },
                  "Source": {
                    "Type": {
                      "Name": "literature"
                    },
                    "Identifier": {
                      "ID": "28818901",
                      "IDScheme": "PMID"
                    }
                  },
                  "Target": {
                    "Type": {
                      "Name": "dataset"
                    },
                    "Identifier": {
                      "ID": "NIWV01000000",
                      "IDScheme": "ENA",
                      "IDURL": "http://www.ebi.ac.uk/ena/data/view/NIWV01000000"
                    },
                    "Title": "Nucleotide sequences",
                    "Publisher": {
                      "Name": "ENA"
                    }
                  }
                },
                {
                  "ObtainedBy": "submission",
                  "PublicationDate": "25-06-2018",
                  "LinkProvider": {
                    "Name": "Europe PMC"
                  },
                  "RelationshipType": {
                    "Name": "IsReferencedBy"
                  },
                  "Source": {
                    "Type": {
                      "Name": "literature"
                    },
                    "Identifier": {
                      "ID": "28818901",
                      "IDScheme": "PMID"
                    }
                  },
                  "Target": {
                    "Type": {
                      "Name": "dataset"
                    },
                    "Identifier": {
                      "ID": "PRJNA390617",
                      "IDScheme": "ENA",
                      "IDURL": "http://www.ebi.ac.uk/ena/data/view/PRJNA390617"
                    },
                    "Title": "Lactobacillus fermentum strain:BFE 6620",
                    "Publisher": {
                      "Name": "ENA"
                    }
                  }
                }
              ]
            }
          }
        ]
      },
      {
        "Name": "BioStudies: supplemental material and supporting data",
        "CategoryLinkCount": 1,
        "Section": [
          {
            "ObtainedBy": "ext_links",
            "Tags": [
              "supporting_data"
            ],
            "SectionLinkCount": 1,
            "Linklist": {
              "Link": [
                {
                  "ObtainedBy": "ext_links",
                  "PublicationDate": "24-07-2018",
                  "LinkProvider": {
                    "Name": "Europe PMC"
                  },
                  "RelationshipType": {
                    "Name": "IsReferencedBy"
                  },
                  "Source": {
                    "Type": {
                      "Name": "literature"
                    },
                    "Identifier": {
                      "ID": "28818901",
                      "IDScheme": "PMID"
                    }
                  },
                  "Target": {
                    "Type": {
                      "Name": "dataset"
                    },
                    "Identifier": {
                      "ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true",
                      "IDScheme": "URL",
                      "IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true"
                    },
                    "Title": "Draft Genome Sequence of Lactobacillus fermentum BFE 6620, a Potential Starter Culture for African Vegetable Foods, Isolated from Fermented Cassava.",
                    "Publisher": {
                      "Name": "BioStudies: supplemental material and supporting data"
                    }
                  }
                }
              ]
            }
          }
        ]
      }
    ]
  }
}
@@ -5,12 +5,11 @@ import java.time.format.DateTimeFormatter

import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._

import scala.collection.JavaConverters._
@@ -100,20 +99,6 @@ object DLIToOAF {
)

def fixInstance(r: Publication): Publication = {
  val collectedFrom = r.getCollectedfrom.asScala.head
  r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
  r
}

def fixInstanceDataset(r: Dataset): Dataset = {
  val collectedFrom = r.getCollectedfrom.asScala.head
  r.getInstance().asScala.foreach(i => i.setCollectedfrom(collectedFrom))
  r
}

def toActionSet(item: Oaf): (String, String) = {
  val mapper = new ObjectMapper()
@@ -427,6 +412,46 @@ object DLIToOAF {
}

def generateKeyValue(key: String, value: String): KeyValue = {
  val kv: KeyValue = new KeyValue()
  kv.setKey(key)
  kv.setValue(value)
  kv.setDataInfo(generateDataInfo("0.9"))
  kv
}

def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = {
  val di = new DataInfo
  di.setDeletedbyinference(false)
  di.setInferred(false)
  di.setInvisible(invisible)
  di.setTrust(trust)
  di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
  di
}

def createQualifier(cls: String, sch: String): Qualifier = {
  createQualifier(cls, cls, sch, sch)
}

def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
  val q: Qualifier = new Qualifier
  q.setClassid(classId)
  q.setClassname(className)
  q.setSchemeid(schemeId)
  q.setSchemename(schemeName)
  q
}

def asField[T](value: T): Field[T] = {
  val tmp = new Field[T]
  tmp.setValue(value)
  tmp
}
}
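For illustration, a minimal usage sketch of the factory methods added above; the datasource id and label below are hypothetical, not values taken from the repository:

    // hedged sketch: combining the new DLIToOAF helpers to build a
    // provenance-stamped KeyValue and a simple Field
    val collectedFrom: KeyValue =
      DLIToOAF.generateKeyValue("10|fake________::datasourceId", "A hypothetical datasource")
    // generateKeyValue attaches a DataInfo carrying trust 0.9 and the
    // "sysimport:actionset" qualifier built by createQualifier
    val license: Field[String] = DLIToOAF.asField("OPEN")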
@@ -1,7 +1,7 @@
package eu.dnetlib.dhp.`export`

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
@@ -166,13 +166,10 @@ object SparkExportContentForOpenAire {
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS")

spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.fixInstance).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS_fixed")
spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.fixInstanceDataset).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS_fixed")

val fRels: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
val fpubs: Dataset[(String, String)] = spark.read.load(s"$workingPath/publicationAS_fixed").as[Publication].map(DLIToOAF.toActionSet)
val fdats: Dataset[(String, String)] = spark.read.load(s"$workingPath/datasetAS_fixed").as[OafDataset].map(DLIToOAF.toActionSet)
val fpubs: Dataset[(String, String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet)
val fdats: Dataset[(String, String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet)

fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
@@ -16,15 +16,15 @@
        <value>spark2</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hive_jdbc_url</name>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>oozie.wf.workflow.notification.url</name>
        <value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
    </property>
</configuration>
@@ -1,18 +0,0 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
    rm -Rf "$link_folder"
    ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi

echo "Getting file from " $3
hdfs dfs -copyToLocal $3

echo "Running impala shell to make the new database visible"
impala-shell -q "INVALIDATE METADATA;"

echo "Running impala shell to compute new table stats"
impala-shell -d $1 -f $2
echo "Impala shell finished"
rm $2
@@ -1,8 +1,11 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Stats database creation
--------------------------------------------------------------
--------------------------------------------------------------
-- DROP database if EXISTS ${hive_db_name} cascade;
-- CREATE database ${hive_db_name};
--
-- CREATE TABLE ${hive_db_name}.Persons (
--     PersonID int,
--     LastName varchar(255));
--
-- INSERT INTO ${hive_db_name}.Persons VALUES (1, "test_db_spyros_rec_111");

DROP database IF EXISTS ${stats_db_name} CASCADE;
CREATE database ${stats_db_name};
@@ -1,21 +0,0 @@
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
CREATE OR REPLACE VIEW ${stats_db_name}.context AS SELECT * FROM ${external_stats_db_name}.context;
CREATE OR REPLACE VIEW ${stats_db_name}.category AS SELECT * FROM ${external_stats_db_name}.category;
CREATE OR REPLACE VIEW ${stats_db_name}.concept AS SELECT * FROM ${external_stats_db_name}.concept;

------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
create table ${stats_db_name}.creation_date as select date_format(current_date(), 'dd-MM-yyyy') as date;
@@ -0,0 +1,7 @@
----------------------------------------------------------------
----------------------------------------------------------------
-- Organization table/view and Organization related tables/views
----------------------------------------------------------------
----------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.organization;
CREATE TABLE ${stats_db_name}.organization AS SELECT substr(o.id, 4) as id, o.legalname.value as name, o.country.classid as country from ${openaire_db_name}.organization o WHERE o.datainfo.deletedbyinference=false;

@@ -0,0 +1 @@
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS SELECT organization AS id, id AS datasource FROM ${stats_db_name}.datasource_organizations;

@@ -0,0 +1 @@
CREATE OR REPLACE VIEW ${stats_db_name}.organization_projects AS SELECT id AS project, organization as id FROM ${stats_db_name}.project_organizations;
@@ -1,44 +1,10 @@
----------------------------------------------------------------
----------------------------------------------------------------
-- Post processing - Updates on main tables
----------------------------------------------------------------
----------------------------------------------------------------

--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource);

-- Project temporary table update and final project table creation with final updates that cannot be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');

DROP TABLE IF EXISTS ${stats_db_name}.project;
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id, p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration,
    CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
    CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
    CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
    CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
    p.callidentifier, p.code
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
    FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id
    WHERE r.type='publication'
    GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub, count(distinct r.id) AS dp
    FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r
    WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
    GROUP BY pp.id) AS prr2
ON prr2.id = p.id;

-- Publication temporary table updates
UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

-- Dataset temporary table updates
UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

-- Software temporary table updates
UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

-- Otherresearchproduct temporary table updates
UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend AS daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id AND result.type='publication' AND project.id=result_projects.project;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountryGDP, roarmap, rndexpediture)
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS SELECT * FROM ${external_stats_db_name}.fundref;
CREATE OR REPLACE VIEW ${stats_db_name}.country AS SELECT * FROM ${external_stats_db_name}.country;
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS SELECT * FROM ${external_stats_db_name}.countrygdp;
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS SELECT * FROM ${external_stats_db_name}.roarmap;
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture;
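The ${stats_db_name}-style placeholders in these statements are resolved at execution time, not by Hive itself. A minimal sketch of driving one of the resulting tables from code, assuming a Hive-enabled SparkSession and an illustrative database name (both assumptions, not part of the workflow):

    import org.apache.spark.sql.SparkSession

    object StatsStepSketch {
      def main(args: Array[String]): Unit = {
        // assumed setup; the real workflow runs these scripts through Oozie
        val spark = SparkSession.builder().appName("stats-step").enableHiveSupport().getOrCreate()
        val statsDb = "stats_db" // hypothetical value substituted for ${stats_db_name}
        // inspect the haspubs flag computed by the project table creation above
        spark.sql(s"SELECT haspubs, count(*) FROM $statsDb.project GROUP BY haspubs").show()
      }
    }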
@@ -1,38 +0,0 @@
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------

DROP TABLE IF EXISTS ${stats_db_name}.datasource;
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS SELECT * FROM ${stats_db_name}.datasource_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.publication;
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS SELECT * FROM ${stats_db_name}.publication_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.dataset;
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS SELECT * FROM ${stats_db_name}.dataset_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.software;
CREATE TABLE ${stats_db_name}.software stored AS parquet AS SELECT * FROM ${stats_db_name}.software_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS SELECT * FROM ${stats_db_name}.otherresearchproduct_tmp;

DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;

----------------------------------------------
-- Re-creating views from final parquet tables
----------------------------------------------

-- Result
CREATE OR REPLACE VIEW ${stats_db_name}.result AS SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence AS access_mode FROM ${stats_db_name}.otherresearchproduct;

-------------------------------------------------------------------------------
-- To see with Antonis if the following is needed and where it should be placed
-------------------------------------------------------------------------------
CREATE TABLE ${stats_db_name}.numbers_country AS SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications FROM ${stats_db_name}.result r, ${stats_db_name}.result_datasources rd, ${stats_db_name}.datasource d, ${stats_db_name}.datasource_organizations dor, ${stats_db_name}.organization org WHERE r.id=rd.id AND rd.datasource=d.id AND d.id=dor.id AND dor.organization=org.id AND r.type='publication' AND r.bestlicence='Open Access' GROUP BY org.country;
@@ -0,0 +1,6 @@
----------------------------
-- Post processing - Updates
----------------------------

--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp SET harvested='true' WHERE datasource_tmp.id IN (SELECT DISTINCT d.id FROM ${stats_db_name}.datasource_tmp d, ${stats_db_name}.result_datasources rd WHERE d.id=rd.datasource);

@@ -0,0 +1,2 @@
-- Project temporary table update and final project table creation with final updates that cannot be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp SET haspubs='yes' WHERE project_tmp.id IN (SELECT pr.id FROM ${stats_db_name}.project_results pr, ${stats_db_name}.result r WHERE pr.result=r.id AND r.type='publication');

@@ -0,0 +1,20 @@
DROP TABLE IF EXISTS ${stats_db_name}.project;

CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id, p.acronym, p.title, p.funder, p.funding_lvl0, p.funding_lvl1, p.funding_lvl2, p.ec39, p.type, p.startdate, p.enddate, p.start_year, p.end_year, p.duration,
    CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END as haspubs,
    CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END as numpubs,
    CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END as daysforlastpub,
    CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END as delayedpubs,
    p.callidentifier, p.code
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
    FROM ${stats_db_name}.project_results pr INNER JOIN ${stats_db_name}.result r ON pr.result=r.id
    WHERE r.type='publication'
    GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id, max(datediff(to_date(r.date), to_date(pp.enddate))) as daysForlastPub, count(distinct r.id) as dp
    FROM ${stats_db_name}.project_tmp pp, ${stats_db_name}.project_results pr, ${stats_db_name}.result r
    WHERE pp.id=pr.id AND pr.result=r.id AND r.type='publication' AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
    GROUP BY pp.id) AS prr2
on prr2.id = p.id;

@@ -0,0 +1,2 @@
-- Publication temporary table updates
UPDATE ${stats_db_name}.publication_tmp SET delayed = 'yes' WHERE publication_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

@@ -0,0 +1,2 @@
-- Dataset temporary table updates
UPDATE ${stats_db_name}.dataset_tmp SET delayed = 'yes' WHERE dataset_tmp.id IN (SELECT distinct r.id FROM stats_wf_db_obs.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

@@ -0,0 +1,2 @@
-- Software temporary table updates
UPDATE ${stats_db_name}.software_tmp SET delayed = 'yes' WHERE software_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

@@ -0,0 +1,2 @@
-- Otherresearchproduct temporary table updates
UPDATE ${stats_db_name}.otherresearchproduct_tmp SET delayed = 'yes' WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id FROM ${stats_db_name}.result r, ${stats_db_name}.project_results pr, ${stats_db_name}.project_tmp p WHERE r.id=pr.result AND pr.id=p.id AND to_date(r.date)-to_date(p.enddate) > 0);

@@ -0,0 +1 @@
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS SELECT result_projects.id AS result, result_projects.project AS project_results, result.date as resultdate, project.enddate as projectenddate, result_projects.daysfromend as daysfromend FROM ${stats_db_name}.result_projects, ${stats_db_name}.result, ${stats_db_name}.project WHERE result_projects.id=result.id and result.type='publication' and project.id=result_projects.project;
@@ -1,59 +1,26 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Sources related tables/views
------------------------------------------------------
------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
    SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
    from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
    SELECT substr(d.id, 4) id
    from ${openaire_db_name}.datasource d
    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
------------------------------------------------------------------------------------------------------
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------

DROP TABLE IF EXISTS ${stats_db_name}.datasource;
CREATE TABLE ${stats_db_name}.datasource stored as parquet as select * from ${stats_db_name}.datasource_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.publication;
CREATE TABLE ${stats_db_name}.publication stored as parquet as select * from ${stats_db_name}.publication_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.dataset;
CREATE TABLE ${stats_db_name}.dataset stored as parquet as select * from ${stats_db_name}.dataset_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.software;
CREATE TABLE ${stats_db_name}.software stored as parquet as select * from ${stats_db_name}.software_tmp;

DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored as parquet as select * from ${stats_db_name}.otherresearchproduct_tmp;

DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
    SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
    from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
    SELECT substr(d.id, 4) id
    from ${openaire_db_name}.datasource d
    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
    SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
    from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
    SELECT substr(d.id, 4) id
    from ${openaire_db_name}.datasource d
    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
    SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
    from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
LEFT OUTER JOIN
(
    SELECT substr(d.id, 4) id
    from ${openaire_db_name}.datasource d
    WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources
UNION ALL
SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL
SELECT * FROM ${stats_db_name}.software_sources
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
@@ -1,49 +1,7 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Licences related tables/views
------------------------------------------------------
------------------------------------------------------
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
----------------------------------------------
-- Re-creating views from final parquet tables
----------------------------------------------

CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
-- Result
CREATE OR REPLACE VIEW ${stats_db_name}.result as SELECT *, bestlicence as access_mode FROM ${stats_db_name}.publication UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.software UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.dataset UNION ALL SELECT *, bestlicence as access_mode FROM ${stats_db_name}.otherresearchproduct;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;

CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids AS
select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources as
SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM (
    SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
    from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
LEFT OUTER JOIN (
    SELECT substr(d.id, 4) id
    from ${openaire_db_name}.datasource d
    WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id;
@@ -1,36 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Additional relations
--
-- Refereed related tables/views
------------------------------------------------------
------------------------------------------------------

CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;

CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;

CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
union all
select * from ${stats_db_name}.dataset_refereed
union all
select * from ${stats_db_name}.software_refereed
union all
select * from ${stats_db_name}.otherresearchproduct_refereed;
@@ -1,80 +0,0 @@
----------------------------------------------------
-- Shortcuts for various definitions in stats db ---
----------------------------------------------------

-- Peer reviewed:
-- Results that have been collected from Crossref
create table ${stats_db_name}.result_peerreviewed as
with peer_reviewed as (
    select distinct r.id as id
    from ${stats_db_name}.result r
    join ${stats_db_name}.result_sources rs on rs.id=r.id
    join ${stats_db_name}.datasource d on d.id=rs.datasource
    where d.name='Crossref')
select distinct peer_reviewed.id as id, true as peer_reviewed
from peer_reviewed
union all
select distinct r.id as id, false as peer_reviewed
from ${stats_db_name}.result r
left outer join peer_reviewed pr on pr.id=r.id
where pr.id is null;

-- Green OA:
-- OA results that are hosted by an Institutional repository and have NOT been harvested from a DOAJ journal.
create table ${stats_db_name}.result_greenoa as
with result_green as (
    select distinct r.id as id
    from ${stats_db_name}.result r
    join ${stats_db_name}.result_datasources rd on rd.id=r.id
    join ${stats_db_name}.datasource d on d.id=rd.datasource
    left outer join (
        select rd.id from ${stats_db_name}.result_datasources rd
        join ${stats_db_name}.datasource d on rd.datasource=d.id
        join ${stats_db_name}.datasource_sources sds on sds.id=d.id
        join ${stats_db_name}.datasource sd on sd.id=sds.datasource
        where sd.name='DOAJ-ARTICLES'
    ) as doaj on doaj.id=r.id
    where r.bestlicence in ('Open Access', 'Open Source') and d.type='Institutional Repository' and doaj.id is null)
select distinct result_green.id, true as green
from result_green
union all
select distinct r.id as id, false as green
from ${stats_db_name}.result r
left outer join result_green rg on rg.id=r.id
where rg.id is null;

-- GOLD OA:
-- OA results that have been harvested from a DOAJ journal.
create table ${stats_db_name}.result_gold as
with result_gold as (
    select distinct r.id as id
    from ${stats_db_name}.result r
    join ${stats_db_name}.result_datasources rd on rd.id=r.id
    join ${stats_db_name}.datasource d on d.id=rd.datasource
    join ${stats_db_name}.datasource_sources sds on sds.id=d.id
    join ${stats_db_name}.datasource sd on sd.id=sds.datasource
    where r.type='publication' and r.bestlicence='Open Access' and sd.name='DOAJ-Articles')
select distinct result_gold.id, true as gold
from result_gold
union all
select distinct r.id, false as gold
from ${stats_db_name}.result r
where r.id not in (select id from result_gold);

-- shortcut result-country through the organization affiliation
create table ${stats_db_name}.result_affiliated_country as
select r.id as id, o.country as country
from ${stats_db_name}.result r
join ${stats_db_name}.result_organization ro on ro.id=r.id
join ${stats_db_name}.organization o on o.id=ro.organization
where o.country is not null and o.country!='';

-- shortcut result-country through datasource of deposition
create table ${stats_db_name}.result_deposited_country as
select r.id as id, o.country as country
from ${stats_db_name}.result r
join ${stats_db_name}.result_datasources rd on rd.id=r.id
join ${stats_db_name}.datasource d on d.id=rd.datasource
join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
join ${stats_db_name}.organization o on o.id=dor.organization
where o.country is not null and o.country!='';
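Each shortcut above follows the same shape: the ids that satisfy the qualifying join are flagged true, and every remaining result gets a complementary false row via a union. A minimal DataFrame sketch of that pattern for the Gold OA case (assuming a Hive-enabled SparkSession; statsDb is a hypothetical stand-in for the substituted ${stats_db_name}):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.lit

    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
    val statsDb = "stats_db" // hypothetical substituted value

    val allResults = spark.table(s"$statsDb.result").select("id").distinct()
    // ids satisfying the Gold OA join above
    val gold = spark.sql(
      s"""select distinct r.id from $statsDb.result r
         |join $statsDb.result_datasources rd on rd.id = r.id
         |join $statsDb.datasource d on d.id = rd.datasource
         |join $statsDb.datasource_sources sds on sds.id = d.id
         |join $statsDb.datasource sd on sd.id = sds.datasource
         |where r.type = 'publication' and r.bestlicence = 'Open Access'
         |  and sd.name = 'DOAJ-Articles'""".stripMargin)
    // matching ids get true, everything else false: the same union-all shape as the SQL
    val flagged = gold.withColumn("gold", lit(true))
      .union(allResults.except(gold).withColumn("gold", lit(false)))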
@@ -1,55 +0,0 @@
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp;
CREATE TABLE ${stats_db_name}.result_tmp (
    id STRING,
    title STRING,
    publisher STRING,
    journal STRING,
    `date` STRING,
    `year` INT,
    bestlicence STRING,
    access_mode STRING,
    embargo_end_date STRING,
    delayed BOOLEAN,
    authors INT,
    source STRING,
    abstract BOOLEAN,
    type STRING,
    peer_reviewed BOOLEAN,
    green BOOLEAN,
    gold BOOLEAN)
clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.publication r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;

insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.dataset r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;

insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.software r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;

insert into ${stats_db_name}.result_tmp
select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
FROM ${stats_db_name}.otherresearchproduct r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;

drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
drop table ${stats_db_name}.result_tmp;
@@ -1,163 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Shadow schema table exchange
------------------------------------------------------
------------------------------------------------------

-- Dropping old views
DROP VIEW IF EXISTS ${stats_db_shadow_name}.country;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.countrygdp;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.dataset_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_organizations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.datasource_results;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.fundref;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.numbers_country;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.organization_projects;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_organizations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.project_results;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.publication_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_organization;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_projects;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.result_topics;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.rndexpediture;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.roarmap;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_citations;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_classifications;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_concepts;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_datasources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_languages;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_licenses;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_oids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_pids;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_sources;
DROP VIEW IF EXISTS ${stats_db_shadow_name}.software_topics;

-- Creating the shadow database, in case it doesn't exist
CREATE database IF NOT EXISTS ${stats_db_shadow_name};

-- Creating new views
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.country AS SELECT * FROM ${stats_db_name}.country;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project AS SELECT * FROM ${stats_db_name}.project;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result AS SELECT * FROM ${stats_db_name}.result;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software AS SELECT * FROM ${stats_db_name}.software;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_shadow_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
|
|
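
-- A minimal invocation sketch for the view definitions above (assumptions:
-- the file and database names are illustrative, not taken from this
-- repository; in production these scripts are typically parameterized by
-- the workflow engine that runs them):
--
--   hive --hivevar stats_db_name=openaire_stats \
--        --hivevar stats_db_shadow_name=openaire_stats_shadow \
--        -f createShadowViews.sql
--
-- Depending on the Hive version, the ${...} references may need the
-- hivevar: prefix (e.g. ${hivevar:stats_db_name}). The net effect is a
-- shadow database of views that simply point at the freshly built stats
-- tables, so consumers can be repointed without copying any data.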
@ -1,81 +0,0 @@
------------------------------------------------------
------------------------------------------------------
-- Impala table statistics - needed to make the tables
-- visible to Impala (an invocation sketch follows the
-- statements below)
------------------------------------------------------
------------------------------------------------------

COMPUTE STATS country;
COMPUTE STATS countrygdp;
COMPUTE STATS dataset;
COMPUTE STATS dataset_citations;
COMPUTE STATS dataset_classifications;
COMPUTE STATS dataset_concepts;
COMPUTE STATS dataset_datasources;
COMPUTE STATS dataset_languages;
COMPUTE STATS dataset_oids;
COMPUTE STATS dataset_pids;
COMPUTE STATS dataset_sources;
COMPUTE STATS dataset_topics;
COMPUTE STATS datasource;
COMPUTE STATS datasource_languages;
COMPUTE STATS datasource_oids;
COMPUTE STATS datasource_organizations;
COMPUTE STATS datasource_results;
COMPUTE STATS fundref;
COMPUTE STATS numbers_country;
COMPUTE STATS organization;
COMPUTE STATS organization_datasources;
COMPUTE STATS organization_projects;
COMPUTE STATS otherresearchproduct;
COMPUTE STATS otherresearchproduct_citations;
COMPUTE STATS otherresearchproduct_classifications;
COMPUTE STATS otherresearchproduct_concepts;
COMPUTE STATS otherresearchproduct_datasources;
COMPUTE STATS otherresearchproduct_languages;
COMPUTE STATS otherresearchproduct_licenses;
COMPUTE STATS otherresearchproduct_oids;
COMPUTE STATS otherresearchproduct_pids;
COMPUTE STATS otherresearchproduct_sources;
COMPUTE STATS otherresearchproduct_topics;
COMPUTE STATS project;
COMPUTE STATS project_oids;
COMPUTE STATS project_organizations;
COMPUTE STATS project_results;
COMPUTE STATS publication;
COMPUTE STATS publication_citations;
COMPUTE STATS publication_classifications;
COMPUTE STATS publication_concepts;
COMPUTE STATS publication_datasources;
COMPUTE STATS publication_languages;
COMPUTE STATS publication_licenses;
COMPUTE STATS publication_oids;
COMPUTE STATS publication_pids;
COMPUTE STATS publication_sources;
COMPUTE STATS publication_topics;
COMPUTE STATS result;
COMPUTE STATS result_citations;
COMPUTE STATS result_classifications;
COMPUTE STATS result_concepts;
COMPUTE STATS result_datasources;
COMPUTE STATS result_languages;
COMPUTE STATS result_licenses;
COMPUTE STATS result_oids;
COMPUTE STATS result_organization;
COMPUTE STATS result_pids;
COMPUTE STATS result_projects;
COMPUTE STATS result_sources;
COMPUTE STATS result_topics;
COMPUTE STATS rndexpediture;
COMPUTE STATS roarmap;
COMPUTE STATS software;
COMPUTE STATS software_citations;
COMPUTE STATS software_classifications;
COMPUTE STATS software_concepts;
COMPUTE STATS software_datasources;
COMPUTE STATS software_languages;
COMPUTE STATS software_licenses;
COMPUTE STATS software_oids;
COMPUTE STATS software_pids;
COMPUTE STATS software_sources;
COMPUTE STATS software_topics;
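
-- A minimal sketch of how this statistics step could be run (assumption:
-- the database and file names are illustrative, not taken from this
-- repository):
--
--   impala-shell -d openaire_stats -f computeTableStats.sql
--
-- COMPUTE STATS gathers table and column statistics for Impala's query
-- planner; for tables that were (re)created through Hive, an
-- INVALIDATE METADATA; is usually also required before Impala can see
-- them at all.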
@ -1,35 +0,0 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- Publication table/view and Publication related tables/views
--------------------------------------------------------------
--------------------------------------------------------------

-- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;

CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal,
p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors, concat_ws('\u003B',p.source.value) as source,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference=false;

CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;

CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language FROM ${openaire_db_name}.publication p;

CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids AS ids;

CREATE TABLE ${stats_db_name}.publication_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.pid) pids AS ppid;

CREATE TABLE ${stats_db_name}.publication_topics as select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic FROM ${openaire_db_name}.publication p LATERAL VIEW explode(p.subject) subjects AS subject;

-- Publication_citations
CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
@ -0,0 +1,10 @@
--------------------------------------------------------------
--------------------------------------------------------------
-- 2. Publication table/view and Publication related tables/views
--------------------------------------------------------------
--------------------------------------------------------------

-- Publication temporary table
DROP TABLE IF EXISTS ${stats_db_name}.publication_tmp;

CREATE TABLE ${stats_db_name}.publication_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
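
-- Note: publication_tmp is a bucketed, transactional ORC table, so the Hive
-- session running this script must have ACID support enabled. A minimal
-- configuration sketch (assumption: these are the standard Hive settings
-- for enabling transactions, not values taken from this repository):
--
--   SET hive.support.concurrency=true;
--   SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
--   SET hive.enforce.bucketing=true;  -- only needed on Hive 1.x
--   SET hive.exec.dynamic.partition.mode=nonstrict;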
@ -0,0 +1,19 @@
-- The following statement throws this exception on CRN HUE Hive:
-- Error while compiling statement: FAILED: SemanticException [Error 10011]: Line 2:34 Invalid function 'date_format'
-- but runs fine on OCEAN HUE Hive.
-- (A portable alternative for the year column is sketched at the end of this file.)

INSERT INTO ${stats_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal,
p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors, concat_ws('\u003B',p.source.value) as source,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference=false;

-- INSERT INTO ${hive_db_name}.publication_tmp SELECT substr(p.id, 4) as id, p.title[0].value as title, p.publisher.value as publisher, p.journal.name as journal,
-- p.dateofacceptance.value as date, date_format(p.dateofacceptance.value,'yyyy') as year, p.bestaccessright.classname as bestlicence,
-- p.embargoenddate.value as embargo_end_date, false as delayed, size(p.author) as authors, concat_ws('\u003B',p.source.value) as source,
-- case when size(p.description) > 0 then true else false end as abstract,
-- 'publication' as type
-- from openaire.publication p
-- where p.datainfo.deletedbyinference=false;
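
-- A hedged workaround sketch for Hive installations where date_format is
-- unavailable (assumption: dateofacceptance values start with a yyyy-MM-dd
-- prefix, as the statement above already relies on). substr exists on old
-- Hive versions and yields the same four-digit year:
--
--   ... substr(p.dateofacceptance.value, 1, 4) as year, ...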
@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.publication_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.publication_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.publication_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN (SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.publication_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.publication p;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.publication_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.publication p LATERAL VIEW explode(p.originalid) oids as ids;

@ -0,0 +1 @@
create table ${stats_db_name}.publication_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.publication p lateral view explode(p.pid) pids as ppid;

@ -0,0 +1 @@
create table ${stats_db_name}.publication_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.publication p lateral view explode(p.subject) subjects as subject;
@ -1,36 +1,2 @@
------------------------------------------------------
------------------------------------------------------
-- Dataset table/view and Dataset related tables/views
------------------------------------------------------
------------------------------------------------------

-- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored AS orc tblproperties('transactional'='true');

INSERT INTO ${stats_db_name}.dataset_tmp SELECT substr(d.id, 4) AS id, d.title[0].value AS title, d.publisher.value AS publisher, cast(null AS string) AS journal,
d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') AS year, d.bestaccessright.classname AS bestlicence,
d.embargoenddate.value AS embargo_end_date, false AS delayed, size(d.author) AS authors, concat_ws('\u003B',d.source.value) AS source,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference=FALSE;

-- Dataset_citations
CREATE TABLE ${stats_db_name}.dataset_citations AS SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.dataset d LATERAL VIEW explode(d.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;

CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;

CREATE TABLE ${stats_db_name}.dataset_datasources AS SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;

CREATE TABLE ${stats_db_name}.dataset_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.dataset p;

CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids AS ids;

CREATE TABLE ${stats_db_name}.dataset_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.pid) pids AS ppid;

CREATE TABLE ${stats_db_name}.dataset_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.dataset p LATERAL VIEW explode(p.subject) subjects AS subject;
-- 3. Publication_citations
CREATE TABLE ${stats_db_name}.publication_citations AS SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS result FROM ${openaire_db_name}.publication p lateral view explode(p.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";
@ -1,36 +0,0 @@
--------------------------------------------------------
--------------------------------------------------------
-- Software table/view and Software related tables/views
--------------------------------------------------------
--------------------------------------------------------

-- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');

INSERT INTO ${stats_db_name}.software_tmp SELECT substr(s.id, 4) as id, s.title[0].value AS title, s.publisher.value AS publisher, CAST(NULL AS string) AS journal,
s.dateofacceptance.value AS DATE, date_format(s.dateofacceptance.value,'yyyy') AS YEAR, s.bestaccessright.classname AS bestlicence,
s.embargoenddate.value AS embargo_end_date, FALSE AS delayed, SIZE(s.author) AS authors, concat_ws('\u003B',s.source.value) AS source,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference=false;

-- Software_citations
CREATE TABLE ${stats_db_name}.software_citations AS SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.software s LATERAL VIEW explode(s.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

CREATE TABLE ${stats_db_name}.software_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;

CREATE TABLE ${stats_db_name}.software_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context;

CREATE TABLE ${stats_db_name}.software_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.instance) instances AS instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id FROM ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d ON p.datasource = d.id;

CREATE TABLE ${stats_db_name}.software_languages AS select substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.software p;

CREATE TABLE ${stats_db_name}.software_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.originalid) oids AS ids;

CREATE TABLE ${stats_db_name}.software_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.pid) pids AS ppid;

CREATE TABLE ${stats_db_name}.software_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.subject) subjects AS subject;
@ -0,0 +1,9 @@
------------------------------------------------------
------------------------------------------------------
-- 4. Dataset table/view and Dataset related tables/views
------------------------------------------------------
------------------------------------------------------

-- Dataset temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.dataset_tmp;
CREATE TABLE ${stats_db_name}.dataset_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
@ -0,0 +1 @@
create table ${stats_db_name}.dataset_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.dataset p lateral view explode(p.subject) subjects as subject;

@ -0,0 +1,7 @@
INSERT INTO ${stats_db_name}.dataset_tmp select substr(d.id, 4) as id, d.title[0].value as title, d.publisher.value as publisher, cast(null as string) as journal,
d.dateofacceptance.value as date, date_format(d.dateofacceptance.value,'yyyy') as year, d.bestaccessright.classname as bestlicence,
d.embargoenddate.value as embargo_end_date, false as delayed, size(d.author) as authors, concat_ws('\u003B',d.source.value) as source,
case when size(d.description) > 0 then true else false end as abstract,
'dataset' as type
from ${openaire_db_name}.dataset d
where d.datainfo.deletedbyinference=false;

@ -0,0 +1,2 @@
-- Dataset_citations
Create table ${stats_db_name}.dataset_citations as select substr(d.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") as result from ${openaire_db_name}.dataset d lateral view explode(d.extrainfo) citations as citation where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.dataset_classifications AS SELECT substr(p.id, 4) as id, instancetype.classname as type from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.instancetype) instances as instancetype;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.dataset_concepts AS SELECT substr(p.id, 4) as id, contexts.context.id as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context;

@ -0,0 +1,3 @@
CREATE TABLE ${stats_db_name}.dataset_datasources as SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource FROM (SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.dataset p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.dataset_languages AS select substr(p.id, 4) as id, p.language.classname as language from ${openaire_db_name}.dataset p;

@ -0,0 +1 @@
CREATE TABLE ${stats_db_name}.dataset_oids AS SELECT substr(p.id, 4) as id, oids.ids as oid from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.originalid) oids as ids;

@ -0,0 +1 @@
create table ${stats_db_name}.dataset_pids as select substr(p.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid from ${openaire_db_name}.dataset p lateral view explode(p.pid) pids as ppid;
@ -1,37 +0,0 @@
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Otherresearchproduct table/view and Otherresearchproduct related tables/views
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

-- Otherresearchproduct temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_tmp;
CREATE TABLE ${stats_db_name}.otherresearchproduct_tmp ( id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties('transactional'='true');

INSERT INTO ${stats_db_name}.otherresearchproduct_tmp SELECT substr(o.id, 4) AS id, o.title[0].value AS title, o.publisher.value AS publisher, CAST(NULL AS string) AS journal,
o.dateofacceptance.value AS DATE, date_format(o.dateofacceptance.value,'yyyy') AS year, o.bestaccessright.classname AS bestlicence,
o.embargoenddate.value as embargo_end_date, FALSE AS delayed, SIZE(o.author) AS authors, concat_ws('\u003B',o.source.value) AS source,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference=FALSE;

-- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS RESULT FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") !="";

CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS SELECT substr(p.id, 4) AS id, instancetype.classname AS type FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype;

CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS SELECT substr(p.id, 4) AS id, contexts.context.id AS concept FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context;

CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance) p LEFT OUTER JOIN
(SELECT substr(d.id, 4) id from ${openaire_db_name}.datasource d WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;

CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS SELECT substr(p.id, 4) AS id, p.language.classname AS language FROM ${openaire_db_name}.otherresearchproduct p;

CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS SELECT substr(p.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids;

CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid;

CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject;
@ -0,0 +1,9 @@
--------------------------------------------------------
--------------------------------------------------------
-- 5. Software table/view and Software related tables/views
--------------------------------------------------------
--------------------------------------------------------

-- Software temporary table supporting updates
DROP TABLE IF EXISTS ${stats_db_name}.software_tmp;
CREATE TABLE ${stats_db_name}.software_tmp (id STRING, title STRING, publisher STRING, journal STRING, date STRING, year STRING, bestlicence STRING, embargo_end_date STRING, delayed BOOLEAN, authors INT, source STRING, abstract BOOLEAN, type STRING ) clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');

@ -0,0 +1 @@
create table ${stats_db_name}.software_topics as select substr(p.id, 4) as id, subjects.subject.qualifier.classname as type, subjects.subject.value as topic from ${openaire_db_name}.software p lateral view explode(p.subject) subjects as subject;