merging with branch beta

Miriam Baglioni 2022-02-15 11:01:14 +01:00
commit 91d3a47110
50 changed files with 881 additions and 132 deletions

View File

@@ -185,6 +185,22 @@ class OafMapperUtilsTest {
.getClassid());
}
@Test
void testDelegatedAuthority() throws IOException {
Dataset d1 = read("dataset_2.json", Dataset.class);
Dataset d2 = read("dataset_delegated.json", Dataset.class);
assertEquals(1, d2.getCollectedfrom().size());
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
Result res = OafMapperUtils.mergeResults(d1, d2);
assertEquals(d2, res);
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}
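
For context: the delegated-authority rule asserted above means that when one of two equivalent records was collected from a delegated authority (here Zenodo), the merge is expected to return that record unchanged rather than a field-by-field combination. A minimal sketch of that selection rule; isFromDelegatedAuthority and fieldWiseMerge are hypothetical helpers, not the actual OafMapperUtils API:

    // Sketch only: prefer the record coming from a delegated authority.
    // isFromDelegatedAuthority and fieldWiseMerge are hypothetical helpers.
    static Result mergePreferringDelegatedAuthority(Result left, Result right) {
        boolean l = isFromDelegatedAuthority(left);
        boolean r = isFromDelegatedAuthority(right);
        if (r && !l)
            return right; // the case exercised above: d2 is collected from Zenodo
        if (l && !r)
            return left;
        return fieldWiseMerge(left, right); // otherwise fall back to the ordinary merge
    }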

View File

@@ -1 +1,140 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}
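
Note: dataset_delegated.json below is identical to dataset_2.json above except for its collectedfrom entries, which point to Zenodo instead of Repository B; this is what marks the record as coming from a delegated authority in the test above.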

View File

@@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

View File

@@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
@@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.read()
.textFile(inputPath + "/*")
.map(
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
Encoders.bean(COCI.class))
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
@@ -98,26 +103,29 @@
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
private static List<Relation> createRelation(COCI value, boolean duplicate) {
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
String citing = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
final String cited = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
relationList
.addAll(
getRelations(
citing,
cited));
if(!citing.equals(cited)){
relationList
.addAll(
getRelations(
citing,
cited));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
if (duplicate && value.getCiting().endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(
CleaningFunctions
.normalizePidValue("doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
}
}
return relationList;
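
For reference, each surviving citing/cited pair yields relations in both directions, which is what the updated tests below count (equal numbers of Cites and IsCitedBy). A sketch of what getRelations presumably produces; relation(...) is a hypothetical factory method:

    // Sketch: the two relations implied by one COCI row.
    private static List<Relation> getRelations(String citing, String cited) {
        return Arrays.asList(
            relation(citing, cited, "Cites"),      // citing cites cited
            relation(cited, citing, "IsCitedBy")); // inverse direction
    }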

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class ReadCOCI implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
ReadCOCI.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile {}", inputFile.toString());
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
SparkConf sconf = new SparkConf();
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
doRead(
spark,
workingPath,
inputFile,
outputPath,
delimiter);
});
}
private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
String outputPath,
String delimiter) throws IOException {
for(String inputFile : inputFiles){
String p_string = workingPath + "/" + inputFile + ".gz";
Dataset<Row> cociData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(p_string)
.repartition(100);
cociData.map((MapFunction<Row, COCI>) row -> {
COCI coci = new COCI();
coci.setOci(row.getString(0));
coci.setCiting(row.getString(1));
coci.setCited(row.getString(2));
return coci;
}, Encoders.bean(COCI.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + inputFile);
}
}
}
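
To make the conversion concrete: doRead keeps only the first three CSV columns (oci, citing, cited) and writes them as gzipped JSON lines. Using the first row of the sample files further down, the output would look roughly like this (a sketch, with the oci value shortened):

    // input CSV row:
    //   0200...046333-0200...083733,10.1007/s10854-015-3684-x,10.1111/j.1551-2916.2008.02408.x,2015-09-01,P7Y2M,no,no
    // expected JSON line:
    //   {"oci":"0200...046333-0200...083733","citing":"10.1007/s10854-015-3684-x","cited":"10.1111/j.1551-2916.2008.02408.x"}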

View File

@@ -0,0 +1,41 @@
package eu.dnetlib.dhp.actionmanager.opencitations.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class COCI implements Serializable {
private String oci;
private String citing;
private String cited;
public String getOci() {
return oci;
}
public void setOci(String oci) {
this.oci = oci;
}
public String getCiting() {
return citing;
}
public void setCiting(String citing) {
this.citing = citing;
}
public String getCited() {
return cited;
}
public void setCited(String cited) {
this.cited = cited;
}
}

View File

@@ -0,0 +1,37 @@
[
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "d",
"paramLongName": "delimiter",
"paramDescription": "the hdfs name node",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the hdfs name node",
"paramRequired": true
},
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the hdfs name node",
"paramRequired": true
}
]

View File

@@ -26,6 +26,7 @@
<switch>
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
<case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
@@ -60,6 +61,32 @@
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="read"/>
<error to="Kill"/>
</action>
<action name="read">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the AS for OC</name>
<class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
</spark>
<ok to="create_actionset"/>
<error to="Kill"/>
</action>
@@ -81,7 +108,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@@ -76,7 +76,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -99,7 +99,7 @@ public class CreateOpenCitationsASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(60, tmp.count());
assertEquals(62, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@@ -110,7 +110,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -131,7 +131,7 @@ public class CreateOpenCitationsASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(44, tmp.count());
assertEquals(46, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
@@ -142,7 +142,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -175,7 +175,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -215,7 +215,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -240,8 +240,8 @@ public class CreateOpenCitationsASTest {
assertEquals("citation", r.getSubRelType());
assertEquals("resultResult", r.getRelType());
});
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
}
@@ -250,7 +250,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob
@@ -295,7 +295,7 @@ public class CreateOpenCitationsASTest {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
.getPath();
CreateActionSetSparkJob

View File

@@ -0,0 +1,140 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ReadCOCITest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(ReadCOCITest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(ReadCOCITest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ReadCOCITest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ReadCOCITest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testReadCOCI() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
ReadCOCI
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-workingPath",
workingDir.toString() + "/COCI",
"-outputPath",
workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<COCI> tmp = sc
.textFile(workingDir.toString() + "/COCI_json/*/")
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
Assertions.assertEquals(24, tmp.count());
Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());
Assertions.assertEquals(8, tmp.filter(c -> c.getCiting().indexOf(".refs") > -1).count());
}
}
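
The assertions pin down the expected totals: 24 COCI records across the five inputs, exactly one with citing DOI 10.1207/s15327647jcd3,4-01, and 8 whose citing DOI still carries the .refs suffix (the duplication case handled in CreateActionSetSparkJob above).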

View File

@@ -1,8 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
02001000007362801000805046300010563030608046333-0200101010136193701050501630209010637020000083700020400083733,10.1007/s10854-015-3684-x,10.1111/j.1551-2916.2008.02408.x,2015-09-01,P7Y2M,no,no
02001000007362801000805046300010563030608046333-02001000007362801000805046300010463020101046309,10.1007/s10854-015-3684-x,10.1007/s10854-014-2114-9,2015-09-01,P1Y2M4D,yes,no
02001000007362801000805046300010563030608046333-020010001063619371214271022182329370200010337000937000609,10.1007/s10854-015-3684-x,10.1016/j.ceramint.2013.09.069,2015-09-01,P1Y6M,no,no
02001000007362801000805046300010563030608046333-02001000007362801000805046300000963090901036304,10.1007/s10854-015-3684-x,10.1007/s10854-009-9913-4,2015-09-01,P6Y3M10D,yes,no
02001000007362801000805046300010563030608046333-02001000106360000030863010009085807025909000307006305,10.1007/s10854-015-3684-x,10.1016/0038-1098(72)90370-5,2015-09-01,P43Y8M,no,no
02001000007362801000805046300010563030608056309-02001000106361937281010370200010437000937000308,10.1007/s10854-015-3685-9,10.1016/j.saa.2014.09.038,2015-09-03,P0Y7M,no,no
02001000007362801000805046300010563030608056309-0200100010636193722102912171027370200010537000437000106,10.1007/s10854-015-3685-9,10.1016/j.matchar.2015.04.016,2015-09-03,P0Y2M,no,no

View File

@@ -1,8 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
02001000308362804010509076300010963000003086301-0200100020936020001003227000009010004,10.1038/s41597-019-0038-1,10.1029/2010wr009104,2019-04-15,P8Y1M,no,no
02001000308362804010509076300010963000003086301-0200100010636280103060463080105025800015900000006006303,10.1038/s41597-019-0038-1,10.1016/s1364-8152(01)00060-3,2019-04-15,P17Y3M,no,no
02001000308362804010509076300010963000003086301-02001000007362800000407076300010063000401066333,10.1038/s41597-019-0038-1,10.1007/s00477-010-0416-x,2019-04-15,P8Y9M6D,no,no
02001000308362804010509076300010963000003086301-02001000007362800000700046300010363000905016308,10.1038/s41597-019-0038-1,10.1007/s00704-013-0951-8,2019-04-15,P5Y9M23D,no,no
02001000308362804010509076300010963000003086301-02001000002361924123705070707,10.1038/s41597-019-0038-1,10.1002/joc.5777,2019-04-15,P0Y8M1D,no,no
02001000308362804010509076300010963000003086301-02005010904361714282863020263040504076302000108,10.1038/s41597-019-0038-1,10.5194/hess-22-4547-2018,2019-04-15,P0Y7M18D,no,no
02001000308362804010509076300010963000003086301-02001000002361924123703050404,10.1038/s41597-019-0038-1,10.1002/joc.3544,2019-04-15,P6Y9M6D,no,no

View File

@@ -1,9 +0,0 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
0200100000236090708010101090307000202023727141528-020050302063600040000010307,10.1002/9781119370222.refs,10.5326/0400137,2020-06-22,P16Y3M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020000073700000301093733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2007.00319.x,2020-06-22,P12Y8M,no,no
0200100000236090708010101090307000202023727141528-0200101010136312830370102030509,10.1002/9781119370222.refs,10.1111/vsu.12359,2020-06-22,P4Y10M29D,no,no
0200100000236090708010101090307000202023727141528-020050302063600030900020904,10.1002/9781119370222.refs,10.5326/0390294,2020-06-22,P17Y1M,no,no
0200100000236090708010101090307000202023727141528-020050302063600040200030701,10.1002/9781119370222.refs,10.5326/0420371,2020-06-22,P13Y9M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020001033701020000003733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2013.12000.x,2020-06-22,P7Y2M,no,no
0200100000236090708010101090307000202023727141528-020010008003600000408000106093702000006370306070200,10.1002/9781119370222.refs,10.1080/00480169.2006.36720,2020-06-22,P13Y6M,no,no
0200100000236090708010101090307000202023727141528-0200101010136193701070501630008010337020000063700000003033733,10.1002/9781119370222.refs,10.1111/j.1751-0813.2006.00033.x,2020-06-22,P13Y8M,no,no

View File

@@ -102,7 +102,8 @@ public class ResultTagger implements Serializable {
// .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
// .map(s -> StringUtils.substringAfter(s, "|"))
// .collect(Collectors.toCollection(HashSet::new))
tmp.forEach(
tmp
.forEach(
dsId -> datasources
.addAll(
conf.getCommunityForDatasource(dsId, param)));

View File

@@ -347,6 +347,10 @@ public abstract class AbstractMdRecordToOafMapper {
r.setCoverage(prepareCoverages(doc, info));
r.setContext(prepareContexts(doc, info));
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
r
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
r.setInstance(instances);
r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
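
Concretely, for an OpenAPC record such as the one added below, the two XPath expressions resolve as follows (sketch):

    // <oaf:processingchargeamount currency="EUR">1721.47</oaf:processingchargeamount>
    // doc.valueOf("//oaf:processingchargeamount")           -> "1721.47"
    // doc.valueOf("//oaf:processingchargeamount/@currency") -> "EUR"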

View File

@@ -814,6 +814,27 @@ class MappersTest {
}
}
@Test
void testOpenAPC() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_openapc.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, true, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
final Publication p = (Publication) list.get(0);
assertTrue(p.getInstance().size() > 0);
assertEquals("https://doi.org/10.1155/2015/439379", p.getInstance().get(0).getUrl().get(0));
assertTrue(p.getProcessingchargeamount() != null);
assertTrue(p.getProcessingchargecurrency() != null);
assertEquals("1721.47", p.getProcessingchargeamount().getValue());
assertEquals("EUR", p.getProcessingchargecurrency().getValue());
}
private void assertValidId(final String id) {
// System.out.println(id);

View File

@@ -0,0 +1,45 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:date="http://exslt.org/dates-and-times"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<oai:header>
<dri:objIdentifier>openapc_____::000023f9cb6e3a247c764daec4273cbc</dri:objIdentifier>
<dri:recordIdentifier>10.1155/2015/439379</dri:recordIdentifier>
<dri:dateOfCollection>2022-02-01T15:26:33.817Z</dri:dateOfCollection>
<oaf:datasourceprefix>openapc_____</oaf:datasourceprefix>
<dr:dateOfTransformation>2022-02-02T15:45:32.502Z</dr:dateOfTransformation>
</oai:header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:identifier>https://doi.org/10.1155/2015/439379</dc:identifier>
<oaf:identifier identifierType="doi">10.1155/2015/439379</oaf:identifier>
<oaf:identifier identifierType="pmcid">PMC4354964</oaf:identifier>
<oaf:identifier identifierType="pmid">25811027.0</oaf:identifier>
<datacite:affiliation affiliationIdentifier="grid.83440.3b"
affiliationIdentifierScheme="GRID" schemeURI="https://www.grid.ac/">UCL</datacite:affiliation>
<datacite:affiliation
affiliationIdentifier="https://ror.org/02jx3x895" affiliationIdentifierScheme="ROR">UCL</datacite:affiliation>
<oaf:processingchargeamount currency="EUR">1721.47</oaf:processingchargeamount>
<oaf:journal issn="2314-6133">BioMed Research International</oaf:journal>
<dc:license>http://creativecommons.org/licenses/by/3.0/</dc:license>
<dc:date>2015</dc:date>
<dr:CobjCategory type="publication">0004</dr:CobjCategory>
<oaf:accessrights>OPEN</oaf:accessrights>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
<oaf:hostedBy id="apc_________::openapc" name="OpenAPC Global Initiative"/>
<oaf:collectedFrom id="apc_________::openapc" name="OpenAPC Global Initiative"/>
</metadata>
<oaf:about xmlns:oai="http://wwww.openarchives.org/OAI/2.0/">
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</oaf:about>
</oai:record>

View File

@@ -398,6 +398,16 @@ public class XmlRecordFactory implements Serializable {
if (r.getResourcetype() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype()));
}
if (r.getProcessingchargeamount() != null) {
metadata
.add(
XmlSerializationUtils
.asXmlElement("processingchargeamount", r.getProcessingchargeamount().getValue()));
metadata
.add(
XmlSerializationUtils
.asXmlElement("processingchargecurrency", r.getProcessingchargecurrency().getValue()));
}
}
switch (type) {
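
With both fields populated, the serialized record gains two extra elements, which the updated XmlRecordFactoryTest below checks (sketch of the expected fragment):

    // <processingchargeamount>1721.47</processingchargeamount>
    // <processingchargecurrency>EUR</processingchargecurrency>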

View File

@@ -66,6 +66,9 @@ public class XmlRecordFactoryTest {
assertEquals("10.5689/LIB.2018.2853550", doc.valueOf("//instance/alternateidentifier/text()"));
assertEquals(3, doc.selectNodes("//instance").size());
assertEquals("1721.47", doc.valueOf("//processingchargeamount/text()"));
assertEquals("EUR", doc.valueOf("//processingchargecurrency/text()"));
}
@Test

View File

@@ -1655,5 +1655,37 @@
},
"value": "Understanding Electromigration in Cu-CNT Composite Interconnects A Multiscale Electrothermal Simulation Study"
}
]
],
"processingchargeamount": {
"value": "1721.47",
"dataInfo": {
"invisible": true,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": "",
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"processingchargecurrency": {
"value": "EUR",
"dataInfo": {
"invisible": true,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": "",
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
}

View File

@@ -14,7 +14,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -25,7 +25,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -36,7 +36,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
@@ -47,7 +47,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources
@@ -76,8 +76,8 @@ join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
where reltype='resultResult'
and r1.resulttype.classname!=r2.resulttype.classname
and r1.datainfo.deletedbyinference=false
and r2.datainfo.deletedbyinference=false
and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false;
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
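
The SQL changes in this and the following stats scripts all apply the same pattern: wherever a query already filtered on datainfo.deletedbyinference = false, it now additionally requires datainfo.invisible = false, so that invisible records are excluded from the stats database as well.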

View File

@@ -8,22 +8,22 @@
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses AS
SELECT substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false;
where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
@@ -46,7 +46,7 @@ FROM (
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference=false) d on o.datasource = d.id;
WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication_licenses COMPUTE STATISTICS FOR COLUMNS;

View File

@@ -9,22 +9,22 @@
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed as
select substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false;
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE;
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed

View File

@@ -38,13 +38,13 @@ SELECT substr(p.id, 4) as id,
case when size(p.description) > 0 then true else false end as abstract,
'publication' as type
from ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_classifications AS
SELECT substr(p.id, 4) as id, instancetype.classname as type
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.instance.instancetype) instances as instancetype
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_concepts AS
SELECT substr(p.id, 4) as id, case
@@ -53,45 +53,45 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.publication p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_datasources as
SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference = false) p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.publication_languages AS
select substr(p.id, 4) as id, p.language.classname as language
FROM ${openaire_db_name}.publication p
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value as pid
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_topics as
select substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS TYPE, subjects.subject.value AS topic
FROM ${openaire_db_name}.publication p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.publication_citations AS
SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false;
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@@ -81,7 +81,11 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics;
create table TARGET.result_result stored as parquet as select * from SOURCE.result_result orig where exists (select 1 from TARGET.result r where r.id=orig.source or r.id=orig.target);
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;
drop view TARGET.foo1;
drop view TARGET.foo2;
compute stats TARGET.result_result;
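
The result_result rewrite replaces the single EXISTS clause (with an OR over source and target) by the DISTINCT union of two single-sided views, presumably so that each side can be matched independently before deduplication.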
-- datasources
@@ -126,7 +130,7 @@ compute stats TARGET.indi_result_has_cc_licence;
create table TARGET.indi_result_has_cc_licence_url stored as parquet as select * from SOURCE.indi_result_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_has_cc_licence_url;
create view TARGET.indi_funder_country_collab stored as select * from SOURCE.indi_funder_country_collab;
create view TARGET.indi_funder_country_collab stored as parquet as select * from SOURCE.indi_funder_country_collab;
create table TARGET.indi_result_with_orcid stored as parquet as select * from SOURCE.indi_result_with_orcid orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.indi_result_with_orcid;

View File

@@ -38,20 +38,20 @@ SELECT substr(d.id, 4) AS id,
CASE WHEN SIZE(d.description) > 0 THEN TRUE ELSE FALSE end AS abstract,
'dataset' AS type
FROM ${openaire_db_name}.dataset d
WHERE d.datainfo.deletedbyinference = FALSE;
WHERE d.datainfo.deletedbyinference = FALSE and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_citations AS
SELECT substr(d.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.dataset d
LATERAL VIEW explode(d.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and d.datainfo.deletedbyinference = false;
and d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_concepts AS
SELECT substr(p.id, 4) as id, case
@@ -60,7 +60,7 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
from ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.context) contexts as context
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_datasources AS
SELECT p.id, case when d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
@@ -68,31 +68,31 @@ FROM (
SELECT substr(p.id, 4) as id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.instance) instances AS instance
where p.datainfo.deletedbyinference = false) p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.dataset_languages AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.dataset p
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.dataset_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.dataset p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@@ -38,20 +38,20 @@ SELECT substr(s.id, 4) as id,
CASE WHEN SIZE(s.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'software' as type
from ${openaire_db_name}.software s
where s.datainfo.deletedbyinference = false;
where s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_citations AS
SELECT substr(s.id, 4) as id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.software s
LATERAL VIEW explode(s.extrainfo) citations as citation
where xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and s.datainfo.deletedbyinference = false;
and s.datainfo.deletedbyinference = false and s.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_concepts AS
SELECT substr(p.id, 4) as id, case
@@ -60,7 +60,7 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_datasources AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource end as datasource
@@ -68,31 +68,31 @@ FROM (
SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.instance) instances AS instance
where p.datainfo.deletedbyinference = false) p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
FROM ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d ON p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d ON p.datasource = d.id;
CREATE TABLE ${stats_db_name}.software_languages AS
select substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.software p
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.software_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.software p
LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@@ -37,19 +37,19 @@ SELECT substr(o.id, 4) AS id,
CASE WHEN SIZE(o.description) > 0 THEN TRUE ELSE FALSE END AS abstract,
'other' AS type
FROM ${openaire_db_name}.otherresearchproduct o
WHERE o.datainfo.deletedbyinference = FALSE;
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible=false;
-- Otherresearchproduct_citations
CREATE TABLE ${stats_db_name}.otherresearchproduct_citations AS
SELECT substr(o.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type='openaire']/@value") AS cites
FROM ${openaire_db_name}.otherresearchproduct o LATERAL VIEW explode(o.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and o.datainfo.deletedbyinference = false;
and o.datainfo.deletedbyinference = false and o.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_classifications AS
SELECT substr(p.id, 4) AS id, instancetype.classname AS type
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.instancetype) instances AS instancetype
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
SELECT substr(p.id, 4) as id, case
@@ -57,33 +57,33 @@ SELECT substr(p.id, 4) as id, case
when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_datasources AS
SELECT p.id, CASE WHEN d.id IS NULL THEN 'other' ELSE p.datasource END AS datasource
FROM (SELECT substr(p.id, 4) AS id, substr(instances.instance.hostedby.key, 4) AS datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.instance) instances as instance
where p.datainfo.deletedbyinference = false) p
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false) p
LEFT OUTER JOIN(SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
WHERE d.datainfo.deletedbyinference = false) d on p.datasource = d.id;
WHERE d.datainfo.deletedbyinference = false and d.datainfo.invisible=false) d on p.datasource = d.id;
CREATE TABLE ${stats_db_name}.otherresearchproduct_languages AS
SELECT substr(p.id, 4) AS id, p.language.classname AS language
FROM ${openaire_db_name}.otherresearchproduct p
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_pids AS
SELECT substr(p.id, 4) AS id, ppid.qualifier.classname AS type, ppid.value AS pid
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.pid) pids AS ppid
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.otherresearchproduct_topics AS
SELECT substr(p.id, 4) AS id, subjects.subject.qualifier.classname AS type, subjects.subject.value AS topic
FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.subject) subjects AS subject
where p.datainfo.deletedbyinference = false;
where p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@@ -5,24 +5,26 @@
------------------------------------------------------
CREATE TABLE ${stats_db_name}.project_oids AS
SELECT substr(p.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids;
FROM ${openaire_db_name}.project p LATERAL VIEW explode(p.originalid) oids AS ids
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_organizations AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization'
and r.datainfo.deletedbyinference = false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_results AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject'
and r.datainfo.deletedbyinference = false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
create table ${stats_db_name}.project_classification as
select substr(p.id, 4) as id, class.h2020programme.code, class.level1, class.level2, class.level3
from ${openaire_db_name}.project p
lateral view explode(p.h2020classification) classifs as class
where p.datainfo.deletedbyinference=false and class.h2020programme is not null;
where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false and class.h2020programme is not null;
CREATE TABLE ${stats_db_name}.project_tmp
(
@@ -72,7 +74,7 @@ SELECT substr(p.id, 4) AS id,
p.code.value AS code,
p.totalcost AS totalcost
FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false;
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
create table ${stats_db_name}.funder as
select distinct xpath_string(fund, '//funder/id') as id,

View File

@@ -127,7 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization'
and r.datainfo.deletedbyinference = false;
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.result_projects AS
select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend, pr.provenance as provenance

View File

@@ -44,7 +44,7 @@ FROM ${openaire_db_name}.datasource d1
LATERAL VIEW EXPLODE(originalid) temp AS originalidd
WHERE originalidd like "piwik:%") AS d2
ON d1.id = d2.id
WHERE d1.datainfo.deletedbyinference = FALSE;
WHERE d1.datainfo.deletedbyinference = FALSE and d1.datainfo.invisible=false;
-- Updating temporary table with everything that is not based on results -> This is done with the following "dual" table.
-- Creating a temporary dual table that will be removed after the following insert
@@ -82,24 +82,25 @@ WHERE yearofvalidation = '-1';
CREATE TABLE ${stats_db_name}.datasource_languages AS
SELECT substr(d.id, 4) AS id, langs.languages AS language
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages;
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.odlanguages.value) langs AS languages
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.datasource_oids AS
SELECT substr(d.id, 4) AS id, oids.ids AS oid
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids;
FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids
where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.datasource_organizations AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization'
and r.datainfo.deletedbyinference = false;
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
-- datasource sources:
-- where the datasource info have been collected from.
create table if not exists ${stats_db_name}.datasource_sources AS
select substr(d.id, 4) as id, substr(cf.key, 4) as datasource
from ${openaire_db_name}.datasource d lateral view explode(d.collectedfrom) cfrom as cf
where d.datainfo.deletedbyinference = false;
where d.datainfo.deletedbyinference = false and d.datainfo.invisible=false;
CREATE OR REPLACE VIEW ${stats_db_name}.datasource_results AS
SELECT datasource AS id, id AS result

View File

@@ -9,7 +9,7 @@ SELECT substr(o.id, 4) as id,
o.legalshortname.value as legalshortname,
o.country.classid as country
FROM ${openaire_db_name}.organization o
WHERE o.datainfo.deletedbyinference = FALSE;
WHERE o.datainfo.deletedbyinference = FALSE and o.datainfo.invisible = FALSE;
CREATE OR REPLACE VIEW ${stats_db_name}.organization_datasources AS
SELECT organization AS id, id AS datasource

View File

@@ -44,7 +44,7 @@
<pluginRepository>
<id>iis-releases</id>
<name>iis releases plugin repository</name>
<url>http://maven.ceon.pl/artifactory/iis-releases</url>
<url>https://maven.ceon.pl/artifactory/iis-releases</url>
<layout>default</layout>
</pluginRepository>
</pluginRepositories>

View File

@@ -797,7 +797,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.10.26]</dhp-schemas.version>
<dhp-schemas.version>[2.10.29]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>