Merge branch 'beta' into 7096-fileGZip-collector-plugin

Claudio Atzori 2022-04-21 11:42:43 +02:00
commit eabb40fccc
31 changed files with 412 additions and 59 deletions

View File

@@ -6,7 +6,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp-build</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-build-assembly-resources</artifactId>

View File

@@ -6,7 +6,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp-build</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-build-properties-maven-plugin</artifactId>

View File

@@ -5,7 +5,7 @@
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-code-style</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
    <packaging>jar</packaging>
@@ -47,12 +47,16 @@
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-site-plugin</artifactId>
                <version>3.9.1</version>
+               <configuration>
+                   <skip>true</skip>
+               </configuration>
            </plugin>
        </plugins>
    </pluginManagement>
</build>
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>

View File

@@ -4,7 +4,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-build</artifactId>
    <packaging>pom</packaging>

View File

@@ -5,7 +5,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

View File

@@ -4,7 +4,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp-workflows</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-actionmanager</artifactId>

View File

@@ -4,7 +4,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp-workflows</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <artifactId>dhp-aggregation</artifactId>
    <build>

View File

@@ -72,7 +72,6 @@ class DataciteToOAFTest extends AbstractVocabularyTest {
    spark.read.load(targetPath).printSchema();
    val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
    result

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -1,11 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-dedup-openaire</artifactId>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -446,16 +446,12 @@ case object Crossref2Oaf {
      case "10.13039/501100000781" =>
        generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
        generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
-      case "10.13039/100000001" =>
-        generateSimpleRelationFromAward(funder, "nsf_________", a => a)
-      case "10.13039/501100001665" =>
-        generateSimpleRelationFromAward(funder, "anr_________", a => a)
-      case "10.13039/501100002341" =>
-        generateSimpleRelationFromAward(funder, "aka_________", a => a)
+      case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
+      case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
+      case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
      case "10.13039/501100001602" =>
-        generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
-      case "10.13039/501100000923" =>
-        generateSimpleRelationFromAward(funder, "arc_________", a => a)
+        generateSimpleRelationFromAward(funder, "sfi_________", a => a.replace("SFI", ""))
+      case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
      case "10.13039/501100000038" =>
        val targetId = getProjectId("nserc_______", "1e5e62235d094afd01cd56e65112fc63")
        queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
@@ -468,14 +464,10 @@ case object Crossref2Oaf {
        val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
        queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
        queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
-      case "10.13039/501100002848" =>
-        generateSimpleRelationFromAward(funder, "conicytf____", a => a)
-      case "10.13039/501100003448" =>
-        generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
-      case "10.13039/501100010198" =>
-        generateSimpleRelationFromAward(funder, "sgov________", a => a)
-      case "10.13039/501100004564" =>
-        generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
+      case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
+      case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
+      case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a)
+      case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
      case "10.13039/501100003407" =>
        generateSimpleRelationFromAward(funder, "miur________", a => a)
        val targetId = getProjectId("miur________", "1e5e62235d094afd01cd56e65112fc63")
@@ -487,15 +479,11 @@ case object Crossref2Oaf {
          "irb_hr______",
          a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", "")
        )
-      case "10.13039/501100006769" =>
-        generateSimpleRelationFromAward(funder, "rsf_________", a => a)
-      case "10.13039/501100001711" =>
-        generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
-      case "10.13039/501100004410" =>
-        generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
-      case "10.10.13039/100004440" =>
-        generateSimpleRelationFromAward(funder, "wt__________", a => a)
+      case "10.13039/501100006769" => generateSimpleRelationFromAward(funder, "rsf_________", a => a)
+      case "10.13039/501100001711" => generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
+      case "10.13039/501100004410" => generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
      case "10.13039/100004440" =>
+        generateSimpleRelationFromAward(funder, "wt__________", a => a)
        val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
        queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
        queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
@@ -516,6 +504,7 @@ case object Crossref2Oaf {
      case "CONICYT, Programa de Formación de Capital Humano Avanzado" =>
        generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
      case "Wellcome Trust Masters Fellowship" =>
+        generateSimpleRelationFromAward(funder, "wt__________", a => a)
        val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
        queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
        queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
@@ -51,7 +51,7 @@
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-aggregation</artifactId>
-           <version>1.2.4-SNAPSHOT</version>
+           <version>1.2.5-SNAPSHOT</version>
            <scope>compile</scope>
        </dependency>

View File

@@ -274,7 +274,6 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
-           <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
            <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
        </spark>
@@ -304,7 +303,6 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
-           <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
            <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
        </spark>
@@ -334,7 +332,6 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
-           <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
            <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
        </spark>
@@ -364,7 +361,6 @@
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
            <arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
-           <arg>--saveGraph</arg><arg>${saveGraph}</arg>
            <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
            <arg>--outputPath</arg><arg>${outputPath}/software</arg>
        </spark>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -0,0 +1,113 @@
package eu.dnetlib.dhp.oa.graph.clean;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

public class CleanContextSparkJob implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {

        String jsonConfiguration = IOUtils
            .toString(
                CleanContextSparkJob.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json"));
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("inputPath");
        log.info("inputPath: {}", inputPath);

        String workingPath = parser.get("workingPath");
        log.info("workingPath: {}", workingPath);

        String contextId = parser.get("contextId");
        log.info("contextId: {}", contextId);

        String verifyParam = parser.get("verifyParam");
        log.info("verifyParam: {}", verifyParam);

        String graphTableClassName = parser.get("graphTableClassName");
        log.info("graphTableClassName: {}", graphTableClassName);

        Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath);
            });
    }

    private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
        String inputPath, Class<T> entityClazz, String workingPath) {
        Dataset<T> res = spark
            .read()
            .textFile(inputPath)
            .map(
                (MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
                Encoders.bean(entityClazz));

        res.map((MapFunction<T, T>) r -> {
            // leave the record untouched when none of its main titles starts with the verifyParam prefix
            if (!r.getTitle()
                .stream()
                .filter(t -> t.getQualifier().getClassid()
                    .equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid()))
                .anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) {
                return r;
            }
            // otherwise drop every context entry whose id prefix (before "::") matches the given contextId
            r.setContext(r.getContext().stream().filter(c -> !c.getId().split("::")[0]
                .equalsIgnoreCase(contextId)).collect(Collectors.toList()));
            return r;
        }, Encoders.bean(entityClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(workingPath);

        // copy the cleaned records from the working path back onto the original input path
        spark.read().textFile(workingPath)
            .map(
                (MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
                Encoders.bean(entityClazz))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(inputPath);
    }
}

View File

@@ -4,8 +4,6 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
import java.io.Serializable;
import java.util.List;

-import com.fasterxml.jackson.annotation.JsonIgnore;
-
public class BibJson implements Serializable {
    private Editorial editorial;
    private PidScheme pid_scheme;

View File

@ -0,0 +1,37 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path to the graph data dump to read",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},
{
"paramName": "ci",
"paramLongName": "contextId",
"paramDescription": "the id of the context to be removed",
"paramRequired": true
},
{
"paramName": "class",
"paramLongName": "graphTableClassName",
"paramDescription": "class name moelling the graph table",
"paramRequired": true
},{
"paramName": "vf",
"paramLongName": "verifyParam",
"paramDescription": "the parameter to be verified to remove the context",
"paramRequired": true
}
]

View File

@ -0,0 +1,159 @@
package eu.dnetlib.dhp.oa.graph.clean;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.Instance;
import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute;
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Locale;
public class CleanContextTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testResultClean() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json")
.getPath();
final String prefix = "gcube ";
spark.read().textFile(sourcePath).map((MapFunction<String, Publication>) r -> OBJECT_MAPPER.readValue(r, Publication.class), Encoders.bean(Publication.class))
.write().json(workingDir.toString() + "/publication");
CleanContextSparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/publication",
"-graphTableClassName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString() + "/working",
"-contextId","sobigdata",
"-verifyParam","gCube "
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(7, tmp.count());
//original result with sobigdata context and gcube as starting string in the main title for the publication
Assertions.assertEquals(0,
tmp.filter(p->p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).collect().get(0).getContext().size());
//original result with sobigdata context without gcube as starting string in the main title for the publication
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).collect().get(0).getContext().size());
Assertions.assertEquals("sobigdata::projects::2",tmp.filter(p->p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).collect().get(0).getContext().get(0).getId() );
//original result with sobigdata context with gcube as starting string in the subtitle
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getContext().size());
Assertions.assertEquals("sobigdata::projects::2",tmp.filter(p->p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getContext().get(0).getId() );
List<StructuredProperty> titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix) );
Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid());
//original result with sobigdata context with gcube not as starting string in the main title
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getContext().size());
Assertions.assertEquals("sobigdata::projects::1",tmp.filter(p->p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getContext().get(0).getId() );
titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix) );
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()) );
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
//original result with sobigdata in context and also other contexts with gcube as starting string for the main title
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getContext().size());
Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getContext().get(0).getId() );
titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getTitle();
Assertions.assertEquals(1, titles.size());
Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix) );
Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid());
//original result with multiple main title one of which whith gcube as starting string and with 2 contextes
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getContext().size());
Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getContext().get(0).getId() );
titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getTitle();
Assertions.assertEquals(2, titles.size());
Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix)) );
//original result without sobigdata in context with gcube as starting string for the main title
Assertions.assertEquals(1,
tmp.filter(p->p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getContext().size());
Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getContext().get(0).getId() );
titles = tmp.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getTitle();
Assertions.assertEquals(2, titles.size());
Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix)));
}
}

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-stats-promote</artifactId>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-stats-update</artifactId>

View File

@@ -12,6 +12,8 @@ and (ri.accessright = 'Open Access'
or ri.accessright = 'Embargo' or ri.accessright = 'Open Source')) tmp
on p.id= tmp.id;
+compute stats indi_pub_green_oa;
create table indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit
from publication p
@@ -22,6 +24,8 @@ join result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id;
+compute stats indi_pub_grey_lit;
create table indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from publication p
@@ -31,6 +35,7 @@ join datasource d on d.id = ri.collectedfrom
where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
on tmp.id=p.id;
+compute stats indi_pub_doi_from_crossref;
---- Sprint 2 ----
create table indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
@@ -40,6 +45,8 @@ join result_licenses as license on license.id = r.id
where lower(license.type) LIKE '%creativecommons.org%' OR lower(license.type) LIKE '%cc-%') tmp
on r.id= tmp.id;
+compute stats indi_result_has_cc_licence;
create table indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from result r
@@ -49,16 +56,21 @@ join result_licenses as license on license.id = r.id
WHERE lower(parse_url(license.type, "HOST")) = "creativecommons.org") tmp
on r.id= tmp.id;
+compute stats indi_result_has_cc_licence_url;
create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract
from publication;
+compute stats indi_pub_has_abstract;
create table indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid
from result r
left outer join (select id, 1 as has_orcid from result_orcid) tmp
on r.id= tmp.id;
+compute stats indi_result_with_orcid;
---- Sprint 3 ----
create table indi_funded_result_with_fundref stored as parquet as
@@ -68,6 +80,8 @@ left outer join (select distinct id, 1 as fundref from project_results
where provenance='Harvested') tmp
on r.id= tmp.id;
+compute stats indi_funded_result_with_fundref;
create table indi_result_org_country_collab stored as parquet as
with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o
@@ -79,6 +93,8 @@ join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country;
+compute stats indi_result_org_country_collab;
create table indi_result_org_collab stored as parquet as
with tmp as
(select o.id, ro.id as result,r.type from organization o
@@ -90,6 +106,8 @@ join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id
group by o1.id, o2.id, o1.type;
+compute stats indi_result_org_collab;
create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op
join organization o on o.id=op.id
@@ -101,6 +119,8 @@ join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country
group by f1.funder, f2.country, f1.country;
+compute stats indi_funder_country_collab;
create table indi_result_country_collab stored as parquet as
with tmp as
(select country, ro.id as result,r.type from organization o
@@ -112,6 +132,8 @@ join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
group by o1.country, o2.country, o1.type;
+compute stats indi_result_country_collab;
---- Sprint 4 ----
create table indi_pub_diamond stored as parquet as
select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
@@ -123,6 +145,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
and (ps.journal_is_in_doaj=true or ps.journal_is_oa=true) and ps.has_apc=false) tmp
on pd.id=tmp.id;
+compute stats indi_pub_diamond;
create table indi_pub_hybrid stored as parquet as
select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid
from publication_datasources pd
@@ -133,6 +157,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp
on pd.id=tmp.id;
+compute stats indi_pub_hybrid;
create table indi_pub_in_transformative stored as parquet as
select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from publication pd
@@ -143,6 +169,8 @@ join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_onli
and ps.is_transformative_journal=true) tmp
on pd.id=tmp.id;
+compute stats indi_pub_in_transformative;
create table indi_pub_closed_other_open stored as parquet as
select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open from result_instance ri
left outer join
@@ -153,11 +181,12 @@ where d.type like '%Journal%' and ri.accessright='Closed Access' and
(p.bestlicence='Open Access' or p.bestlicence='Open Source')) tmp
on tmp.id=ri.id;
+compute stats indi_pub_closed_other_open;
---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id;
+compute stats indi_result_no_of_copies;
---- Sprint 6 ----
create table indi_pub_gold_oa stored as parquet as
WITH gold_oa AS (
@@ -183,6 +212,8 @@ LEFT OUTER JOIN (
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
+compute stats indi_pub_gold_oa;
create table indi_datasets_gold_oa stored as parquet as
WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
@@ -210,6 +241,8 @@ LEFT OUTER JOIN (
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
+compute stats indi_datasets_gold_oa;
create table indi_software_gold_oa stored as parquet as
WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
@@ -237,6 +270,8 @@ LEFT OUTER JOIN (
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
+compute stats indi_software_gold_oa;
create table indi_org_findable stored as parquet as
with result_with_pid as (
select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
@@ -263,6 +298,8 @@ join result_with_pid_share on result_with_pid_share.organization=allresults.orga
left outer join (
select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
+compute stats indi_org_findable;
create table indi_org_openess stored as parquet as
WITH datasets_oa as (
SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
@@ -313,6 +350,8 @@ left outer join (
left outer join (
select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
+compute stats indi_org_openess;
create table indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
@@ -343,6 +382,8 @@ LEFT OUTER JOIN (
JOIN indi_result_has_cc_licence cc on pd.id=cc.id
where cc.has_cc_license=1) tmp on pd.id=tmp.id;
+compute stats indi_pub_hybrid_oa_with_cc;
create table indi_pub_downloads stored as parquet as
SELECT result_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats
join publication on result_id=id
@@ -350,6 +391,8 @@ where downloads>0
GROUP BY result_id
order by no_dowloads desc;
+compute stats indi_pub_downloads;
create table indi_pub_downloads_datasource stored as parquet as
SELECT result_id, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats
join publication on result_id=id
@@ -357,15 +400,21 @@ where downloads>0
GROUP BY result_id, repository_id
order by result_id;
+compute stats indi_pub_downloads_datasource;
create table indi_pub_downloads_year stored as parquet as
SELECT result_id, substring(us.`date`, 1,4) as `year`, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id where downloads>0
GROUP BY result_id, `year`
order by `year` asc;
+compute stats indi_pub_downloads_year;
create table indi_pub_downloads_datasource_year stored as parquet as
SELECT result_id, substring(us.`date`, 1,4) as `year`, repository_id, sum(downloads) no_dowloads from openaire_prod_usage_stats.usage_stats us
join publication on result_id=id
where downloads>0
GROUP BY result_id, repository_id, `year`
order by `year` asc, result_id;
+compute stats indi_pub_downloads_datasource_year;

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-usage-raw-data-update</artifactId>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dhp-usage-stats-build</artifactId>

View File

@@ -3,7 +3,7 @@
    <parent>
        <artifactId>dhp-workflows</artifactId>
        <groupId>eu.dnetlib.dhp</groupId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

View File

@@ -6,7 +6,7 @@
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
-       <version>1.2.4-SNAPSHOT</version>
+       <version>1.2.5-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

View File

@@ -3,7 +3,7 @@
    <modelVersion>4.0.0</modelVersion>
    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp</artifactId>
-   <version>1.2.4-SNAPSHOT</version>
+   <version>1.2.5-SNAPSHOT</version>
    <packaging>pom</packaging>
    <licenses>
@@ -551,6 +551,9 @@
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-site-plugin</artifactId>
                <version>3.9.1</version>
+               <configuration>
+                   <skip>${dhp.site.skip}</skip>
+               </configuration>
            </plugin>
            <plugin>
@@ -791,6 +794,7 @@
        <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
        <dhp.jackson.version>2.9.6</dhp.jackson.version>
        <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
+       <dhp.site.skip>true</dhp.site.skip>
        <dhp.guava.version>11.0.2</dhp.guava.version>
        <scala.version>2.11.12</scala.version>
        <junit-jupiter.version>5.6.1</junit-jupiter.version>