StatsDB workflow to export actionsets about OA routes, diamond, and publicly-funded results #355

Merged
claudio.atzori merged 4 commits from dimitris.pierrakos/dnet-hadoop:beta into beta 2023-12-01 15:03:58 +01:00
11 changed files with 835 additions and 0 deletions
Showing only changes of commit 89c4dfbaf4

View File: dhp-workflows/dhp-stats-actionsets/pom.xml

@@ -0,0 +1,114 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-stats-actionsets</artifactId>
<!-- <build>-->
dimitris.pierrakos marked this conversation as resolved
Review

If this block is commented out and it is not needed, then please remove it
<!-- <plugins>-->
<!-- <plugin>-->
<!-- <groupId>net.alchim31.maven</groupId>-->
<!-- <artifactId>scala-maven-plugin</artifactId>-->
<!-- <version>${net.alchim31.maven.version}</version>-->
<!-- <executions>-->
<!-- <execution>-->
<!-- <id>scala-compile-first</id>-->
<!-- <phase>initialize</phase>-->
<!-- <goals>-->
<!-- <goal>add-source</goal>-->
<!-- <goal>compile</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- <execution>-->
<!-- <id>scala-test-compile</id>-->
<!-- <phase>process-test-resources</phase>-->
<!-- <goals>-->
<!-- <goal>testCompile</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- <execution>-->
<!-- <id>scala-doc</id>-->
<!-- <phase>process-resources</phase> &lt;!&ndash; or wherever &ndash;&gt;-->
<!-- <goals>-->
<!-- <goal>doc</goal>-->
<!-- </goals>-->
<!-- </execution>-->
<!-- </executions>-->
<!-- <configuration>-->
<!-- <scalaVersion>${scala.version}</scalaVersion>-->
<!-- </configuration>-->
<!-- </plugin>-->
<!-- </plugins>-->
<!-- </build>-->
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-compress -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
</dependencies>
</project>

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java

@@ -0,0 +1,102 @@
package eu.dnetlib.dhp.actionmanager;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
public static final String DOI = "doi";
public static final String DOI_CLASSNAME = "Digital Object Identifier";
public static final String DEFAULT_DELIMITER = ",";
public static final String DEFAULT_FOS_DELIMITER = "\t";
public static final String UPDATE_DATA_INFO_TYPE = "update";
// public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
dimitris.pierrakos marked this conversation as resolved (Outdated)

Consider also to remove the commented out code lines. They just pollute the code readability.
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
// public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
// public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
// public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts";
// public static final String UPDATE_KEY_USAGE_COUNTS = "count";
public static final String UPDATE_MEASURE_STATS_MODEL_CLASS_ID = "measure:stats_model";
public static final String UPDATE_KEY_STATS_MODEL = "stats_model";
// public static final String UPDATE_MEASURE_PUBLICLY_FUNDED_CLASS_ID = "measure:publicly_funded";
// public static final String UPDATE_KEY_PUBLICLY_FUNDED = "publicly_funded";
// public static final String FOS_CLASS_ID = "FOS";
// public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
//
// public static final String SDG_CLASS_ID = "SDG";
// public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
public static final String NULL = "NULL";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {
}
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
dimitris.pierrakos marked this conversation as resolved (Outdated)

I cannot see any usage in the codebase for this method. Consider to remove it.
return Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
}
public static <R> Dataset<R> readPath(
dimitris.pierrakos marked this conversation as resolved (Outdated)

I cannot see any usage in the codebase for this method. Consider to remove it.
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static Subject getSubject(String sbj, String classid, String classname,
dimitris.pierrakos marked this conversation as resolved (Outdated)

I cannot see any usage in the codebase for this method. Consider to remove it.
String diqualifierclassid) {
if (sbj == null || sbj.equals(NULL))
return null;
Subject s = new Subject();
s.setValue(sbj);
s
.setQualifier(
OafMapperUtils
.qualifier(
classid,
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
s
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
diqualifierclassid,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return s;
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/stats_actionsets/StatsAtomicActionsJob.java

@@ -0,0 +1,286 @@
package eu.dnetlib.dhp.actionmanager.stats_actionsets;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* Creates the atomic actions for each type of result
*/
public class StatsAtomicActionsJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(StatsAtomicActionsJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
StatsAtomicActionsJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/stats_actionsets/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
final String dbname = parser.get("statsDB");
final String workingPath = parser.get("workingPath");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareGreenData(dbname, spark, workingPath + "/greenOADB", "indi_pub_green_oa", "id");
prepareDiamondData(dbname, spark, workingPath + "/diamondOADΒ", "indi_pub_diamond", "id");
preparePubliclyFundedData(
dbname, spark, workingPath + "/publiclyFundedDΒ", "indi_funded_result_with_fundref", "id");
prepareOAColourData(dbname, spark, workingPath + "/oacolourDB", "", "id");
writeActionSet(spark, workingPath, outputPath);
});
}
private static void prepareGreenData(String dbname, SparkSession spark, String workingPath, String tableName,
String resultAttributeName) {
spark
.sql(
String
.format(
"select %s as id, green_oa as green_oa " +
"from %s.%s",
resultAttributeName, dbname, tableName))
.as(Encoders.bean(StatsGreenOAModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
private static void prepareDiamondData(String dbname, SparkSession spark, String workingPath, String tableName,
String resultAttributeName) {
spark
.sql(
String
.format(
"select %s as id, in_diamond_journal as in_diamond_journal " +
"from %s.%s",
resultAttributeName, dbname, tableName))
.as(Encoders.bean(StatsDiamondOAModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
private static void preparePubliclyFundedData(String dbname, SparkSession spark, String workingPath,
String tableName,
String resultAttributeName) {
spark
.sql(
String
.format(
"select %s as id, fundref as publicly_funded " +
"from %s.%s",
resultAttributeName, dbname, tableName))
.as(Encoders.bean(StatsPubliclyFundedModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
private static void prepareOAColourData(String dbname, SparkSession spark, String workingPath, String tableName,
String resultAttributeName) {
spark
.sql(
String
.format(
"select b.%s as id, is_gold, is_bronze_oa, is_hybrid from %s.indi_pub_bronze_oa b " +
"left outer join %s.indi_pub_gold_oa g on g.id=b.id " +
dimitris.pierrakos marked this conversation as resolved (Outdated)

Please check the chain of the conditions, it seems the `usm.isIs_gold()` is checked twice, leading to different then blocks, one sets the OpenAccess color to bronze, the other to gold, which seems counter intuitive. I assume it might be not what you were expecting to write.
"left outer join %s.indi_pub_hybrid h on b.id=h.id",
resultAttributeName, dbname, dbname, dbname))
.as(Encoders.bean(StatsOAColourModel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
}
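For illustration, a mutually exclusive chain along the lines of the review comment above could look like the following hypothetical sketch; the flagged condition chain is not part of this commit view, and the method name is invented:

// Hypothetical, mutually exclusive OA colour resolution; not part of the original diff.
private static String resolveOAColour(StatsOAColourModel usm) {
	if (usm.isIs_gold()) {
		return "gold";
	} else if (usm.isIs_hybrid()) {
		return "hybrid";
	} else if (usm.isIs_bronze_oa()) {
		return "bronze";
	}
	return "closed"; // assumption: no colour flag set
}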
public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {
getFinalIndicatorsGreenResult(spark, inputPath + "/greenOADB")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.union(
getFinalIndicatorsDiamondResult(spark, inputPath + "/diamondOADΒ")
dimitris.pierrakos marked this conversation as resolved
Review

Gitea warns me about the character `B` used in the string `"/diamondOADΒ"`:

    Β [U+0392] is confusable with B [U+0042]

Any chance that it is a typo from a different keyboard setting?
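// Illustration of the warning above (not part of the original diff): the Latin and
// Greek capitals render identically but are distinct code points, so the two paths
// never match: "diamondOADB".equals("diamondOAD\u0392") evaluates to false (Β is U+0392).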
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.union(
getFinalIndicatorsPubliclyFundedResult(spark, inputPath + "/publiclyFundedDΒ")
dimitris.pierrakos marked this conversation as resolved (Outdated)

The same as above, Gitea warns me about the character `B` used in the string `"/publiclyFundedDΒ"`:

    Β [U+0392] is confusable with B [U+0042]

Any chance that it is a typo from a different keyboard setting?
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.union(
getFinalIndicatorsOAColourResult(spark, inputPath + "/oacolourDB")
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p)))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
dimitris.pierrakos marked this conversation as resolved (Outdated)

Consider to store the output as compressed using the `org.apache.hadoop.io.compress.GzipCodec`, as in:

    .saveAsHadoopFile(
        outputPath,
        Text.class,
        Text.class,
        SequenceFileOutputFormat.class,
        GzipCodec.class);
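// Sketch of the reviewer's suggestion: add
//     import org.apache.hadoop.io.compress.GzipCodec;
// and pass GzipCodec.class as the fifth argument to saveAsHadoopFile above.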
}
public static Measure newMeasureInstance(String id) {
Measure m = new Measure();
m.setId(id);
m.setUnit(new ArrayList<>());
return m;
}
private static Dataset<Result> getFinalIndicatorsGreenResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, StatsGreenOAModel.class)
.map((MapFunction<StatsGreenOAModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasure(usm.isGreen_oa(), "green_oa"));
return r;
}, Encoders.bean(Result.class));
}
private static Dataset<Result> getFinalIndicatorsDiamondResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, StatsDiamondOAModel.class)
.map((MapFunction<StatsDiamondOAModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasure(usm.isIn_diamond_journal(), "in_diamond_journal"));
return r;
}, Encoders.bean(Result.class));
}
private static Dataset<Result> getFinalIndicatorsPubliclyFundedResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, StatsPubliclyFundedModel.class)
.map((MapFunction<StatsPubliclyFundedModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasure(usm.isPublicly_funded(), "publicly_funded"));
return r;
}, Encoders.bean(Result.class));
}
private static Dataset<Result> getFinalIndicatorsOAColourResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, StatsOAColourModel.class)
.map((MapFunction<StatsOAColourModel, Result>) usm -> {
Result r = new Result();
r.setId("50|" + usm.getId());
r.setMeasures(getMeasureOAColour(usm.isIs_gold(), usm.isIs_bronze_oa(), usm.isIs_hybrid()));
return r;
}, Encoders.bean(Result.class));
}
private static List<Measure> getMeasure(Boolean is_model_oa, String model_type) {
dimitris.pierrakos marked this conversation as resolved (Outdated)

As agreed during the last meeting, the information about `isGreen`, `openAccessColor`, `isInDiamondJournal`, `publiclyFunded` is included at the result level in a recent release of the `dhp-schema` module, version `4.17.1`. Those fields are available at the result level and should be used instead of the generic measures, which have a different purpose.
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_STATS_MODEL_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"");
return Arrays
.asList(
OafMapperUtils
.newMeasureInstance(model_type, String.valueOf(is_model_oa), UPDATE_KEY_STATS_MODEL, dataInfo));
}
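Following the review comment above, a hedged sketch of how the green OA indicator could be written to the result-level field instead of a generic measure; the setter name `setIsGreen` is assumed from the field names quoted in the comment, not verified against dhp-schemas 4.17.1:

// Hypothetical rewrite using the result-level field; setter name is assumed.
private static Dataset<Result> getGreenResultWithSchemaField(SparkSession spark, String inputPath) {
	return readPath(spark, inputPath, StatsGreenOAModel.class)
		.map((MapFunction<StatsGreenOAModel, Result>) usm -> {
			Result r = new Result();
			r.setId("50|" + usm.getId());
			r.setIsGreen(usm.isGreen_oa()); // assumed setter from dhp-schemas 4.17.1
			return r;
		}, Encoders.bean(Result.class));
}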
private static List<Measure> getMeasureOAColour(Boolean is_gold, Boolean is_bronze_oa, Boolean is_hybrid) {
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_STATS_MODEL_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"");
return Arrays
.asList(
OafMapperUtils
.newMeasureInstance("is_gold", String.valueOf(is_gold), UPDATE_KEY_STATS_MODEL, dataInfo),
OafMapperUtils
.newMeasureInstance("is_bronze_oa", String.valueOf(is_bronze_oa), UPDATE_KEY_STATS_MODEL, dataInfo),
OafMapperUtils
.newMeasureInstance("is_hybrid", String.valueOf(is_hybrid), UPDATE_KEY_STATS_MODEL, dataInfo));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/stats_actionsets/StatsDiamondOAModel.java

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.actionmanager.stats_actionsets;
import java.io.Serializable;
/**
* @author dimitris.pierrakos
* @Date 30/10/23
*/
public class StatsDiamondOAModel implements Serializable {
private String id;
private boolean in_diamond_journal;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public boolean isIn_diamond_journal() {
return in_diamond_journal;
}
public void setIn_diamond_journal(boolean in_diamond_journal) {
this.in_diamond_journal = in_diamond_journal;
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/stats_actionsets/StatsGreenOAModel.java

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.actionmanager.stats_actionsets;
import java.io.Serializable;
/**
* @author dimitris.pierrakos
* @Date 30/10/23
*/
public class StatsGreenOAModel implements Serializable {
private String id;
private boolean green_oa;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public boolean isGreen_oa() {
return green_oa;
}
public void setGreen_oa(boolean green_oa) {
this.green_oa = green_oa;
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/stats_actionsets/StatsOAColourModel.java

@@ -0,0 +1,47 @@
package eu.dnetlib.dhp.actionmanager.stats_actionsets;
import java.io.Serializable;
/**
* @author dimitris.pierrakos
* @Date 30/10/23
*/
public class StatsOAColourModel implements Serializable {
private String id;
private boolean is_gold;
private boolean is_bronze_oa;
private boolean is_hybrid;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public boolean isIs_gold() {
return is_gold;
}
public void setIs_gold(boolean is_gold) {
this.is_gold = is_gold;
}
public boolean isIs_bronze_oa() {
return is_bronze_oa;
}
public void setIs_bronze_oa(boolean is_bronze_oa) {
this.is_bronze_oa = is_bronze_oa;
}
public boolean isIs_hybrid() {
return is_hybrid;
}
public void setIs_hybrid(boolean is_hybrid) {
this.is_hybrid = is_hybrid;
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/java/eu/dnetlib/dhp/actionmanager/stats_actionsets/StatsPubliclyFundedModel.java

@@ -0,0 +1,29 @@
package eu.dnetlib.dhp.actionmanager.stats_actionsets;
import java.io.Serializable;
/**
* @author dimitris.pierrakos
* @Date 30/10/23
*/
public class StatsPubliclyFundedModel implements Serializable {
private String id;
private boolean publicly_funded;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public boolean isPublicly_funded() {
return publicly_funded;
}
public void setPublicly_funded(boolean publicly_funded) {
this.publicly_funded = publicly_funded;
}
}

View File: dhp-workflows/dhp-stats-actionsets/src/main/resources/eu/dnetlib/dhp/actionmanager/stats_actionsets/input_actionset_parameter.json

@@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "hmu",
"paramLongName": "hive_metastore_uris",
"paramDescription": "the URI for the hive metastore",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "sdb",
"paramLongName": "statsDB",
"paramDescription": "the name of the stats db to be used",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the workingPath where to save the content of the usage_stats table",
"paramRequired": true
}
]
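For reference, a sketch of an argument vector matching these definitions; all values below are placeholders, not taken from a real deployment:

// Placeholder values only; the metastore URI, paths and db name are illustrative.
String[] args = {
	"--isSparkSessionManaged", "false",
	"--hive_metastore_uris", "thrift://example-metastore:9083",
	"--outputPath", "/path/to/actionSet",
	"--statsDB", "openaire_stats",
	"--workingPath", "/path/to/workingDir"
};
StatsAtomicActionsJob.main(args);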

View File: dhp-workflows/dhp-stats-actionsets/src/main/resources/eu/dnetlib/dhp/actionmanager/stats_actionsets/oozie_app/config-default.xml

@@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File: dhp-workflows/dhp-stats-actionsets/src/main/resources/eu/dnetlib/dhp/actionmanager/stats_actionsets/oozie_app/workflow.xml

@@ -0,0 +1,125 @@
<workflow-app name="StatsActionSets" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>statsDB</name>
<description>the name of the stats db to be used</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="atomicactionsStats"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="atomicactions">
dimitris.pierrakos marked this conversation as resolved
Review

If I well understand from the spark action name, this action would produce an actionset containing the usage stats metrics (views / downloads). However, an oozie workflow responsible for exporting such information is already available under

    dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/workflow.xml

Why would we need to duplicate it? Can you elaborate?

Furthermore, I see the `<class>` tag points to `eu.dnetlib.dhp.actionmanager.stats_actionsets.SparkAtomicActionGreenOAJob` which does not exist in the classpath.
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the usage stats count for results</name>
<!-- <class>eu.dnetlib.dhp.actionmanager.stats_actionsets.SparkAtomicActionUsageJob</class>-->
<class>eu.dnetlib.dhp.actionmanager.stats_actionsets.SparkAtomicActionGreenOAJob</class>
<jar>dhp-stats-actionsets-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--statsDB</arg><arg>${statsDB}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="atomicactionsStats">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the stats green_oa for results</name>
<!-- <class>eu.dnetlib.dhp.actionmanager.stats_actionsets.SparkAtomicActionUsageJob</class>-->
<class>eu.dnetlib.dhp.actionmanager.stats_actionsets.StatsAtomicActionsJob</class>
<jar>dhp-stats-actionsets-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--hive_metastore_uris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--statsDB</arg><arg>${statsDB}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File: dhp-workflows/dhp-stats-actionsets/src/main/resources/log4j.properties

@@ -0,0 +1,12 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.logger.org.apache.spark=FATAL
log4j.logger.org.spark_project=FATAL