dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/usagestats/SparkAtomicActionUsageJob.java


package eu.dnetlib.dhp.actionmanager.usagestats;

import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.MainEntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;

/**
 * created the Atomic Action for each type of results
 */
public class SparkAtomicActionUsageJob implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static <I extends Result> void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				SparkAtomicActionUsageJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath {}: ", outputPath);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

		final String dbname = parser.get("usagestatsdb");

		final String workingPath = parser.get("workingPath");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeOutputDir(spark, outputPath);
				prepareData(dbname, spark, workingPath + "/usageDb", "usage_stats", "result_id");
				prepareData(dbname, spark, workingPath + "/projectDb", "project_stats", "id");
				prepareData(dbname, spark, workingPath + "/datasourceDb", "datasource_stats", "repositor_id");
				writeActionSet(spark, workingPath, outputPath);
			});
	}

	private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName, String attribute_name) {
		spark
				.sql(
						"Select " + attribute_name + " as id, downloads, views " +
								"from " + dbname + "." + tableName)
				.as(Encoders.bean(UsageStatsModel.class))
				.write()
				.mode(SaveMode.Overwrite)
				.option("compression", "gzip")
				.json(workingPath);
	}


	public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {
		getFinalIndicatorsResult(spark, inputPath+ "/usageDb").
		toJavaRDD().
				map(p -> new AtomicAction(p.getClass(),p))
						.union(getFinalIndicatorsProject(spark, inputPath + "/projectDb")
								.toJavaRDD()
								.map(p -> new AtomicAction(p.getClass(), p )))
								.union(getFinalIndicatorsDatasource(spark, inputPath + "/datasourceDb")
										.toJavaRDD()
										.map(p -> new AtomicAction(p.getClass(), p)))
			.mapToPair(
				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);

	}

	private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {

		return getUsageStatsModelDataset(spark, inputPath)
				.map((MapFunction<UsageStatsModel, Result>) usm -> {
					Result r = new Result();
					r.setId("50|" + usm.getId());
					r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
					return r;
				}, Encoders.bean(Result.class));
	}

	private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {

		return getUsageStatsModelDataset(spark, inputPath)
				.map((MapFunction<UsageStatsModel, Project>) usm -> {
					Project r = new Project();
					r.setId("40|" + usm.getId());
					r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
					return r;
				}, Encoders.bean(Project.class));
	}

	private static Dataset<Datasource> getFinalIndicatorsDatasource(SparkSession spark, String inputPath) {

		return getUsageStatsModelDataset(spark, inputPath)
				.map((MapFunction<UsageStatsModel, Datasource>) usm -> {
					Datasource r = new Datasource();
					r.setId("10|" + usm.getId());
					r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));
					return r;
				}, Encoders.bean(Datasource.class));
	}

	private static Dataset<UsageStatsModel> getUsageStatsModelDataset(SparkSession spark, String inputPath) {
		return readPath(spark, inputPath, UsageStatsModel.class)
				.groupByKey((MapFunction<UsageStatsModel, String>) us -> us.getId(), Encoders.STRING())
				.mapGroups((MapGroupsFunction<String, UsageStatsModel, UsageStatsModel>) (k, it) -> {
					UsageStatsModel first = it.next();
					it.forEachRemaining(us -> {
						first.setDownloads(first.getDownloads() + us.getDownloads());
						first.setViews(first.getViews() + us.getViews());
					});
					first.setId(k);
					return first;

				}, Encoders.bean(UsageStatsModel.class));
	}

	private static List<Measure> getMeasure(Long downloads, Long views) {
		DataInfo dataInfo = OafMapperUtils
			.dataInfo(
				false,
				UPDATE_DATA_INFO_TYPE,
				true,
				false,
				OafMapperUtils
					.qualifier(
						UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,
						UPDATE_CLASS_NAME,
						ModelConstants.DNET_PROVENANCE_ACTIONS,
						ModelConstants.DNET_PROVENANCE_ACTIONS),
				"");

		return Arrays
			.asList(
				OafMapperUtils
					.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),
				OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));

	}

	private static void removeOutputDir(SparkSession spark, String path) {
		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
	}

	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

}
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00
			`package eu.dnetlib.dhp.actionmanager.usagestats;`

			`import static eu.dnetlib.dhp.actionmanager.Constants.*;`
			`import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;`

			`import java.io.Serializable;`
			`import java.util.Arrays;`
			`import java.util.List;`
			`import java.util.Optional;`

[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00			`import eu.dnetlib.dhp.schema.common.MainEntityType;`
			`import eu.dnetlib.dhp.schema.oaf.*;`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`import org.apache.commons.io.IOUtils;`
[UsageCount] make it as an action set as it should be, plus changed the test to make them work as well now 2022-05-09 12:51:35 +02:00			`import org.apache.hadoop.io.Text;`
			`import org.apache.hadoop.mapred.SequenceFileOutputFormat;`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`import org.apache.spark.SparkConf;`
			`import org.apache.spark.api.java.function.MapFunction;`
			`import org.apache.spark.api.java.function.MapGroupsFunction;`
			`import org.apache.spark.sql.Dataset;`
			`import org.apache.spark.sql.Encoders;`
			`import org.apache.spark.sql.SaveMode;`
			`import org.apache.spark.sql.SparkSession;`
			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`

			`import com.fasterxml.jackson.databind.ObjectMapper;`

			`import eu.dnetlib.dhp.application.ArgumentApplicationParser;`
			`import eu.dnetlib.dhp.common.HdfsSupport;`
[UsageCount] refactoring 2022-05-09 14:43:27 +02:00			`import eu.dnetlib.dhp.schema.action.AtomicAction;`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`import eu.dnetlib.dhp.schema.common.ModelConstants;`
			`import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;`
[UsageCount] make it as an action set as it should be, plus changed the test to make them work as well now 2022-05-09 12:51:35 +02:00			`import scala.Tuple2;`
[Measures] put the logic in common, no need to change the schema 2022-04-21 11:27:26 +02:00
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`/**`
[Measures] removed typo 2022-04-21 12:14:03 +02:00			`* created the Atomic Action for each type of results`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`*/`
			`public class SparkAtomicActionUsageJob implements Serializable {`

			`private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionUsageJob.class);`
			`private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();`

			`public static <I extends Result> void main(String[] args) throws Exception {`

			`String jsonConfiguration = IOUtils`
			`.toString(`
			`SparkAtomicActionUsageJob.class`
			`.getResourceAsStream(`
			`"/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json"));`

			`final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);`

			`parser.parseArgument(args);`

			`Boolean isSparkSessionManaged = Optional`
			`.ofNullable(parser.get("isSparkSessionManaged"))`
			`.map(Boolean::valueOf)`
			`.orElse(Boolean.TRUE);`

			`log.info("isSparkSessionManaged: {}", isSparkSessionManaged);`

			`final String outputPath = parser.get("outputPath");`
			`log.info("outputPath {}: ", outputPath);`

			`SparkConf conf = new SparkConf();`
			`conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));`

[Measures] addressed comments in the PR 2022-04-21 12:09:37 +02:00			`final String dbname = parser.get("usagestatsdb");`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00
			`final String workingPath = parser.get("workingPath");`

			`runWithSparkHiveSession(`
			`conf,`
			`isSparkSessionManaged,`
			`spark -> {`
			`removeOutputDir(spark, outputPath);`
[UsageCount] fixed query 2023-02-10 15:50:56 +01:00			`prepareData(dbname, spark, workingPath + "/usageDb", "usage_stats", "result_id");`
			`prepareData(dbname, spark, workingPath + "/projectDb", "project_stats", "id");`
			`prepareData(dbname, spark, workingPath + "/datasourceDb", "datasource_stats", "repositor_id");`
[UsageCount] make it as an action set as it should be, plus changed the test to make them work as well now 2022-05-09 12:51:35 +02:00			`writeActionSet(spark, workingPath, outputPath);`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`});`
			`}`

[UsageCount] fixed query 2023-02-10 15:50:56 +01:00			`private static void prepareData(String dbname, SparkSession spark, String workingPath, String tableName, String attribute_name) {`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`spark`
[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00			`.sql(`
[UsageCount] fixed query 2023-02-10 15:50:56 +01:00			`"Select " + attribute_name + " as id, downloads, views " +`
[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00			`"from " + dbname + "." + tableName)`
			`.as(Encoders.bean(UsageStatsModel.class))`
			`.write()`
			`.mode(SaveMode.Overwrite)`
			`.option("compression", "gzip")`
			`.json(workingPath);`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`}`

[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00

[UsageCount] make it as an action set as it should be, plus changed the test to make them work as well now 2022-05-09 12:51:35 +02:00			`public static void writeActionSet(SparkSession spark, String inputPath, String outputPath) {`
[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00			`getFinalIndicatorsResult(spark, inputPath+ "/usageDb").`
			`toJavaRDD().`
			`map(p -> new AtomicAction(p.getClass(),p))`
			`.union(getFinalIndicatorsProject(spark, inputPath + "/projectDb")`
			`.toJavaRDD()`
			`.map(p -> new AtomicAction(p.getClass(), p )))`
			`.union(getFinalIndicatorsDatasource(spark, inputPath + "/datasourceDb")`
			`.toJavaRDD()`
			`.map(p -> new AtomicAction(p.getClass(), p)))`
[UsageCount] refactoring 2022-05-09 14:43:27 +02:00			`.mapToPair(`
			`aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),`
			`new Text(OBJECT_MAPPER.writeValueAsString(aa))))`
			`.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);`
[UsageCount] make it as an action set as it should be, plus changed the test to make them work as well now 2022-05-09 12:51:35 +02:00
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`}`

[UsageCount] addition of usagecount for Projects and datasources. Extention of the action set created for the results with new entities for projects and datasources. Extention of the resource set and modification of the testing class 2023-02-09 18:59:45 +01:00			`private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {`

			`return getUsageStatsModelDataset(spark, inputPath)`
			`.map((MapFunction<UsageStatsModel, Result>) usm -> {`
			`Result r = new Result();`
			`r.setId("50\|" + usm.getId());`
			`r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));`
			`return r;`
			`}, Encoders.bean(Result.class));`
			`}`

			`private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {`

			`return getUsageStatsModelDataset(spark, inputPath)`
			`.map((MapFunction<UsageStatsModel, Project>) usm -> {`
			`Project r = new Project();`
			`r.setId("40\|" + usm.getId());`
			`r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));`
			`return r;`
			`}, Encoders.bean(Project.class));`
			`}`

			`private static Dataset<Datasource> getFinalIndicatorsDatasource(SparkSession spark, String inputPath) {`

			`return getUsageStatsModelDataset(spark, inputPath)`
			`.map((MapFunction<UsageStatsModel, Datasource>) usm -> {`
			`Datasource r = new Datasource();`
			`r.setId("10\|" + usm.getId());`
			`r.setMeasures(getMeasure(usm.getDownloads(), usm.getViews()));`
			`return r;`
			`}, Encoders.bean(Datasource.class));`
			`}`

			`private static Dataset<UsageStatsModel> getUsageStatsModelDataset(SparkSession spark, String inputPath) {`
			`return readPath(spark, inputPath, UsageStatsModel.class)`
			`.groupByKey((MapFunction<UsageStatsModel, String>) us -> us.getId(), Encoders.STRING())`
			`.mapGroups((MapGroupsFunction<String, UsageStatsModel, UsageStatsModel>) (k, it) -> {`
			`UsageStatsModel first = it.next();`
			`it.forEachRemaining(us -> {`
			`first.setDownloads(first.getDownloads() + us.getDownloads());`
			`first.setViews(first.getViews() + us.getViews());`
			`});`
			`first.setId(k);`
			`return first;`

			`}, Encoders.bean(UsageStatsModel.class));`
			`}`

[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00			`private static List<Measure> getMeasure(Long downloads, Long views) {`
			`DataInfo dataInfo = OafMapperUtils`
			`.dataInfo(`
			`false,`
			`UPDATE_DATA_INFO_TYPE,`
			`true,`
			`false,`
			`OafMapperUtils`
			`.qualifier(`
			`UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID,`
			`UPDATE_CLASS_NAME,`
			`ModelConstants.DNET_PROVENANCE_ACTIONS,`
			`ModelConstants.DNET_PROVENANCE_ACTIONS),`
			`"");`

			`return Arrays`
			`.asList(`
[Measures] addressed comments in the PR 2022-04-21 12:09:37 +02:00			`OafMapperUtils`
			`.newMeasureInstance("downloads", String.valueOf(downloads), UPDATE_KEY_USAGE_COUNTS, dataInfo),`
			`OafMapperUtils.newMeasureInstance("views", String.valueOf(views), UPDATE_KEY_USAGE_COUNTS, dataInfo));`
[Measures] added new measure (usagecounts) as action set. Measure added at the level of the result. Ref #7587 2022-04-20 14:02:05 +02:00
			`}`

			`private static void removeOutputDir(SparkSession spark, String path) {`
			`HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());`
			`}`

			`public static <R> Dataset<R> readPath(`
			`SparkSession spark, String inputPath, Class<R> clazz) {`
			`return spark`
			`.read()`
			`.textFile(inputPath)`
			`.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));`
			`}`

			`}`