dnet-hadoop/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociati...


package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;

/**
 * First phase of the data preparation for the propagation of author identifiers (e.g. ORCID) to results
 * through semantic relations.
 * <p>
 * For one result type the job:
 * <ol>
 * <li>reads the results from {@code <sourcePath>/<resultType>}, keeps those that are neither deleted by
 * inference nor invisible and that have at least one author carrying a pid of an allowed type, and stores
 * them under {@code <workingPath>/resultSubset/<resultType>};</li>
 * <li>joins that subset with the relations found under {@code <workingPath>/relationSubset} on
 * {@code result.id == relation.source};</li>
 * <li>for every matching pair writes a {@link ResultOrcidList} whose result id is the relation target and
 * whose author list holds one entry per allowed pid of the source result authors, under
 * {@code <workingPath>/<resultType>}.</li>
 * </ol>
 * All outputs are gzip-compressed JSON written in overwrite mode.
 */
public class PrepareResultOrcidAssociationStep1 {
	private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);

	/**
	 * Job entry point. The accepted arguments are described by the
	 * {@code input_prepareorcidtoresult_parameters.json} resource; the ones read here are
	 * {@code sourcePath}, {@code workingPath}, {@code resultTableName}, {@code allowedsemrels} and
	 * {@code allowedpids} (the last two are {@code ;}-separated lists).
	 *
	 * @param args the command line arguments
	 * @throws Exception if the arguments cannot be parsed, the result class cannot be loaded or is not a
	 *                   {@link Result} subclass, or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConf = IOUtils
			.toString(
				PrepareResultOrcidAssociationStep1.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("workingPath");
		log.info("outputPath: {}", outputPath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final List<String> allowedsemrel = splitToLowerCase(parser.get("allowedsemrels"));
		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

		// pid class ids are lower-cased before being looked up in this list, hence the configured values
		// must be lower-cased as well, otherwise an entry containing upper-case letters could never match
		final List<String> allowedPids = splitToLowerCase(parser.get("allowedpids"));
		log.info("allowedPids: {}", new Gson().toJson(allowedPids));

		final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
		log.info("resultType: {}", resultType);

		// asSubclass fails fast with a ClassCastException when the configured class is not a Result
		Class<? extends Result> resultClazz = Class.forName(resultClassName).asSubclass(Result.class);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> prepareInfo(
				spark, inputPath, outputPath, resultType, resultClazz, allowedsemrel, allowedPids));
	}

	/**
	 * Splits a {@code ;}-separated value and lower-cases each token.
	 *
	 * @param value the raw {@code ;}-separated value
	 * @return the lower-cased tokens, in their original order
	 */
	private static List<String> splitToLowerCase(String value) {
		return Arrays
			.stream(value.split(";"))
			.map(String::toLowerCase)
			.collect(Collectors.toList());
	}

	/**
	 * Selects the results having authors with allowed pids and associates those authors to the targets of
	 * the relations starting from the selected results.
	 *
	 * @param spark         the Spark session
	 * @param inputPath     the base path of the graph; results are read from {@code inputPath/resultType}
	 * @param outputPath    the working path: relations are read from {@code outputPath/relationSubset}, the
	 *                      selected results are written to {@code outputPath/resultSubset/resultType} and the
	 *                      final associations to {@code outputPath/resultType}
	 * @param resultType    the lower-cased simple name of the result class
	 * @param resultClazz   the class of the results to read
	 * @param allowedsemrel the allowed semantic relations; currently not used by this method
	 * @param allowedPids   the lower-cased pid types to be considered
	 * @param <R>           the concrete result type
	 */
	private static <R extends Result> void prepareInfo(
		SparkSession spark,
		String inputPath,
		String outputPath,
		String resultType,
		Class<R> resultClazz,
		List<String> allowedsemrel,
		List<String> allowedPids) {

		final String inputResultPath = inputPath + "/" + resultType;

		Dataset<Relation> relation = readPath(spark, outputPath + "/relationSubset", Relation.class);

		log.info("Reading Graph table from: {}", inputResultPath);

		final String resultOutputPath = outputPath + "/resultSubset/" + resultType;

		readPath(spark, inputResultPath, resultClazz)
			.filter(
				(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible())
			.filter(
				(FilterFunction<R>) r -> Optional
					.ofNullable(r.getAuthor())
					.map(authors -> authors.stream().anyMatch(a -> hasAllowedPid(a, allowedPids)))
					.orElse(false))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(resultOutputPath);

		// the selected subset is materialised above and read back before being joined with the relations
		Dataset<R> result = readPath(spark, resultOutputPath, resultClazz);

		result
			.joinWith(relation, result.col("id").equalTo(relation.col("source")))
			.map(
				(MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> toResultOrcidList(
					t2._1(), t2._2(), allowedPids),
				Encoders.bean(ResultOrcidList.class))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputPath + "/" + resultType);

	}

	/**
	 * Builds the association between the target of a relation and the authors (with allowed pids) of the
	 * result the relation starts from.
	 *
	 * @param result      the result the relation starts from
	 * @param relation    the relation whose target receives the author list
	 * @param allowedPids the lower-cased pid types to be considered
	 * @return the association for the relation target
	 */
	private static ResultOrcidList toResultOrcidList(Result result, Relation relation, List<String> allowedPids) {
		ResultOrcidList rol = new ResultOrcidList();
		rol.setResultId(relation.getTarget());
		rol.setAuthorList(collectAutoritativeAuthors(result.getAuthor(), allowedPids));
		return rol;
	}

	/**
	 * Creates one {@link AutoritativeAuthor} for every allowed pid of every author. Authors without pids
	 * are skipped: the upstream selection only guarantees that at least one author of the result has an
	 * allowed pid, not that all of them have a pid list.
	 *
	 * @param authors     the authors of a result, possibly {@code null}
	 * @param allowedPids the lower-cased pid types to be considered
	 * @return the (possibly empty) list of authors with an allowed pid
	 */
	private static List<AutoritativeAuthor> collectAutoritativeAuthors(List<Author> authors,
		List<String> allowedPids) {
		List<AutoritativeAuthor> aal = new ArrayList<>();
		if (authors == null) {
			return aal;
		}
		for (Author a : authors) {
			if (a == null || a.getPid() == null) {
				continue;
			}
			for (StructuredProperty p : a.getPid()) {
				if (isAllowedPid(p, allowedPids)) {
					aal
						.add(
							AutoritativeAuthor
								.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));
				}
			}
		}
		return aal;
	}

	/**
	 * Checks if the author has at least one pid of an allowed type.
	 *
	 * @param a           the author to check, possibly {@code null}
	 * @param allowedPids the lower-cased pid types to be considered
	 * @return {@code true} if at least one pid of the author has an allowed type; {@code false} otherwise,
	 *         including when the author or its pid list is {@code null}
	 */
	private static boolean hasAllowedPid(Author a, List<String> allowedPids) {
		if (a == null || a.getPid() == null) {
			return false;
		}
		return a.getPid().stream().anyMatch(p -> isAllowedPid(p, allowedPids));
	}

	/**
	 * Checks if the class id of the pid qualifier, lower-cased, is among the allowed ones.
	 *
	 * @param pid         the pid to check, possibly {@code null}
	 * @param allowedPids the lower-cased pid types to be considered
	 * @return {@code true} if the pid type is allowed; {@code false} otherwise, including when the pid, its
	 *         qualifier or the qualifier class id is {@code null}
	 */
	private static boolean isAllowedPid(StructuredProperty pid, List<String> allowedPids) {
		return Optional
			.ofNullable(pid)
			.map(StructuredProperty::getQualifier)
			.map(q -> q.getClassid())
			.map(classid -> allowedPids.contains(classid.toLowerCase()))
			.orElse(false);
	}

}
- 2020-04-30 11:05:17 +02:00
refactoring 2020-04-16 15:53:34 +02:00			`package eu.dnetlib.dhp.orcidtoresultfromsemrel;`

refactoring 2020-04-23 12:35:49 +02:00			`import static eu.dnetlib.dhp.PropagationConstant.*;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;`
refactoring 2020-04-23 12:35:49 +02:00
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import java.util.ArrayList;`
refactoring 2020-04-23 12:35:49 +02:00			`import java.util.Arrays;`
			`import java.util.List;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import java.util.Optional;`
[Enrichment Step] get rid of hive 2022-04-12 11:26:48 +02:00			`import java.util.stream.Collectors;`
- 2020-04-30 11:05:17 +02:00
refactoring 2020-04-16 15:53:34 +02:00			`import org.apache.commons.io.IOUtils;`
			`import org.apache.spark.SparkConf;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import org.apache.spark.api.java.function.FilterFunction;`
[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00			`import org.apache.spark.api.java.function.ForeachFunction;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import org.apache.spark.api.java.function.MapFunction;`
refactoring 2020-04-16 15:53:34 +02:00			`import org.apache.spark.sql.Dataset;`
			`import org.apache.spark.sql.Encoders;`
heavy cleanup 2020-05-07 18:22:26 +02:00			`import org.apache.spark.sql.SaveMode;`
refactoring 2020-04-16 15:53:34 +02:00			`import org.apache.spark.sql.SparkSession;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import org.apache.spark.sql.sources.v2.reader.InputPartition;`
refactoring 2020-04-16 15:53:34 +02:00			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`

[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00			`import com.fasterxml.jackson.databind.ObjectMapper;`
- 2020-04-30 11:05:17 +02:00			`import com.google.gson.Gson;`

			`import eu.dnetlib.dhp.application.ArgumentApplicationParser;`
code formatting 2020-12-02 11:23:49 +01:00			`import eu.dnetlib.dhp.schema.common.ModelConstants;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import eu.dnetlib.dhp.schema.oaf.Author;`
- 2020-04-30 11:05:17 +02:00			`import eu.dnetlib.dhp.schema.oaf.Relation;`
			`import eu.dnetlib.dhp.schema.oaf.Result;`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`import eu.dnetlib.dhp.schema.oaf.StructuredProperty;`
			`import scala.Tuple2;`
- 2020-04-30 11:05:17 +02:00
first phase of data preparation. For each result type (parallel) it produces the possible updates 2020-04-16 15:58:42 +02:00			`public class PrepareResultOrcidAssociationStep1 {`
- 2020-04-30 11:05:17 +02:00			`private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);`

			`public static void main(String[] args) throws Exception {`
heavy cleanup 2020-05-07 18:22:26 +02:00			`String jsonConf = IOUtils`
- 2020-04-30 11:05:17 +02:00			`.toString(`
heavy cleanup 2020-05-07 18:22:26 +02:00			`PrepareResultOrcidAssociationStep1.class`
- 2020-04-30 11:05:17 +02:00			`.getResourceAsStream(`
			`"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));`

heavy cleanup 2020-05-07 18:22:26 +02:00			`final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);`
- 2020-04-30 11:05:17 +02:00			`parser.parseArgument(args);`

			`Boolean isSparkSessionManaged = isSparkSessionManaged(parser);`
			`log.info("isSparkSessionManaged: {}", isSparkSessionManaged);`

			`String inputPath = parser.get("sourcePath");`
			`log.info("inputPath: {}", inputPath);`

[Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00			`final String outputPath = parser.get("workingPath");`
- 2020-04-30 11:05:17 +02:00			`log.info("outputPath: {}", outputPath);`

			`final String resultClassName = parser.get("resultTableName");`
			`log.info("resultTableName: {}", resultClassName);`

[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00			`final List<String> allowedsemrel = Arrays`
			`.stream(parser.get("allowedsemrels").split(";"))`
			`.map(s -> s.toLowerCase())`
			`.collect(Collectors.toList());`
[Enrichment Step] get rid of hive 2022-04-12 11:26:48 +02:00
- 2020-04-30 11:05:17 +02:00			`log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));`

[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`final List<String> allowedPids = Arrays.asList(parser.get("allowedpids").split(";"));`
			`log.info("allowedPids: {}", new Gson().toJson(allowedPids));`

- 2020-04-30 11:05:17 +02:00			`final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();`
			`log.info("resultType: {}", resultType);`

			`Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);`

			`SparkConf conf = new SparkConf();`

[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`runWithSparkSession(`
- 2020-04-30 11:05:17 +02:00			`conf,`
			`isSparkSessionManaged,`
			`spark -> {`
[Enrichment Step] get rid of hive 2022-04-14 08:50:37 +02:00			`// removeOutputDir(spark, outputPath);`
- 2020-04-30 11:05:17 +02:00			`prepareInfo(`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`spark, inputPath, outputPath, resultType, resultClazz, allowedsemrel, allowedPids);`
- 2020-04-30 11:05:17 +02:00			`});`
			`}`

			`private static <R extends Result> void prepareInfo(`
			`SparkSession spark,`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`String inputPath,`
			`String outputPath,`
			`String resultType,`
- 2020-04-30 11:05:17 +02:00			`Class<R> resultClazz,`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`List<String> allowedsemrel,`
			`List<String> allowedPids) {`

			`final String inputResultPath = inputPath + "/" + resultType;`
- 2020-04-30 11:05:17 +02:00
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`Dataset<Relation> relation = readPath(spark, outputPath + "/relationSubset", Relation.class);`
- 2020-04-30 11:05:17 +02:00
heavy cleanup 2020-05-07 18:22:26 +02:00			`log.info("Reading Graph table from: {}", inputResultPath);`
[Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:34:48 +02:00
[Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00			`final String resultOutputPath = outputPath + "/resultSubset/" + resultType;`

[Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:39:13 +02:00			`readPath(spark, inputResultPath, resultClazz)`
			`.filter(`
			`(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible())`
			`.filter(`
			`(FilterFunction<R>) r -> Optional`
			`.ofNullable(r.getAuthor())`
			`.map(`
			`al -> al`
			`.stream()`
			`.anyMatch(`
			`a -> hasAllowedPid(a, allowedPids)))`
			`.orElse(false)`

			`)`
			`.write()`
			`.mode(SaveMode.Overwrite)`
			`.option("compression", "gzip")`
[Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00			`.json(resultOutputPath);`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00
[Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00			`Dataset<R> result = readPath(spark, resultOutputPath, resultClazz);`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00
[Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00			`// result.foreach((ForeachFunction<R>) r -> System.out.println(new ObjectMapper().writeValueAsString(r)));`
[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`result`
[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00			`.joinWith(relation, result.col("id").equalTo(relation.col("source")))`
			`.map((MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> {`
			`ResultOrcidList rol = new ResultOrcidList();`
			`rol.setResultId(t2._2().getTarget());`
			`List<AutoritativeAuthor> aal = new ArrayList<>();`
			`t2._1().getAuthor().stream().forEach(a -> {`
			`a.getPid().stream().forEach(p -> {`
			`if (allowedPids.contains(p.getQualifier().getClassid().toLowerCase())) {`
			`aal`
			`.add(`
			`AutoritativeAuthor`
			`.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));`
			`}`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`});`
[Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00			`});`
			`rol.setAuthorList(aal);`
			`return rol;`
			`}, Encoders.bean(ResultOrcidList.class))`
			`.write()`
heavy cleanup 2020-05-07 18:22:26 +02:00			`.option("compression", "gzip")`
			`.mode(SaveMode.Overwrite)`
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`.json(outputPath + "/" + resultType);`
[Enrichment Step] get rid of hive 2022-04-12 11:26:48 +02:00
[Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00			`}`

			`private static boolean hasAllowedPid(Author a, List<String> allowedPids) {`
			`Optional<List<StructuredProperty>> oPid = Optional.ofNullable(a.getPid());`
			`if (!oPid.isPresent()) {`
			`return false;`
			`}`
			`return oPid.get().stream().anyMatch(p -> allowedPids.contains(p.getQualifier().getClassid().toLowerCase()));`

- 2020-04-30 11:05:17 +02:00			`}`
heavy cleanup 2020-05-07 18:22:26 +02:00
refactoring 2020-04-16 15:53:34 +02:00			`}`