dnet-hadoop/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociati...

package eu.dnetlib.dhp.orcidtoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import scala.Tuple2;
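
/**
 * Second step of the ORCID-to-result propagation preparation: reads the per-result-type
 * {@link ResultOrcidList} associations found under the source path (publication, dataset,
 * otherresearchproduct, software), unions them and merges the author lists of records that
 * refer to the same result id, keeping each ORCID at most once. The merged associations are
 * written to the output path as gzip-compressed JSON.
 */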
public class PrepareResultOrcidAssociationStep2 {

    private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                PrepareResultOrcidAssociationStep2.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                removeOutputDir(spark, outputPath);
                mergeInfo(spark, inputPath, outputPath);
            });
    }
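
    /**
     * Unions the associations prepared for each result type and reduces them by result id,
     * merging the author lists so that each ORCID appears at most once per result. The merged
     * records are saved as gzip-compressed JSON text files.
     */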
    private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) {

        // read the associations prepared for each of the four result types and union them
        Dataset<ResultOrcidList> resultOrcidAssoc = readPath(spark, inputPath + "/publication", ResultOrcidList.class)
            .union(readPath(spark, inputPath + "/dataset", ResultOrcidList.class))
            .union(readPath(spark, inputPath + "/otherresearchproduct", ResultOrcidList.class))
            .union(readPath(spark, inputPath + "/software", ResultOrcidList.class));

        resultOrcidAssoc
            .toJavaRDD()
            .mapToPair(r -> new Tuple2<>(r.getResultId(), r))
            .reduceByKey(
                (a, b) -> {
                    if (a == null) {
                        return b;
                    }
                    if (b == null) {
                        return a;
                    }
                    // merge the two author lists, keeping each ORCID only once
                    Set<String> orcidSet = new HashSet<>();
                    a.getAuthorList().forEach(aa -> orcidSet.add(aa.getOrcid()));
                    b
                        .getAuthorList()
                        .forEach(
                            aa -> {
                                if (!orcidSet.contains(aa.getOrcid())) {
                                    a.getAuthorList().add(aa);
                                    orcidSet.add(aa.getOrcid());
                                }
                            });
                    return a;
                })
            .map(Tuple2::_2)
            .map(r -> OBJECT_MAPPER.writeValueAsString(r))
            .saveAsTextFile(outputPath, GzipCodec.class);
    }
}