Merge branch '8172_impact_indicators_workflow' of https://code-repo.d4science.org/D-Net/dnet-hadoop into 8172_impact_indicators_workflow

2023-04-28 13:23:49 +03:00 · 2023-04-28 13:23:49 +03:00 · a98da54896
parent 09485fbee3 614cc1089b
commit a98da54896
19 changed files with 474 additions and 98 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -9,6 +9,7 @@ import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
 import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
+import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
 import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
@ -40,7 +41,8 @@ import scala.Tuple2;
 */
 public class SparkAtomicActionScoreJob implements Serializable {
-	private static final String DOI = "doi";
+	private static final String RESULT = "result";
 	private static final String PROJECT = "project";
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -56,18 +58,17 @@ public class SparkAtomicActionScoreJob implements Serializable {
 		parser.parseArgument(args);
-		Boolean isSparkSessionManaged = Optional
+		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 			.ofNullable(parser.get("isSparkSessionManaged"))
 			.map(Boolean::valueOf)
 			.orElse(Boolean.TRUE);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 		final String inputPath = parser.get("inputPath");
-		log.info("inputPath {}: ", inputPath);
+		log.info("inputPath: {}", inputPath);
 		final String outputPath = parser.get("outputPath");
-		log.info("outputPath {}: ", outputPath);
+		log.info("outputPath: {}", outputPath);
 		final String targetEntity = parser.get("targetEntity");
 		log.info("targetEntity: {}", targetEntity);
 		SparkConf conf = new SparkConf();
@ -76,17 +77,48 @@ public class SparkAtomicActionScoreJob implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, outputPath);
-				prepareResults(spark, inputPath, outputPath);
+
-			});
+				// follow different procedures for different target entities
 				switch (targetEntity) {
 					case RESULT:
 						prepareResults(spark, inputPath, outputPath);
 						break;
 					case PROJECT:
 						prepareProjects(spark, inputPath, outputPath);
 						break;
 					default:
 						throw new RuntimeException("Unknown target entity: " + targetEntity);
 				}
 			}
 		);
 	}
 	/**
 	 * Builds the action set for project-level BIP indicators.
 	 *
 	 * Reads one JSON record per line from {@code inputPath} into {@link BipProjectModel} beans, turns each
 	 * into a {@link Project} carrying only the project id and its measures, wraps every project in an
 	 * {@link AtomicAction} and stores the actions under {@code outputPath} as a Hadoop sequence file of
 	 * (payload class name, JSON-serialized action) {@link Text} pairs.
 	 *
 	 * NOTE(review): the type parameter {@code I} is not referenced anywhere in this method.
 	 *
 	 * @param spark the active Spark session
 	 * @param inputPath location of the project score records (one JSON object per line)
 	 * @param outputPath location where the sequence file is written
 	 */
 	private static <I extends Project> void prepareProjects(SparkSession spark, String inputPath, String outputPath) {
 		// read input bip project scores
 		Dataset<BipProjectModel> projectScores = readPath(spark, inputPath, BipProjectModel.class);
 		// convert each score record into a Project holding just the id and the derived measures
 		projectScores.map( (MapFunction<BipProjectModel, Project>) bipProjectScores -> {
 			Project project = new Project();
 			project.setId(bipProjectScores.getProjectId());
 			project.setMeasures(bipProjectScores.toMeasures());
 			return project;
 		}, Encoders.bean(Project.class))
 		.toJavaRDD()
 		// wrap each project into an atomic action targeting the Project class
 		.map(p -> new AtomicAction(Project.class, p))
 		// key: canonical name of the payload class, value: the action serialized as JSON
 		.mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
 				new Text(OBJECT_MAPPER.writeValueAsString(aa))))
 		.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
 	}
 	private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
+		JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
 			.textFile(bipScorePath)
-			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
+			.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));
 		Dataset<BipScore> bipScores = spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
@ -159,12 +191,4 @@ public class SparkAtomicActionScoreJob implements Serializable {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
 	/**
 	 * Loads a text file where every line is a JSON document and decodes each line into a bean.
 	 *
 	 * @param spark the active Spark session
 	 * @param inputPath location of the text file(s) to read
 	 * @param clazz bean class each line is deserialized into (also used to build the bean encoder)
 	 * @param <R> type of the resulting dataset elements
 	 * @return a dataset with one {@code R} instance per input line
 	 */
 	public static <R> Dataset<R> readPath(
 		SparkSession spark, String inputPath, Class<R> clazz) {
 		// Jackson-based decoder applied to every single line of the input
 		final MapFunction<String, R> parseLine = line -> OBJECT_MAPPER.readValue(line, clazz);
 		final Dataset<String> lines = spark.read().textFile(inputPath);
 		return lines.map(parseLine, Encoders.bean(clazz));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java
@ -0,0 +1,69 @@
 package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
 import com.opencsv.bean.CsvBindByPosition;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.NoArgsConstructor;
 import lombok.Setter;
 import eu.dnetlib.dhp.schema.oaf.Measure;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import static eu.dnetlib.dhp.actionmanager.Constants.*;
/**
 * Bean holding the BIP impact indicators computed for a single project.
 *
 * All values are kept as strings; accessors and constructors are generated by Lombok.
 * {@link #toMeasures()} converts the four indicators into OAF {@link Measure} objects.
 */
@NoArgsConstructor
@AllArgsConstructor
@Getter
@Setter
 public class BipProjectModel {
    // identifier of the project the indicators refer to
    String projectId;
    // the four indicator values; each one becomes a separate Measure in toMeasures()
    String numOfInfluentialResults;
    String numOfPopularResults;
    String totalImpulse;
    String totalCitationCount;
    // each project bip measure has exactly one value, hence one key-value pair
    /**
     * Builds a measure with the given id whose single unit is a key-value pair ("score", measureValue).
     *
     * @param measureId id assigned to the measure
     * @param measureValue value stored under the "score" key
     * @return the measure wrapping exactly one key-value unit
     */
    private Measure createMeasure(String measureId, String measureValue) {
        KeyValue kv = new KeyValue();
        kv.setKey("score");
        kv.setValue(measureValue);
        // NOTE(review): the meaning of the positional boolean flags and of the trailing empty string
        // is defined by OafMapperUtils.dataInfo - confirm against its signature
        kv.setDataInfo(
            OafMapperUtils.dataInfo(
                false,
                UPDATE_DATA_INFO_TYPE,
                true,
                false,
                OafMapperUtils.qualifier(
                    UPDATE_MEASURE_BIP_CLASS_ID,
                    UPDATE_CLASS_NAME,
                    ModelConstants.DNET_PROVENANCE_ACTIONS,
                    ModelConstants.DNET_PROVENANCE_ACTIONS),
    "")
        );
        Measure measure = new Measure();
        measure.setId(measureId);
        measure.setUnit(Collections.singletonList(kv));
        return measure;
    }
    /**
     * Converts the indicators of this project into measures.
     *
     * @return a fixed-size list with one measure per indicator, in the order numOfInfluentialResults,
     *         numOfPopularResults, totalImpulse, totalCitationCount; measure ids equal the field names
     */
    public List<Measure> toMeasures() {
        return Arrays.asList(
            createMeasure("numOfInfluentialResults", numOfInfluentialResults),
            createMeasure("numOfPopularResults", numOfPopularResults),
            createMeasure("totalImpulse", totalImpulse),
            createMeasure("totalCitationCount", totalCitationCount)
        );
    }
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java
@ -1,5 +1,7 @@
-package eu.dnetlib.dhp.actionmanager.bipmodel;
+package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
 import eu.dnetlib.dhp.actionmanager.bipmodel.Score;
 import java.io.Serializable;
 import java.util.ArrayList;
@ -11,9 +13,9 @@ import java.util.List;
 * Only needed for deserialization purposes
 */
-public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
+public class BipResultModel extends HashMap<String, List<Score>> implements Serializable {
-	public BipDeserialize() {
+	public BipResultModel() {
 		super();
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
@ -24,7 +24,7 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
+import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
 import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
@ -82,9 +82,9 @@ public class PrepareBipFinder implements Serializable {
 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
+		JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
 			.textFile(inputPath)
-			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
+			.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));
 		spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
@ -16,5 +16,11 @@
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  },
  {
    "paramName": "te",
    "paramLongName": "targetEntity",
    "paramDescription": "the type of target entity to be enriched; currently supported one of { 'result', 'project' }",
    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -6,8 +6,9 @@ import static org.junit.jupiter.api.Assertions.*;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Project;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
@ -27,7 +28,6 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;
 public class SparkAtomicActionScoreJobTest {
@ -37,8 +37,11 @@ public class SparkAtomicActionScoreJobTest {
 	private static SparkSession spark;
 	private static Path workingDir;
-	private static final Logger log = LoggerFactory
+
-		.getLogger(SparkAtomicActionScoreJobTest.class);
+	private final static String RESULT = "result";
 	private final static String PROJECT = "project";
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class);
 	@BeforeAll
 	public static void beforeAll() throws IOException {
@ -69,29 +72,31 @@ public class SparkAtomicActionScoreJobTest {
 		spark.stop();
 	}
 	/**
 	 * Launches {@link SparkAtomicActionScoreJob} against the shared (unmanaged) test Spark session.
 	 *
 	 * @param inputPath location of the score records fed to the job
 	 * @param outputPath location where the job writes the action set
 	 * @param targetEntity entity type to enrich, forwarded as the targetEntity argument
 	 */
 	private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception {
 		final String unmanagedSession = Boolean.FALSE.toString();
 		final String[] jobArgs = {
 			"-isSparkSessionManaged", unmanagedSession,
 			"-inputPath", inputPath,
 			"-outputPath", outputPath,
 			"-targetEntity", targetEntity
 		};
 		SparkAtomicActionScoreJob.main(jobArgs);
 	}
 	@Test
-	void testMatch() throws Exception {
+	void testResultScores() throws Exception {
-		String bipScoresPath = getClass()
+		final String targetEntity = RESULT;
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
+		String inputResultScores = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json")
 			.getPath();
 		String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet";
-		SparkAtomicActionScoreJob
+		// execute the job to generate the action sets for result scores
-			.main(
+		runJob(inputResultScores, outputPath, targetEntity);
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					bipScoresPath,
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Result> tmp = sc
-			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
+			.sequenceFile(outputPath, Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Result) aa.getPayload()));
@ -140,4 +145,61 @@ public class SparkAtomicActionScoreJobTest {
 	}
 	/**
 	 * Runs the job for the project target entity and verifies the produced action set: the number of
 	 * projects, the presence of a known project, and the id, key and value of each of its measures.
 	 */
 	@Test
 	void testProjectScores() throws Exception {
 		final String targetEntity = PROJECT;
 		final String inputProjectScores = getClass()
 				.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json")
 				.getPath();
 		final String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet";
 		// execute the job to generate the action sets for project scores
 		runJob(inputProjectScores, outputPath, targetEntity);
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Project> projects = sc
 				.sequenceFile(outputPath, Text.class, Text.class)
 				.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 				.map(aa -> ((Project) aa.getPayload()));
 		// test the number of projects
 		assertEquals(4, projects.count());
 		String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833";
 		// count that the project with id testProjectId is present
 		assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count());
 		// the per-measure checks below run inside foreach and would pass vacuously on an empty
 		// measure list, so first make sure that all four project measures are actually present
 		assertEquals(
 			4,
 			projects.filter(row -> row.getId().equals(testProjectId))
 				.flatMap(r -> r.getMeasures().iterator())
 				.count());
 		projects.filter(row -> row.getId().equals(testProjectId))
 			.flatMap(r -> r.getMeasures().iterator())
 			.foreach(m -> {
 				log.info(m.getId() + " " + m.getUnit());
 				// ensure that only one score is present for each bip impact measure
 				assertEquals(1, m.getUnit().size());
 				KeyValue kv = m.getUnit().get(0);
 				// ensure that the correct key is provided, i.e. score
 				assertEquals("score", kv.getKey());
 				switch(m.getId()) {
 					case "numOfInfluentialResults":
 						assertEquals("0", kv.getValue());
 						break;
 					case "numOfPopularResults":
 						assertEquals("1", kv.getValue());
 						break;
 					case "totalImpulse":
 						assertEquals("25", kv.getValue());
 						break;
 					case "totalCitationCount":
 						assertEquals("43", kv.getValue());
 						break;
 					default:
 						fail("Unknown measure id in the context of projects");
 				}
 			});
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json
@ -0,0 +1,4 @@
 {"projectId":"40|nsf_________::d93e50d22374a1cf59f6a232413ea027","numOfInfluentialResults":0,"numOfPopularResults":10,"totalImpulse":181,"totalCitationCount":235}
 {"projectId":"40|nih_________::1c93debc7085e440f245fbe70b2e8b21","numOfInfluentialResults":14,"numOfPopularResults":17,"totalImpulse":1558,"totalCitationCount":4226}
 {"projectId":"40|nih_________::c02a8233e9b60f05bb418f0c9b714833","numOfInfluentialResults":0,"numOfPopularResults":1,"totalImpulse":25,"totalCitationCount":43}
 {"projectId":"40|corda_______::d91dcf3a87dd7f72248fab0b8a4ba273","numOfInfluentialResults":2,"numOfPopularResults":3,"totalImpulse":78,"totalCitationCount":178}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json
--- a/dhp-workflows/dhp-impact-indicators/README.md
+++ b/dhp-workflows/dhp-impact-indicators/README.md
@ -1,4 +1,4 @@
-# Ranking Workflow for Openaire Publications
+# Ranking Workflow for OpenAIRE Publications
 This project contains the files for running a paper ranking workflow on the OpenAIRE graph using Apache Oozie.
 All scripts are written in Python and the project setup follows the typical Oozie workflow structure:
@ -7,17 +7,15 @@ All scripts are written in python and the project setup follows the typical oozi
 - a job.properties file specifying parameter values for the parameters used by the workflow
 - a set of python scripts used by the workflow
-**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker.
+**NOTE**: the workflow depends on the external library of ranking scripts called [BiP! Ranker](https://github.com/athenarc/Bip-Ranker).
 You can check out a specific tag/release of BiP! Ranker using Maven, as described in the following section.
-## Check out a specific tag/release of BIP-Ranker
+## Build and deploy
-* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out.
+Use the following command for packaging:
 * Then, use maven to perform the checkout:
 ```
-mvn scm:checkout
+mvn package  -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests
 ```
-* The code should be visible under `src/main/bip-ranker` folder.
+Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use.
--- a/dhp-workflows/dhp-impact-indicators/pom.xml
+++ b/dhp-workflows/dhp-impact-indicators/pom.xml
@ -5,9 +5,8 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
-        <artifactId>dhp</artifactId>
+        <artifactId>dhp-workflows</artifactId>
        <version>1.2.5-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
    <artifactId>dhp-impact-indicators</artifactId>
@ -16,6 +15,9 @@
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!--   Use this property to fetch a specific tag      -->
        <bip.ranker.tag>v1.0.0</bip.ranker.tag>
    </properties>
    <scm>
@ -32,10 +34,29 @@
                <configuration>
                    <connectionType>connection</connectionType>
                    <scmVersionType>tag</scmVersionType><!-- 'branch' can also be provided here -->
-                    <scmVersion>v1.0.0</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
+                    <scmVersion>${bip.ranker.tag}</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-                    <checkoutDirectory>${project.build.directory}/../src/main/bip-ranker</checkoutDirectory>
+                    <checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/bip-ranker</checkoutDirectory>
                </configuration>
                <executions>
                    <execution>
                        <id>checkout-bip-ranker</id>
                        <phase>prepare-package</phase>
                        <goals>
                            <goal>checkout</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-aggregation</artifactId>
            <version>${projectVersion}</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
 </project>
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
@ -90,3 +90,6 @@ oozie.wf.application.path=${wfAppPath}
 # Path where the final output should be?
 actionSetOutputPath=${workflowDataDir}/bip_actionsets/
 # The directory to store project impact indicators
 projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py
@ -0,0 +1,108 @@
 import sys
 from pyspark.sql import SparkSession
 from pyspark import SparkConf, SparkContext
 import pyspark.sql.functions as F
 from pyspark.sql.types import StringType, IntegerType, StructType, StructField
 if len(sys.argv) < 8:
    print("Usage: projects_impact.py <relations_folder> <influence_file> <popularity_file> <cc_file> <impulse_file> <num_partitions> <output_dir>")
    sys.exit(-1)
 appName = 'Project Impact Indicators'
 conf = SparkConf().setAppName(appName)
 sc = SparkContext(conf = conf)
 spark = SparkSession.builder.appName(appName).getOrCreate()
 sc.setLogLevel('OFF')
 # input parameters
 relations_fd = sys.argv[1]
 influence_fd = sys.argv[2]
 popularity_fd = sys.argv[3]
 cc_fd = sys.argv[4]
 impulse_fd = sys.argv[5]
 num_partitions = int(sys.argv[6])
 output_dir = sys.argv[7]
 # schema for impact indicator files
 impact_files_schema = StructType([
    StructField('resultId', StringType(), False),
    StructField('score', IntegerType(), False),
    StructField('class', StringType(), False),
 ])
 # list of impact indicators
 impact_indicators = [
    ('influence', influence_fd, 'class'),
    ('popularity', popularity_fd, 'class'),
    ('impulse', impulse_fd, 'score'),
    ('citation_count', cc_fd, 'score')
 ]
 '''
    * Read impact indicator file and return a dataframe with the following schema:
    *   resultId: String
    *   <indicator_name>: Integer when the 'score' column is selected, String when the 'class' column is selected
 '''
 def read_df(fd, indicator_name, column_name):
    # parse the header-less, tab-separated indicator file using the shared schema
    reader = spark.read.schema(impact_files_schema)
    reader = reader.option('delimiter', '\t').option('header', False)
    scores = reader.csv(fd)
    # keep only the result id and the requested column, renamed after the indicator
    selected = scores.select('resultId', F.col(column_name).alias(indicator_name))
    # partition by result id, the column later used to join with the relations
    return selected.repartition(num_partitions, 'resultId')
 # Debug helper: print the first 50 rows of the dataframe, then its schema, then its row count
 def print_df(df):
    df.show(50)
    df.printSchema()
    print(df.count())
 # Returns a column expression that keeps the column's value when it differs from the given value
 # and yields null otherwise
 def set_class_value_to_null(column, value):
    return F.when(column != value, column).otherwise(F.lit(None))
 # load and filter Project-to-Result relations
 print("Reading relations")
 relations = spark.read.json(relations_fd)\
 			.select(F.col('source').alias('projectId'), F.col('target').alias('resultId'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\
 			.where( (F.col('relClass') == 'produces') \
 				& (F.col('deletedbyinference') == "false")\
                & (F.col('invisible') == "false"))\
 			.drop('deletedbyinference')\
 			.drop('invisible')\
            .drop('relClass')\
 			.repartition(num_partitions, 'resultId')
 for indicator_name, fd, column_name in impact_indicators:
    print("Reading {} '{}' field from file".format(indicator_name, column_name))
    df = read_df(fd, indicator_name, column_name)
    # sets a zero value to the indicator column if the value is C5
    if (column_name == 'class'):
        df = df.withColumn(indicator_name, F.when(F.col(indicator_name).isin("C5"), 0).otherwise(1))
    # print_df(df)
    print("Joining {} to relations".format(indicator_name))
    # NOTE: we use inner join because we want to keep only the results that have an impact score
    # also note that all impact scores have the same set of results
    relations = relations.join(df, 'resultId', 'inner')\
        .repartition(num_partitions, 'resultId')
 # uncomment to print non-null values count for each indicator
 # for indicator_name, fd, column_name in impact_indicators:
 #     print("Counting non null values for {}".format(indicator_name))
 #     print(relations.filter(F.col(indicator_name).isNotNull()).count())
 # sum the impact indicator values for each project
 relations.groupBy('projectId')\
    .agg(\
        F.sum('influence').alias('numOfInfluentialResults'),\
        F.sum('popularity').alias('numOfPopularResults'),\
        F.sum('impulse').alias('totalImpulse'),\
        F.sum('citation_count').alias('totalCitationCount')\
    )\
    .write.mode("overwrite")\
    .json(output_dir, compression="gzip")
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@ -15,6 +15,8 @@
 			<case to="map-openaire-to-doi">${resume eq "map-ids"}</case> 
 			<case to="map-scores-to-dois">${resume eq "map-scores"}</case> 
 			<case to="create-openaire-ranking-graph">${resume eq "start"}</case>
 			<case to="project-impact-indicators">${resume eq "projects-impact"}</case>
 			<!-- TODO: add action set creation here -->
 			<default to="create-openaire-ranking-graph" />
 		</switch>
@ -33,7 +35,6 @@
 				<delete path="${synonymFolder}"/>
 			</prepare>
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -88,9 +89,8 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
-            		
+			<!-- using configs from an example on openaire -->
-            		<!-- using configs from an example on openaire --> 
+			<master>yarn-cluster</master>
            		<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
@ -130,7 +130,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -179,9 +178,8 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
-            		
+			<!-- using configs from an example on openaire -->
-            		<!-- using configs from an example on openaire --> 
+			<master>yarn-cluster</master>
            		<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
@ -233,7 +231,7 @@
 			<!-- Reference says: The master element indicates the url of the Spark Master. Ex: spark://host:port, mesos://host:port, yarn-cluster, yarn-master, or local. -->
 			<!-- <master>local[*]</master> -->
 			<!-- Reference says: The mode element if present indicates the mode of spark, where to run spark driver program. Ex: client,cluster. | In my case I always have a client -->
-            		<!-- <mode>client</mode> --> 
+			<!-- <mode>client</mode> -->
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
@ -334,12 +332,12 @@
 			<!-- This should give the machine/root of the hdfs -->
 			<name-node>${nameNode}</name-node>
-            		<!-- Exec is needed foor shell comands - points to type of shell command -->
+			<!-- Exec is needed for shell commands - points to type of shell command -->
-            		<exec>/usr/bin/bash</exec>
+			<exec>/usr/bin/bash</exec>
-            		<!-- name of script to run -->
+			<!-- name of script to run -->
-            		<argument>get_ranking_files.sh</argument>
+			<argument>get_ranking_files.sh</argument>
-            		<!-- We only pass the directory where we expect to find the rankings -->
+			<!-- We only pass the directory where we expect to find the rankings -->
-            		<argument>/${workflowDataDir}</argument>
+			<argument>/${workflowDataDir}</argument>
 			<!-- the name of the file run -->
 			<file>${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh</file>
@ -372,8 +370,8 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
-            		<!-- using configs from an example on openaire --> 
+			<!-- using configs from an example on openaire -->
-            		<master>yarn-cluster</master>
+			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
@ -420,8 +418,8 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
-            		<!-- using configs from an example on openaire --> 
+			<!-- using configs from an example on openaire -->
-            		<master>yarn-cluster</master>
+			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
@ -475,7 +473,6 @@
 				<delete path="${synonymFolder}"/>
 			</prepare>
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -518,7 +515,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -558,21 +554,19 @@
 	</action>	
-	    <action name="deleteOutputPathForActionSet">
+	<action name="deleteOutputPathForActionSet">
        <fs>
-            <delete path="${actionSetOutputPath}"/>
+            <delete path="${actionSetOutputPath}/results/"/>
-            <mkdir path="${actionSetOutputPath}"/>
+			<delete path="${actionSetOutputPath}/projects/"/>
-            <!--
+
-			<delete path="${workingDir}"/>
+			<mkdir path="${actionSetOutputPath}/results/"/>
-            <mkdir path="${workingDir}"/>
+			<mkdir path="${actionSetOutputPath}/projects/"/>
-			--> 
+		</fs>
-        </fs>
+        <ok to="createActionSetForResults"/>
        <ok to="createActionSet"/>
        <error to="actionset-delete-fail"/>
    </action>
-
+    <action name="createActionSetForResults">
    <action name="createActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
@ -590,13 +584,90 @@
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${bipScorePath}</arg>
-            <arg>--outputPath</arg><arg>${actionSetOutputPath}</arg>
+            <arg>--outputPath</arg><arg>${actionSetOutputPath}/results/</arg>
-        </spark>
+			<arg>--targetEntity</arg><arg>result</arg>
-        <ok to="end"/>
+		</spark>
        <ok to="project-impact-indicators"/>
        <error to="actionset-creation-fail"/>
    </action>
 	<!-- Runs the projects_impact.py Spark script, passing it the graph relations folder and the four result indicator files listed below -->
 	<action name="project-impact-indicators">
 		<!-- The spark action element is required for every Spark job, whether it runs a JVM class or a Python script -->
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<!-- Job tracker address, taken from the jobTracker workflow property -->
 			<job-tracker>${jobTracker}</job-tracker>
 			<!-- HDFS name node (root of the file system), taken from the nameNode workflow property -->
 			<name-node>${nameNode}</name-node>
 			<!-- Master and mode values copied from an existing OpenAIRE workflow example. -->
 			<!-- NOTE(review): spark-opts below also sets the master (yarn) and the deploy mode (cluster); confirm which setting takes effect -->
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
 			<!-- This is the name of our job -->
 			<name>Project Impact Indicators</name>
 			<!-- Script to execute; it is shipped with the job through the file element at the end of this action -->
 			<jar>projects_impact.py</jar>
 			<!-- Spark configuration options, mostly taken from other dhp workflows. -->
 			<!-- The shuffle partition count set here has the same value (7680) as the partitions argument passed to the script below -->
 			<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
 				--master yarn
 				--deploy-mode cluster
 				--conf spark.sql.shuffle.partitions=7680
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
 			<!-- Positional script arguments follow; their order matters -->
 			<!-- graph data folder from which to read relations -->
 			<arg>${openaireDataInput}/relations</arg>
 			<!-- Input files with impact indicators for results; the file names are read from the data emitted by the 'get-file-names' action -->
 			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
 			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
 			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
 			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
 			<!-- number of partitions to be used on joins -->
 			<arg>7680</arg>
 			<!-- Presumably the output location of the script: the same property is the inputPath of the createActionSetForProjects action (TODO confirm) -->
 			<arg>${projectImpactIndicatorsOutput}</arg>
 			<!-- Ships the script from the workflow application path to the job, exposed under the name projects_impact.py -->
 			<file>${wfAppPath}/projects_impact.py#projects_impact.py</file>
 		</spark>
 		<!-- On success, continue with the creation of the action set for projects -->
 		<ok to="createActionSetForProjects" />
 		<!-- On error, go to the dedicated kill node -->
 		<error to="project-impact-indicators-fail" />
 	</action>
 	<!-- Runs SparkAtomicActionScoreJob on the project impact indicators to produce the action set for projects -->
 	<action name="createActionSetForProjects">
 		<spark xmlns="uri:oozie:spark-action:0.2">
 			<master>yarn</master>
 			<mode>cluster</mode>
 			<name>Produces the atomic action with the bip finder scores for projects</name>
 			<!-- Main class of the job, packaged in the dhp-aggregation jar named below -->
 			<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
 			<jar>dhp-aggregation-${projectVersion}.jar</jar>
 			<!-- Executor and driver sizing come from workflow properties rather than being hard-coded -->
 			<spark-opts>
 				--executor-memory=${sparkExecutorMemory}
 				--executor-cores=${sparkExecutorCores}
 				--driver-memory=${sparkDriverMemory}
 				--conf spark.extraListeners=${spark2ExtraListeners}
 				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
 				--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
 			</spark-opts>
 			<!-- Input: the path that the project-impact-indicators action receives as its last argument -->
 			<arg>--inputPath</arg><arg>${projectImpactIndicatorsOutput}</arg>
 			<!-- Output: the projects sub-folder that deleteOutputPathForActionSet deletes and recreates -->
 			<arg>--outputPath</arg><arg>${actionSetOutputPath}/projects/</arg>
 			<!-- The value "project" matches the PROJECT constant declared in SparkAtomicActionScoreJob -->
 			<arg>--targetEntity</arg><arg>project</arg>
 		</spark>
 		<!-- Last action of the workflow: success leads to the end node -->
 		<ok to="end"/>
 		<error to="actionset-project-creation-fail"/>
 	</action>
 	<!-- TODO: end the workflow-->
@ -641,7 +712,14 @@
 	</kill>	
 	<kill name="actionset-creation-fail">
-		<message>ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+		<message>ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<!-- Kill node targeted by the error transition of the project-impact-indicators action -->
 	<kill name="project-impact-indicators-fail">
 		<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 	<!-- Kill node targeted by the error transition of the createActionSetForProjects action -->
 	<kill name="actionset-project-creation-fail">
 		<message>ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>
 </workflow-app>
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -38,6 +38,7 @@
        <module>dhp-usage-raw-data-update</module>
        <module>dhp-broker-events</module>
        <module>dhp-doiboost</module>
        <module>dhp-impact-indicators</module>
    </modules>
    <pluginRepositories>