Merge branch '8172_impact_indicators_workflow' of https://code-repo.d4science.org/D-Net/dnet-hadoop into 8172_impact_indicators_workflow

2023-04-28 13:23:49 +03:00 · 2023-04-28 13:23:49 +03:00 · a98da54896
parent 09485fbee3 614cc1089b
commit a98da54896
19 changed files with 474 additions and 98 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -9,6 +9,7 @@ import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;

+import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

-import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
+import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
 import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
@ -40,7 +41,8 @@ import scala.Tuple2;
 */
 public class SparkAtomicActionScoreJob implements Serializable {

-	private static final String DOI = "doi";
+	private static final String RESULT = "result";
+	private static final String PROJECT = "project";
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

@ -56,18 +58,17 @@ public class SparkAtomicActionScoreJob implements Serializable {

 		parser.parseArgument(args);

-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-
+		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

 		final String inputPath = parser.get("inputPath");
-		log.info("inputPath {}: ", inputPath);
+		log.info("inputPath: {}", inputPath);

 		final String outputPath = parser.get("outputPath");
-		log.info("outputPath {}: ", outputPath);
+		log.info("outputPath: {}", outputPath);
+
+		final String targetEntity = parser.get("targetEntity");
+		log.info("targetEntity: {}", targetEntity);

 		SparkConf conf = new SparkConf();

@ -76,17 +77,48 @@ public class SparkAtomicActionScoreJob implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				removeOutputDir(spark, outputPath);
+
+				// follow different procedures for different target entities
+				switch (targetEntity) {
+					case RESULT:
 						prepareResults(spark, inputPath, outputPath);
-			});
+						break;
+					case PROJECT:
+						prepareProjects(spark, inputPath, outputPath);
+						break;
+					default:
+						throw new RuntimeException("Unknown target entity: " + targetEntity);
+				}
+			}
+		);
+	}
+
+	/**
+	 * Creates the action set carrying the BIP! impact indicators of projects.
+	 * Every input record is turned into a {@link Project} holding only its id and measures,
+	 * wrapped in an {@link AtomicAction} and written as a (payload class name, JSON) pair
+	 * to a Hadoop sequence file.
+	 *
+	 * @param spark the spark session
+	 * @param inputPath the path of the project scores (one JSON record per line)
+	 * @param outputPath the path where the action set is stored
+	 */
+	private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) {
+
+		// read input bip project scores
+		Dataset<BipProjectModel> projectScores = readPath(spark, inputPath, BipProjectModel.class);
+
+		projectScores
+			.map((MapFunction<BipProjectModel, Project>) bipProjectScores -> {
+				// only the identifier and the measures are set on the project
+				Project project = new Project();
+				project.setId(bipProjectScores.getProjectId());
+				project.setMeasures(bipProjectScores.toMeasures());
+				return project;
+			}, Encoders.bean(Project.class))
+			.toJavaRDD()
+			.map(p -> new AtomicAction<>(Project.class, p))
+			.mapToPair(
+				aa -> new Tuple2<>(
+					new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
+
+	}

 	private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) {

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

-		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
+		JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
 			.textFile(bipScorePath)
-			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
+			.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));

 		Dataset<BipScore> bipScores = spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
@ -159,12 +191,4 @@ public class SparkAtomicActionScoreJob implements Serializable {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}

-	public static <R> Dataset<R> readPath(
-		SparkSession spark, String inputPath, Class<R> clazz) {
-		return spark
-			.read()
-			.textFile(inputPath)
-			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
-	}
-
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java
@ -0,0 +1,69 @@
+package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
+
+import com.opencsv.bean.CsvBindByPosition;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+import eu.dnetlib.dhp.schema.oaf.Measure;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static eu.dnetlib.dhp.actionmanager.Constants.*;
+
+/**
+ * Bean holding the BIP! impact indicators of a single project, as read from the input records
+ * of the project action set job. The indicator values are kept as strings because they are
+ * copied verbatim into the value of the produced {@link KeyValue}.
+ * Accessors and constructors are generated by Lombok.
+ */
+@NoArgsConstructor
+@AllArgsConstructor
+@Getter
+@Setter
+public class BipProjectModel {
+
+    /** identifier of the project the indicators refer to */
+    private String projectId;
+
+    private String numOfInfluentialResults;
+
+    private String numOfPopularResults;
+
+    private String totalImpulse;
+
+    private String totalCitationCount;
+
+    /**
+     * Builds a measure with the given id and a single unit.
+     * Each project bip measure has exactly one value, hence one key-value pair,
+     * whose key is always "score".
+     *
+     * @param measureId the id of the measure
+     * @param measureValue the value stored under the "score" key
+     * @return the measure wrapping the given value
+     */
+    private Measure createMeasure(String measureId, String measureValue) {
+
+        KeyValue kv = new KeyValue();
+        kv.setKey("score");
+        kv.setValue(measureValue);
+        kv.setDataInfo(
+            OafMapperUtils.dataInfo(
+                false,
+                UPDATE_DATA_INFO_TYPE,
+                true,
+                false,
+                OafMapperUtils.qualifier(
+                    UPDATE_MEASURE_BIP_CLASS_ID,
+                    UPDATE_CLASS_NAME,
+                    ModelConstants.DNET_PROVENANCE_ACTIONS,
+                    ModelConstants.DNET_PROVENANCE_ACTIONS),
+                "")
+        );
+
+        Measure measure = new Measure();
+        measure.setId(measureId);
+        measure.setUnit(Collections.singletonList(kv));
+        return measure;
+    }
+
+    /**
+     * Converts the indicators of this project to measures, one per indicator;
+     * the id of each measure is the name of the corresponding field.
+     *
+     * @return the four measures, in the order: numOfInfluentialResults, numOfPopularResults,
+     *         totalImpulse, totalCitationCount
+     */
+    public List<Measure> toMeasures() {
+        return Arrays.asList(
+            createMeasure("numOfInfluentialResults", numOfInfluentialResults),
+            createMeasure("numOfPopularResults", numOfPopularResults),
+            createMeasure("totalImpulse", totalImpulse),
+            createMeasure("totalCitationCount", totalCitationCount)
+        );
+    }
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java
@ -1,5 +1,7 @@

-package eu.dnetlib.dhp.actionmanager.bipmodel;
+package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers;
+
+import eu.dnetlib.dhp.actionmanager.bipmodel.Score;

 import java.io.Serializable;
 import java.util.ArrayList;
@ -11,9 +13,9 @@ import java.util.List;
 * Only needed for deserialization purposes
 */

-public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
+public class BipResultModel extends HashMap<String, List<Score>> implements Serializable {

-	public BipDeserialize() {
+	public BipResultModel() {
 		super();
 	}

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java
@ -24,7 +24,7 @@ import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

-import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
+import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
 import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
@ -82,9 +82,9 @@ public class PrepareBipFinder implements Serializable {

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

-		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
+		JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
 			.textFile(inputPath)
-			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
+			.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));

 		spark
 			.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json
@ -16,5 +16,11 @@
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
+  },
+  {
+    "paramName": "te",
+    "paramLongName": "targetEntity",
+    "paramDescription": "the type of target entity to be enriched; currently supported one of { 'result', 'project' }",
+    "paramRequired": true
  }
 ]
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -6,8 +6,9 @@ import static org.junit.jupiter.api.Assertions.*;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.List;

+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Project;
 import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
@ -27,7 +28,6 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.schema.action.AtomicAction;
-import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;

 public class SparkAtomicActionScoreJobTest {
@ -37,8 +37,11 @@ public class SparkAtomicActionScoreJobTest {
 	private static SparkSession spark;

 	private static Path workingDir;
-	private static final Logger log = LoggerFactory
-		.getLogger(SparkAtomicActionScoreJobTest.class);
+
+	private final static String RESULT = "result";
+	private final static String PROJECT = "project";
+
+	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class);

 	@BeforeAll
 	public static void beforeAll() throws IOException {
@ -69,29 +72,31 @@ public class SparkAtomicActionScoreJobTest {
 		spark.stop();
 	}

-	@Test
-	void testMatch() throws Exception {
-		String bipScoresPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
-			.getPath();
-
-		SparkAtomicActionScoreJob
-			.main(
+	private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception {
+		SparkAtomicActionScoreJob.main(
 			new String[] {
-					"-isSparkSessionManaged",
-					Boolean.FALSE.toString(),
-					"-inputPath",
+					"-isSparkSessionManaged", Boolean.FALSE.toString(),
+					"-inputPath", inputPath,
+					"-outputPath", outputPath,
+					"-targetEntity", targetEntity,
+			}
+		);
+	}
+	@Test
+	void testResultScores() throws Exception {
+		final String targetEntity = RESULT;
+		String inputResultScores = getClass()
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json")
+			.getPath();
+		String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet";

-					bipScoresPath,
-
-					"-outputPath",
-					workingDir.toString() + "/actionSet"
-				});
+		// execute the job to generate the action sets for result scores
+		runJob(inputResultScores, outputPath, targetEntity);

 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

 		JavaRDD<Result> tmp = sc
-			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
+			.sequenceFile(outputPath, Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Result) aa.getPayload()));

@ -140,4 +145,61 @@ public class SparkAtomicActionScoreJobTest {

 	}

+	@Test
+	void testProjectScores() throws Exception {
+		String targetEntity = PROJECT;
+		String inputResultScores = getClass()
+				.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json")
+				.getPath();
+		String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet";
+
+		// execute the job to generate the action sets for project scores
+		runJob(inputResultScores, outputPath, PROJECT);
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Project> projects = sc
+				.sequenceFile(outputPath, Text.class, Text.class)
+				.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+				.map(aa -> ((Project) aa.getPayload()));
+
+		// test the number of projects
+		assertEquals(4, projects.count());
+
+		String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833";
+
+		// count that the project with id testProjectId is present
+		assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count());
+
+		projects.filter(row -> row.getId().equals(testProjectId))
+			.flatMap(r -> r.getMeasures().iterator())
+			.foreach(m -> {
+				log.info(m.getId() + " " + m.getUnit());
+
+				// ensure that only one score is present for each bip impact measure
+				assertEquals(1, m.getUnit().size());
+
+				KeyValue kv = m.getUnit().get(0);
+
+				// ensure that the correct key is provided, i.e. score
+				assertEquals("score", kv.getKey());
+
+				switch(m.getId()) {
+					case "numOfInfluentialResults":
+						assertEquals("0", kv.getValue());
+						break;
+					case "numOfPopularResults":
+						assertEquals("1", kv.getValue());
+						break;
+					case "totalImpulse":
+						assertEquals("25", kv.getValue());
+						break;
+					case "totalCitationCount":
+						assertEquals("43", kv.getValue());
+						break;
+					default:
+						fail("Unknown measure id in the context of projects");
+				}
+			});
+	}
 }
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json
@ -0,0 +1,4 @@
+{"projectId":"40|nsf_________::d93e50d22374a1cf59f6a232413ea027","numOfInfluentialResults":0,"numOfPopularResults":10,"totalImpulse":181,"totalCitationCount":235}
+{"projectId":"40|nih_________::1c93debc7085e440f245fbe70b2e8b21","numOfInfluentialResults":14,"numOfPopularResults":17,"totalImpulse":1558,"totalCitationCount":4226}
+{"projectId":"40|nih_________::c02a8233e9b60f05bb418f0c9b714833","numOfInfluentialResults":0,"numOfPopularResults":1,"totalImpulse":25,"totalCitationCount":43}
+{"projectId":"40|corda_______::d91dcf3a87dd7f72248fab0b8a4ba273","numOfInfluentialResults":2,"numOfPopularResults":3,"totalImpulse":78,"totalCitationCount":178}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json
--- a/dhp-workflows/dhp-impact-indicators/README.md
+++ b/dhp-workflows/dhp-impact-indicators/README.md
@ -1,4 +1,4 @@
-# Ranking Workflow for Openaire Publications
+# Ranking Workflow for OpenAIRE Publications

 This project contains the files for running a paper ranking workflow on the OpenAIRE graph using Apache Oozie.
 All scripts are written in Python and the project setup follows the typical Oozie workflow structure:
@ -7,17 +7,15 @@ All scripts are written in python and the project setup follows the typical oozi
 - a job.properties file specifying parameter values for the parameters used by the workflow
 - a set of python scripts used by the workflow

-**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker.
+**NOTE**: the workflow depends on the external library of ranking scripts called [BiP! Ranker](https://github.com/athenarc/Bip-Ranker).
 You can check out a specific tag/release of BiP! Ranker using Maven, as described in the following section.

-## Check out a specific tag/release of BIP-Ranker
+## Build and deploy

-* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out.
-
-* Then, use maven to perform the checkout:
+Use the following command for packaging:

 ```
-mvn scm:checkout
+mvn package  -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests
 ```

-* The code should be visible under `src/main/bip-ranker` folder.
+Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use.
--- a/dhp-workflows/dhp-impact-indicators/pom.xml
+++ b/dhp-workflows/dhp-impact-indicators/pom.xml
@ -5,9 +5,8 @@
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
-        <artifactId>dhp</artifactId>
+        <artifactId>dhp-workflows</artifactId>
        <version>1.2.5-SNAPSHOT</version>
-        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>dhp-impact-indicators</artifactId>
@ -16,6 +15,9 @@
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+
+        <!--   Use this property to fetch a specific tag      -->
+        <bip.ranker.tag>v1.0.0</bip.ranker.tag>
    </properties>

    <scm>
@ -32,10 +34,29 @@
                <configuration>
                    <connectionType>connection</connectionType>
                    <scmVersionType>tag</scmVersionType><!-- 'branch' can also be provided here -->
-                    <scmVersion>v1.0.0</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
-                    <checkoutDirectory>${project.build.directory}/../src/main/bip-ranker</checkoutDirectory>
+                    <scmVersion>${bip.ranker.tag}</scmVersion><!-- in case of scmVersionType == 'branch', this field points to the branch name -->
+                    <checkoutDirectory>${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/bip-ranker</checkoutDirectory>
                </configuration>
+                <executions>
+                    <execution>
+                        <id>checkout-bip-ranker</id>
+                        <phase>prepare-package</phase>
+                        <goals>
+                            <goal>checkout</goal>
+                        </goals>
+                    </execution>
+                </executions>
            </plugin>
        </plugins>
    </build>
+
+    <dependencies>
+        <dependency>
+            <groupId>eu.dnetlib.dhp</groupId>
+            <artifactId>dhp-aggregation</artifactId>
+            <version>${projectVersion}</version>
+            <scope>compile</scope>
+        </dependency>
+    </dependencies>
+
 </project>
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties
@ -90,3 +90,6 @@ oozie.wf.application.path=${wfAppPath}
 # Path where the final output should be?
 actionSetOutputPath=${workflowDataDir}/bip_actionsets/

+# The directory to store project impact indicators
+projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators
+
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py
@ -0,0 +1,108 @@
+import sys
+from pyspark.sql import SparkSession
+from pyspark import SparkConf, SparkContext
+import pyspark.sql.functions as F
+from pyspark.sql.types import StringType, IntegerType, StructType, StructField
+
+if len(sys.argv) < 8:  # the script name plus the 7 positional arguments listed in the usage message
+    print("Usage: projects_impact.py <relations_folder> <influence_file> <popularity_file> <cc_file> <impulse_file> <num_partitions> <output_dir>")
+    sys.exit(-1)
+
+appName = 'Project Impact Indicators'
+conf = SparkConf().setAppName(appName)
+sc = SparkContext(conf = conf)
+spark = SparkSession.builder.appName(appName).getOrCreate()
+sc.setLogLevel('OFF')
+
+# positional input parameters, in the order given by the usage message above
+relations_fd = sys.argv[1]
+influence_fd = sys.argv[2]
+popularity_fd = sys.argv[3]
+cc_fd = sys.argv[4]
+impulse_fd = sys.argv[5]
+num_partitions = int(sys.argv[6])
+output_dir = sys.argv[7]
+
+# schema applied to every impact indicator file (read below as tab-delimited csv without header)
+impact_files_schema = StructType([
+    StructField('resultId', StringType(), False),
+    StructField('score', IntegerType(), False),
+    StructField('class', StringType(), False),
+])
+
+# impact indicators as tuples of (indicator name, input file, column of the file to use as indicator value)
+impact_indicators = [
+    ('influence', influence_fd, 'class'),
+    ('popularity', popularity_fd, 'class'),
+    ('impulse', impulse_fd, 'score'),
+    ('citation_count', cc_fd, 'score')
+]
+
+'''
+    * Read impact indicator file and return a dataframe with the following schema:
+    *   resultId: String
+    *   indicator_name: Integer
+'''
+def read_df(fd, indicator_name, column_name):
+    return spark.read.schema(impact_files_schema)\
+        .option('delimiter', '\t')\
+        .option('header', False)\
+        .csv(fd)\
+        .select('resultId', F.col(column_name).alias(indicator_name))\
+        .repartition(num_partitions, 'resultId')
+
+# Print dataframe schema, first 5 rows, and count
+def print_df(df):
+    df.show(50)
+    df.printSchema()
+    print(df.count())
+
+# Returns a column expression that keeps the value of `column`, unless it is equal to `value`, in which case it is null (not called in this script)
+def set_class_value_to_null(column, value):
+    return F.when(column != value, column).otherwise(F.lit(None))
+
+# load and filter Project-to-Result relations
+print("Reading relations")
+relations = spark.read.json(relations_fd)\
+			.select(F.col('source').alias('projectId'), F.col('target').alias('resultId'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\
+			.where( (F.col('relClass') == 'produces') \
+				& (F.col('deletedbyinference') == "false")\
+                & (F.col('invisible') == "false"))\
+			.drop('deletedbyinference')\
+			.drop('invisible')\
+            .drop('relClass')\
+			.repartition(num_partitions, 'resultId')
+
+for indicator_name, fd, column_name in impact_indicators:
+
+    print("Reading {} '{}' field from file".format(indicator_name, column_name))
+    df = read_df(fd, indicator_name, column_name)
+
+    # sets a zero value to the indicator column if the value is C5
+    if (column_name == 'class'):
+        df = df.withColumn(indicator_name, F.when(F.col(indicator_name).isin("C5"), 0).otherwise(1))
+
+    # print_df(df)
+
+    print("Joining {} to relations".format(indicator_name))
+
+    # NOTE: we use inner join because we want to keep only the results that have an impact score
+    # also note that all impact scores have the same set of results
+    relations = relations.join(df, 'resultId', 'inner')\
+        .repartition(num_partitions, 'resultId')
+
+# uncomment to print non-null values count for each indicator
+# for indicator_name, fd, column_name in impact_indicators:
+#     print("Counting non null values for {}".format(indicator_name))
+#     print(relations.filter(F.col(indicator_name).isNotNull()).count())
+
+# sum the impact indicator values for each project
+relations.groupBy('projectId')\
+    .agg(\
+        F.sum('influence').alias('numOfInfluentialResults'),\
+        F.sum('popularity').alias('numOfPopularResults'),\
+        F.sum('impulse').alias('totalImpulse'),\
+        F.sum('citation_count').alias('totalCitationCount')\
+    )\
+    .write.mode("overwrite")\
+    .json(output_dir, compression="gzip")
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@ -15,6 +15,8 @@
 			<case to="map-openaire-to-doi">${resume eq "map-ids"}</case> 
 			<case to="map-scores-to-dois">${resume eq "map-scores"}</case> 
 			<case to="create-openaire-ranking-graph">${resume eq "start"}</case>
+			<case to="project-impact-indicators">${resume eq "projects-impact"}</case>
+
 			<!-- TODO: add action set creation here -->
 			<default to="create-openaire-ranking-graph" />
 		</switch>
@ -33,7 +35,6 @@
 				<delete path="${synonymFolder}"/>
 			</prepare>
            		
-            		
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -88,7 +89,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>

-            		
 			<!-- using configs from an example on openaire -->
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -130,7 +130,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>
            		
-            		
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -179,7 +178,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>

-            		
 			<!-- using configs from an example on openaire -->
 			<master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -334,7 +332,7 @@
 			<!-- This should give the machine/root of the hdfs -->
 			<name-node>${nameNode}</name-node>
            		
-            		<!-- Exec is needed foor shell comands - points to type of shell command -->
+			<!-- Exec is needed for shell commands - points to type of shell command -->
 			<exec>/usr/bin/bash</exec>
 			<!-- name of script to run -->
 			<argument>get_ranking_files.sh</argument>
@ -475,7 +473,6 @@
 				<delete path="${synonymFolder}"/>
 			</prepare>
            		
-            		
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -518,7 +515,6 @@
 			<!-- This should give the machine/root of the hdfs, serafeim has provided a link with the required job properties -->
 			<name-node>${nameNode}</name-node>

-            		
            <!-- using configs from an example on openaire --> 
            <master>yarn-cluster</master>
 			<mode>cluster</mode>
@ -560,19 +556,17 @@

 	<action name="deleteOutputPathForActionSet">
        <fs>
-            <delete path="${actionSetOutputPath}"/>
-            <mkdir path="${actionSetOutputPath}"/>
-            <!--
-			<delete path="${workingDir}"/>
-            <mkdir path="${workingDir}"/>
-			--> 
+            <delete path="${actionSetOutputPath}/results/"/>
+			<delete path="${actionSetOutputPath}/projects/"/>
+
+			<mkdir path="${actionSetOutputPath}/results/"/>
+			<mkdir path="${actionSetOutputPath}/projects/"/>
 		</fs>
-        <ok to="createActionSet"/>
+        <ok to="createActionSetForResults"/>
        <error to="actionset-delete-fail"/>
    </action>

-
-    <action name="createActionSet">
+    <action name="createActionSetForResults">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
@ -590,13 +584,90 @@
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--inputPath</arg><arg>${bipScorePath}</arg>
-            <arg>--outputPath</arg><arg>${actionSetOutputPath}</arg>
+            <arg>--outputPath</arg><arg>${actionSetOutputPath}/results/</arg>
+			<arg>--targetEntity</arg><arg>result</arg>
 		</spark>
-        <ok to="end"/>
+        <ok to="project-impact-indicators"/>
        <error to="actionset-creation-fail"/>
    </action>

+	<action name="project-impact-indicators">
+		<!-- Spark action running the PySpark script projects_impact.py, which sums the result impact indicators per project -->
+		<spark xmlns="uri:oozie:spark-action:0.2">
+			<!-- Job tracker, taken from the jobTracker workflow property -->
+			<job-tracker>${jobTracker}</job-tracker>
+			<!-- Name node of the hdfs, taken from the nameNode workflow property -->
+			<name-node>${nameNode}</name-node>
+			<!-- Spark master and deploy mode -->
+			<master>yarn-cluster</master>
+			<mode>cluster</mode>

+			<!-- Name of the Spark job -->
+			<name>Project Impact Indicators</name>
+			<!-- Script to run; it is shipped with the job through the file element below -->
+			<jar>projects_impact.py</jar>
+			<!-- Spark configuration options, mostly adapted from other dhp workflows -->
+			<spark-opts>--executor-memory 18G --executor-cores 4 --driver-memory 10G
+				--master yarn
+				--deploy-mode cluster
+				--conf spark.sql.shuffle.partitions=7680
+				--conf spark.extraListeners=${spark2ExtraListeners}
+				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}</spark-opts>
+
+			<!-- Positional script arguments, in the order in which projects_impact.py reads them -->
+
+			<!-- graph data folder from which to read relations -->
+			<arg>${openaireDataInput}/relations</arg>
+
+			<!-- input files with impact indicators for results, passed as: influence, popularity, citation count, impulse -->
+			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
+			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
+			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
+			<arg>${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
+
+			<!-- number of partitions to be used on joins -->
+			<arg>7680</arg>
+
+			<arg>${projectImpactIndicatorsOutput}</arg>
+
+			<!-- Ships the script from the workflow application path (wfAppPath) under the name projects_impact.py -->
+			<file>${wfAppPath}/projects_impact.py#projects_impact.py</file>
+		</spark>
+
+		<!-- On success, create the action set for projects -->
+		<ok to="createActionSetForProjects" />
+
+		<!-- On error, kill the workflow with the dedicated failure message -->
+		<error to="project-impact-indicators-fail" />
+
+	</action>
+
+	<action name="createActionSetForProjects"> <!-- runs SparkAtomicActionScoreJob with targetEntity=project on the project indicators stored in projectImpactIndicatorsOutput -->
+		<spark xmlns="uri:oozie:spark-action:0.2">
+			<master>yarn</master>
+			<mode>cluster</mode>
+			<name>Produces the atomic action with the bip finder scores for projects</name>
+			<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
+			<jar>dhp-aggregation-${projectVersion}.jar</jar>
+			<spark-opts>
+				--executor-memory=${sparkExecutorMemory}
+				--executor-cores=${sparkExecutorCores}
+				--driver-memory=${sparkDriverMemory}
+				--conf spark.extraListeners=${spark2ExtraListeners}
+				--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+				--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+				--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+				--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+			</spark-opts>
+			<arg>--inputPath</arg><arg>${projectImpactIndicatorsOutput}</arg>
+			<arg>--outputPath</arg><arg>${actionSetOutputPath}/projects/</arg>
+			<arg>--targetEntity</arg><arg>project</arg>
+		</spark>
+		<ok to="end"/>
+		<error to="actionset-project-creation-fail"/>
+	</action>

 	<!-- TODO: end the workflow-->
 		
@ -641,7 +712,14 @@
 	</kill>	

 	<kill name="actionset-creation-fail">
-		<message>ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+		<message>ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 	</kill>

+	<kill name="project-impact-indicators-fail">
+		<message>Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+	</kill>
+
+	<kill name="actionset-project-creation-fail">
+		<message>ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+	</kill>
 </workflow-app>
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@ -38,6 +38,7 @@
        <module>dhp-usage-raw-data-update</module>
        <module>dhp-broker-events</module>
        <module>dhp-doiboost</module>
+        <module>dhp-impact-indicators</module>
    </modules>

    <pluginRepositories>