2020-10-05 14:14:39 +02:00
5 changed files with 109 additions and 25 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java
@ -23,6 +23,74 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import scala.Tuple2;
 /**
 * Among all the programmes provided in the CSV file, selects those in the H2020 framework that have an English title.
 *
 * The title is then handled to get the programme description at a certain level. The set of programme titles will then
 * be used to associate a classification for the programme.
 *
 * The programme code describes a hierarchy that can be exploited to provide the classification. To determine the hierarchy
 * the code can be split by '.'. If the length of the split code is less than or equal to 2 it can be directly used
 * as the classification: H2020-EU -> Horizon 2020 Framework Programme (It will never be repeated),
 * H2020-EU.1. -> Excellent science, H2020-EU.2. -> Industrial leadership etc.
 *
 * The codes are ordered and for all of them the concatenation of all the titles (from the element in position 1 of
 * the split code) handled as below is used to create the classification. For example:
 *
 *   H2020-EU.1.1       -> Excellent science | European Research Council (ERC)
 *   from H2020-EU.1. -> Excellent science and H2020-EU.1.1. -> European Research Council (ERC)
 *
 *   H2020-EU.3.1.3.1. -> Societal challenges | Health, demographic change and well-being | Treating and managing disease | Treating disease, including developing regenerative medicine
 *   from H2020-EU.3.       -> Societal challenges,
 *        H2020-EU.3.1.     -> Health, demographic change and well-being
 *        H2020-EU.3.1.3    -> Treating and managing disease
 *        H2020-EU.3.1.3.1. -> Treating disease, including developing regenerative medicine
 *
 * The classification up to level three will be split into dedicated variables, while the complete classification will
 * be stored in a variable called classification and provided as shown above.
 *
 * The programme title is not given in a standardized way:
 *
 *  - Sometimes the title associated to the higher levels in the hierarchy contains the word PRIORITY, other times it
 *    does not. Since this is not uniform, PRIORITY is removed from the handled titles:
 *
 *    H2020-EU.1. -> PRIORITY 'Excellent science'
 *    H2020-EU.2. -> PRIORITY 'Industrial leadership'
 *    H2020-EU.3. -> PRIORITY 'Societal challenges'
 *
 *    will become
 *
 *    H2020-EU.1. -> Excellent science
 *    H2020-EU.2. -> Industrial leadership
 *    H2020-EU.3. -> Societal challenges
 *
 *  - Sometimes the title of the parent is repeated in the title for the code, but this is not always the case. For
 *    this reason the titles associated to previous levels in the hierarchy are removed from the code title.
 *
 *	  H2020-EU.1.2. -> EXCELLENT SCIENCE - Future and Emerging Technologies (FET)
 *	  H2020-EU.2.2. -> INDUSTRIAL LEADERSHIP - Access to risk finance
 *    H2020-EU.3.4. -> SOCIETAL CHALLENGES - Smart, Green And Integrated Transport
 *
 *    will become
 *
 *    H2020-EU.1.2. -> Future and Emerging Technologies (FET)
 *    H2020-EU.2.2. -> Access to risk finance
 *    H2020-EU.3.4. -> Smart, Green And Integrated Transport
 *
 *    This holds at all levels in the hierarchy. Hence
 *
 *    H2020-EU.2.1.2. -> INDUSTRIAL LEADERSHIP - Leadership in enabling and industrial technologies – Nanotechnologies
 *
 *    will become
 *
 *    H2020-EU.2.1.2. -> Nanotechnologies
 *
 *  - Euratom codes do not follow the pattern of the other programmes: they are given as H2020-Euratom- instead of
 *    H2020-EU. , so specific code is needed to handle them.
 *
 *
 *
 */
 public class PrepareProgramme {
 	private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
@ -107,6 +175,17 @@ public class PrepareProgramme {
 				return csvProgramme;
 			});
 		prepareClassification(h2020Programmes);
 		h2020Programmes.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
 				.saveAsTextFile(outputPath);
 	}
 	private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
 		Object[] codedescription = h2020Programmes
 			.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
 			.collect()
@ -174,16 +253,13 @@ public class PrepareProgramme {
 			}
 		}
-
+		h2020Programmes.foreach(csvProgramme -> {
 		h2020Programmes.map(csvProgramme -> {
 			if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
 					&& !csvProgramme.getCode().equals("H2020-EC"))
 				csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
 			else
 				csvProgramme.setClassification(map.get(csvProgramme.getCode()));
-			return OBJECT_MAPPER.writeValueAsString(csvProgramme);
+		});
 		}).saveAsTextFile(outputPath);
 	}
 	public static <R> Dataset<R> readPath(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java
@ -24,6 +24,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import scala.Tuple2;
 /**
 * Selects only the relevant information collected with the projects: project grant agreement, project programme code and
 * project topic code for the projects that are also collected from OpenAIRE.
 */
 public class PrepareProjects {
 	private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java
@ -3,11 +3,13 @@ package eu.dnetlib.dhp.actionmanager.project;
 import java.io.Serializable;
 /**
 * Class to store the grant agreement (code) of the collected projects
 */
 public class ProjectSubset implements Serializable {
 	private String code;
-	private String topiccode;
+
 	private String topicdescription;
 	public String getCode() {
 		return code;
@ -17,19 +19,4 @@ public class ProjectSubset implements Serializable {
 		this.code = code;
 	}
 	public String getTopiccode() {
 		return topiccode;
 	}
 	public void setTopiccode(String topiccode) {
 		this.topiccode = topiccode;
 	}
 	public String getTopicdescription() {
 		return topicdescription;
 	}
 	public void setTopicdescription(String topicdescription) {
 		this.topicdescription = topicdescription;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java
@ -25,6 +25,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.DbClient;
 /**
 * Queries the OpenAIRE database to get the grant agreement of projects collected from corda__h2020. The collected codes
 * are written on HDFS using the ProjectSubset model.
 */
 public class ReadProjectsFromDB implements Closeable {
 	private final DbClient dbClient;
@ -72,8 +76,6 @@ public class ReadProjectsFromDB implements Closeable {
 		try {
 			ProjectSubset p = new ProjectSubset();
 			p.setCode(rs.getString("code"));
 //			p.setTopiccode(rs.getString("optional1"));
 //			p.setTopicdescription(rs.getString("optional2"));
 			return Arrays.asList(p);
 		} catch (final Exception e) {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java
@ -36,6 +36,21 @@ import eu.dnetlib.dhp.schema.oaf.Project;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import scala.Tuple2;
 /**
 * Class that makes the ActionSet. To prepare the AS two joins are needed
 *
 *  1. join between the collected project subset and the programme extended with the classification on the grant agreement.
 *     For each entry a
 *     eu.dnetlib.dhp.schema.oaf.Project entity is created and the information about H2020Classification is set together with the
 *     h2020topiccode variable
 *  2. join between the output of the previous step and the topic information on the topic code. Each time a match is
 *     found the h2020topicdescription variable is set.
 *
 * To produce one single entry for each project code a grouping step is needed: each project can be associated to more
 * than one programme.
 *
 *
 */
 public class SparkAtomicActionJob {
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();