adding short description

2020-10-05 13:54:39 +02:00 · 2020-10-05 13:54:39 +02:00 · 061527f06e
parent 0c12d7bdd8
commit 061527f06e
5 changed files with 109 additions and 25 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java
@ -23,6 +23,74 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import scala.Tuple2;

+/**
+ * Among all the programmes provided in the csv file, selects those in the H2020 framework that have an English title.
+ *
+ * The title is then handled to get the programme description at a certain level. The set of programme titles will then
+ * be used to associate a classification for the programme.
+ *
+ * The programme code describes a hierarchy that can be exploited to provide the classification. To determine the hierarchy
+ * the code can be split by '.'. If the length of the split code is less than or equal to 2 it can be directly used
+ * as the classification: H2020-EU -> Horizon 2020 Framework Programme (It will never be repeated),
+ * H2020-EU.1. -> Excellent science, H2020-EU.2. -> Industrial leadership etc.
+ *
+ * The codes are ordered and for all of them the concatenation of all the titles (from the element in position 1 of
+ * the split code) handled as below is used to create the classification. For example:
+ *
+ *   H2020-EU.1.1       -> Excellent science | European Research Council (ERC)
+ *   from H2020-EU.1. -> Excellent science and H2020-EU.1.1. -> European Research Council (ERC)
+ *
+ *   H2020-EU.3.1.3.1. -> Societal challenges | Health, demographic change and well-being | Treating and managing disease | Treating disease, including developing regenerative medicine
+ *   from H2020-EU.3.       -> Societal challenges,
+ *        H2020-EU.3.1.     -> Health, demographic change and well-being
+ *        H2020-EU.3.1.3    -> Treating and managing disease
+ *        H2020-EU.3.1.3.1. -> Treating disease, including developing regenerative medicine
+ *
+ * The classification up to level three will be split in dedicated variables, while the complete classification will be stored
+ * in a variable called classification and provided as shown above.
+ *
+ * The programme title is not given in a standardized way:
+ *
+ *  - Sometimes the titles of the higher levels in the hierarchy contain the word PRIORITY, other times they do
+ *    not. Since this is not uniform, PRIORITY is removed from the handled titles:
+ *
+ *    H2020-EU.1. -> PRIORITY 'Excellent science'
+ *    H2020-EU.2. -> PRIORITY 'Industrial leadership'
+ *    H2020-EU.3. -> PRIORITY 'Societal challenges'
+ *
+ *    will become
+ *
+ *    H2020-EU.1. -> Excellent science
+ *    H2020-EU.2. -> Industrial leadership
+ *    H2020-EU.3. -> Societal challenges
+ *
+ *  - Sometimes the title of the parent is repeated in the title for the code, but it is not always the case, so, titles
+ *    associated to previous levels in the hierarchy are removed from the code title.
+ *
+ *	  H2020-EU.1.2. -> EXCELLENT SCIENCE - Future and Emerging Technologies (FET)
+ *	  H2020-EU.2.2. -> INDUSTRIAL LEADERSHIP - Access to risk finance
+ *    H2020-EU.3.4. -> SOCIETAL CHALLENGES - Smart, Green And Integrated Transport
+ *
+ *    will become
+ *
+ *    H2020-EU.1.2. -> Future and Emerging Technologies (FET)
+ *    H2020-EU.2.2. -> Access to risk finance
+ *    H2020-EU.3.4. -> Smart, Green And Integrated Transport
+ *
+ *    This holds at all levels in the hierarchy. Hence
+ *
+ *    H2020-EU.2.1.2. -> INDUSTRIAL LEADERSHIP - Leadership in enabling and industrial technologies – Nanotechnologies
+ *
+ *    will become
+ *
+ *    H2020-EU.2.1.2. -> Nanotechnologies
+ *
+ *  - Euratom codes do not follow the pattern of the other programmes: they start with H2020-Euratom- instead of
+ *    H2020-EU. , so they need to be handled by specific code
+ *
+ *
+ *
+ */
 public class PrepareProgramme {

 	private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
@ -107,6 +175,17 @@ public class PrepareProgramme {
 				return csvProgramme;
 			});

+		prepareClassification(h2020Programmes);
+
+		h2020Programmes.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
+				.saveAsTextFile(outputPath);
+
+
+
+	}
+
+
+	private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
 		Object[] codedescription = h2020Programmes
 			.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
 			.collect()
@ -174,16 +253,13 @@ public class PrepareProgramme {
 			}

 		}
-
-		h2020Programmes.map(csvProgramme -> {
+		h2020Programmes.foreach(csvProgramme -> {
 			if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
-				&& !csvProgramme.getCode().equals("H2020-EC"))
+					&& !csvProgramme.getCode().equals("H2020-EC"))
 				csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
 			else
 				csvProgramme.setClassification(map.get(csvProgramme.getCode()));
-			return OBJECT_MAPPER.writeValueAsString(csvProgramme);
-		}).saveAsTextFile(outputPath);
-
+		});
 	}

 	public static <R> Dataset<R> readPath(
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java
@ -24,6 +24,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import scala.Tuple2;

+/**
+ * Selects only the relevant information collected with the projects: project grant agreement, project programme code and
+ * project topic code for the projects that are also collected from OpenAIRE.
+ */
 public class PrepareProjects {

 	private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java
@ -3,11 +3,13 @@ package eu.dnetlib.dhp.actionmanager.project;

 import java.io.Serializable;

+/**
+ * Class to store the grant agreement (code) of the collected projects
+ */
 public class ProjectSubset implements Serializable {

 	private String code;
-	private String topiccode;
-	private String topicdescription;
+

 	public String getCode() {
 		return code;
@ -17,19 +19,4 @@ public class ProjectSubset implements Serializable {
 		this.code = code;
 	}

-	public String getTopiccode() {
-		return topiccode;
-	}
-
-	public void setTopiccode(String topiccode) {
-		this.topiccode = topiccode;
-	}
-
-	public String getTopicdescription() {
-		return topicdescription;
-	}
-
-	public void setTopicdescription(String topicdescription) {
-		this.topicdescription = topicdescription;
-	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java
@ -25,6 +25,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.DbClient;

+/**
+ * Queries the OpenAIRE database to get the grant agreement of projects collected from corda__h2020. The codes collected
+ * are written on HDFS using the ProjectSubset model
+ */
 public class ReadProjectsFromDB implements Closeable {

 	private final DbClient dbClient;
@ -72,8 +76,6 @@ public class ReadProjectsFromDB implements Closeable {
 		try {
 			ProjectSubset p = new ProjectSubset();
 			p.setCode(rs.getString("code"));
-//			p.setTopiccode(rs.getString("optional1"));
-//			p.setTopicdescription(rs.getString("optional2"));
 			return Arrays.asList(p);

 		} catch (final Exception e) {
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java
@ -36,6 +36,21 @@ import eu.dnetlib.dhp.schema.oaf.Project;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import scala.Tuple2;

+/**
+ * Class that makes the ActionSet. To prepare the AS two joins are needed
+ *
+ *  1. join between the collected project subset and the programme extended with the classification on the grant agreement.
+ *     For each entry a
+ *     eu.dnetlib.dhp.Project entity is created and the information about H2020Classification is set together with the
+ *     h2020topiccode variable
+ *  2. join between the output of the previous step and the topic information on the topic code. Each time a match is
+ *     found the h2020topicdescription variable is set.
+ *
+ * To produce one single entry for each project code a step of grouping is needed: each project can be associated to more
+ * than one programme.
+ *
+ *
+ */
 public class SparkAtomicActionJob {
 	private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();