From 061527f06e677b75451fd4374b9d2b88b6c85711 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 5 Oct 2020 13:54:39 +0200 Subject: [PATCH] adding short description --- .../project/PrepareProgramme.java | 88 +++++++++++++++++-- .../project/PrepareProjects.java | 4 + .../actionmanager/project/ProjectSubset.java | 21 +---- .../project/ReadProjectsFromDB.java | 6 +- .../project/SparkAtomicActionJob.java | 15 ++++ 5 files changed, 109 insertions(+), 25 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index 54d68e86a..2cf023fb9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -23,6 +23,74 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; +/** + * Among all the programmes provided in the csv file, selects those in H2020 framework that have an English title. + * + * The title is then handled to get the programme description at a certain level. The set of programme titles will then + * be used to associate a classification for the programme. + * + * The programme code describes a hierarchy that can be exploited to provide the classification. To determine the hierarchy + * the code can be split by '.'. If the length of the split code is less than or equal to 2 it can be directly used + * as the classification: H2020-EU -> Horizon 2020 Framework Programme (It will never be repeated), + * H2020-EU.1. -> Excellent science, H2020-EU.2. -> Industrial leadership etc. 
+ * + * The codes are ordered and for all of them the concatenation of all the titles (from the element in position 1 of + * the split code) handled as below is used to create the classification. For example: + * + * H2020-EU.1.1 -> Excellent science | European Research Council (ERC) + * from H2020-EU.1. -> Excellent science and H2020-EU.1.1. -> European Research Council (ERC) + * + * H2020-EU.3.1.3.1. -> Societal challenges | Health, demographic change and well-being | Treating and managing disease | Treating disease, including developing regenerative medicine + * from H2020-EU.3. -> Societal challenges, + * H2020-EU.3.1. -> Health, demographic change and well-being + * H2020-EU.3.1.3 -> Treating and managing disease + * H2020-EU.3.1.3.1. -> Treating disease, including developing regenerative medicine + * + * The classification up to level three will be split into dedicated variables, while the complete classification will be stored + * in a variable called classification and provided as shown above. + * + * The programme title is not given in a standardized way: + * + * - Sometimes the title associated to the higher level in the hierarchy contains the word PRIORITY, other times it does + * not. Since it is not uniform, we removed PRIORITY from the handled titles: + * + * H2020-EU.1. -> PRIORITY 'Excellent science' + * H2020-EU.2. -> PRIORITY 'Industrial leadership' + * H2020-EU.3. -> PRIORITY 'Societal challenges' + * + * will become + * + * H2020-EU.1. -> Excellent science + * H2020-EU.2. -> Industrial leadership + * H2020-EU.3. -> Societal challenges + * + * - Sometimes the title of the parent is repeated in the title for the code, but it is not always the case, so, titles + * associated to previous levels in the hierarchy are removed from the code title. + * + * H2020-EU.1.2. -> EXCELLENT SCIENCE - Future and Emerging Technologies (FET) + * H2020-EU.2.2. -> INDUSTRIAL LEADERSHIP - Access to risk finance + * H2020-EU.3.4. 
-> SOCIETAL CHALLENGES - Smart, Green And Integrated Transport + * + * will become + * + * H2020-EU.1.2. -> Future and Emerging Technologies (FET) + * H2020-EU.2.2. -> Access to risk finance + * H2020-EU.3.4. -> Smart, Green And Integrated Transport + * + * This holds at all levels in the hierarchy. Hence + * + * H2020-EU.2.1.2. -> INDUSTRIAL LEADERSHIP - Leadership in enabling and industrial technologies – Nanotechnologies + * + * will become + * + * H2020-EU.2.1.2. -> Nanotechnologies + * + * - Euratom is not given in the way the other programmes are: H2020-EU. but H2020-Euratom- . So we need to write + * specific code for it + * + * + * + */ public class PrepareProgramme { private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); @@ -107,6 +175,17 @@ public class PrepareProgramme { return csvProgramme; }); + prepareClassification(h2020Programmes); + + h2020Programmes.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme)) + .saveAsTextFile(outputPath); + + + + } + + + private static void prepareClassification(JavaRDD h2020Programmes) { Object[] codedescription = h2020Programmes .map(value -> new Tuple2<>(value.getCode(), value.getTitle())) .collect() @@ -174,16 +253,13 @@ public class PrepareProgramme { } } - - h2020Programmes.map(csvProgramme -> { + h2020Programmes.foreach(csvProgramme -> { if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom") - && !csvProgramme.getCode().equals("H2020-EC")) + && !csvProgramme.getCode().equals("H2020-EC")) csvProgramme.setClassification(map.get(csvProgramme.getCode() + ".")); else csvProgramme.setClassification(map.get(csvProgramme.getCode())); - return OBJECT_MAPPER.writeValueAsString(csvProgramme); - }).saveAsTextFile(outputPath); - + }); } public static Dataset readPath( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java index c53fb11ca..e5cae0ff7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java @@ -24,6 +24,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; +/** + * Selects only the relevant information collected with the projects: project grant agreement, project programme code and + * project topic code for the projects that are also collected from OpenAIRE. + */ public class PrepareProjects { private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java index 03654da13..c51c10876 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ProjectSubset.java @@ -3,11 +3,13 @@ package eu.dnetlib.dhp.actionmanager.project; import java.io.Serializable; +/** + * Class to store the grant agreement (code) of the collected projects + */ public class ProjectSubset implements Serializable { private String code; - private String topiccode; - private String topicdescription; + public String getCode() { return code; @@ -17,19 +19,4 @@ public class ProjectSubset implements Serializable { this.code = code; } - public String getTopiccode() { - return topiccode; - } - - public void setTopiccode(String topiccode) { - this.topiccode = topiccode; - } - - public String getTopicdescription() { - return topicdescription; - } - - public void setTopicdescription(String 
topicdescription) { - this.topicdescription = topicdescription; - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java index 148f7041a..2bba9fb60 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java @@ -25,6 +25,10 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; +/** + * Queries the OpenAIRE database to get the grant agreement of projects collected from corda__h2020. The codes collected + * are written on HDFS using the ProjectSubset model + */ public class ReadProjectsFromDB implements Closeable { private final DbClient dbClient; @@ -72,8 +76,6 @@ public class ReadProjectsFromDB implements Closeable { try { ProjectSubset p = new ProjectSubset(); p.setCode(rs.getString("code")); -// p.setTopiccode(rs.getString("optional1")); -// p.setTopicdescription(rs.getString("optional2")); return Arrays.asList(p); } catch (final Exception e) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index 6bf784bbe..f2375e799 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -36,6 +36,21 @@ import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; +/** + * Class that makes the ActionSet. 
To prepare the AS (ActionSet), two joins are needed + * + * 1. join between the collected project subset and the programme extended with the classification on the grant agreement. + * For each entry a + * eu.dnetlib.dhp.schema.oaf.Project entity is created and the information about H2020Classification is set together with the + * h2020topiccode variable + * 2. join between the output of the previous step and the topic information on the topic code. Each time a match is + * found the h2020topicdescription variable is set. + * + * To produce one single entry for each project code a step of grouping is needed: each project can be associated to more + * than one programme. + * + * + */ public class SparkAtomicActionJob { private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();