Adding H2020 Classification, topic code and topic description to H2020 projects #46
|
@ -23,6 +23,74 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Among all the programmes provided in the csv file, selects those in H2020 framework that have an english title.
|
||||||
|
*
|
||||||
|
* The title is then handled to get the programme description at a certain level. The set of programme titles will then
|
||||||
|
* be used to associate a classification for the programme.
|
||||||
|
*
|
||||||
|
* The programme code describes a hierarchy that can be exploited to provide the classification. To determine the hierarchy
|
||||||
|
* the code can be split by '.'. If the length of the split code is less than or equal to 2 it can be directly used
|
||||||
|
* as the classification: H2020-EU -> Horizon 2020 Framework Programme (It will never be repeated),
|
||||||
|
* H2020-EU.1. -> Excellent science, H2020-EU.2. -> Industrial leadership etc.
|
||||||
|
*
|
||||||
|
* The codes are ordered and for all of them the concatenation of all the titles (from the element in position 1 of
|
||||||
|
* the split code) handled as below is used to create the classification. For example:
|
||||||
|
*
|
||||||
|
* H2020-EU.1.1 -> Excellent science | European Research Council (ERC)
|
||||||
|
* from H2020-EU.1. -> Excellent science and H2020-EU.1.1. -> European Research Council (ERC)
|
||||||
|
*
|
||||||
|
* H2020-EU.3.1.3.1. -> Societal challenges | Health, demographic change and well-being | Treating and managing disease | Treating disease, including developing regenerative medicine
|
||||||
|
* from H2020-EU.3. -> Societal challenges,
|
||||||
|
* H2020-EU.3.1. -> Health, demographic change and well-being
|
||||||
|
* H2020-EU.3.1.3 -> Treating and managing disease
|
||||||
|
* H2020-EU.3.1.3.1. -> Treating disease, including developing regenerative medicine
|
||||||
|
*
|
||||||
|
* The classification up to level three, will be split in dedicated variables, while the complete classification will be stored
|
||||||
|
* in a variable called classification and provided as shown above.
|
||||||
|
*
|
||||||
|
* The programme title is not given in a standardized way:
|
||||||
|
*
|
||||||
|
* - Sometimes associated to the higher level in the hierarchy we can find Priority in title other times it is not the
|
||||||
|
* case. Since it is not uniform, we removed priority from the handled titles:
|
||||||
|
*
|
||||||
|
* H2020-EU.1. -> PRIORITY 'Excellent science'
|
||||||
|
* H2020-EU.2. -> PRIORITY 'Industrial leadership'
|
||||||
|
* H2020-EU.3. -> PRIORITY 'Societal challenges'
|
||||||
|
*
|
||||||
|
* will become
|
||||||
|
*
|
||||||
|
* H2020-EU.1. -> Excellent science
|
||||||
|
* H2020-EU.2. -> Industrial leadership
|
||||||
|
* H2020-EU.3. -> Societal challenges
|
||||||
|
*
|
||||||
|
* - Sometimes the title of the parent is repeated in the title for the code, but it is not always the case, so, titles
|
||||||
|
* associated to previous levels in the hierarchy are removed from the code title.
|
||||||
|
*
|
||||||
|
* H2020-EU.1.2. -> EXCELLENT SCIENCE - Future and Emerging Technologies (FET)
|
||||||
|
* H2020-EU.2.2. -> INDUSTRIAL LEADERSHIP - Access to risk finance
|
||||||
|
* H2020-EU.3.4. -> SOCIETAL CHALLENGES - Smart, Green And Integrated Transport
|
||||||
|
*
|
||||||
|
* will become
|
||||||
|
*
|
||||||
|
* H2020-EU.1.2. -> Future and Emerging Technologies (FET)
|
||||||
|
* H2020-EU.2.2. -> Access to risk finance
|
||||||
|
* H2020-EU.3.4. -> Smart, Green And Integrated Transport
|
||||||
|
*
|
||||||
|
* This holds at all levels in the hierarchy. Hence
|
||||||
|
*
|
||||||
|
* H2020-EU.2.1.2. -> INDUSTRIAL LEADERSHIP - Leadership in enabling and industrial technologies – Nanotechnologies
|
||||||
|
*
|
||||||
|
* will become
|
||||||
|
*
|
||||||
|
* H2020-EU.2.1.2. -> Nanotechnologies
|
||||||
|
*
|
||||||
|
* - Euratom is not given in the way the other programmes are: H2020-EU. but H2020-Euratom- . So we need to write
|
||||||
|
* specific code for it
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
public class PrepareProgramme {
|
public class PrepareProgramme {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
||||||
|
@ -107,6 +175,17 @@ public class PrepareProgramme {
|
||||||
return csvProgramme;
|
return csvProgramme;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
prepareClassification(h2020Programmes);
|
||||||
|
|
||||||
|
h2020Programmes.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
|
||||||
|
.saveAsTextFile(outputPath);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
|
||||||
Object[] codedescription = h2020Programmes
|
Object[] codedescription = h2020Programmes
|
||||||
.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
|
.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
|
||||||
.collect()
|
.collect()
|
||||||
|
@ -174,16 +253,13 @@ public class PrepareProgramme {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
h2020Programmes.foreach(csvProgramme -> {
|
||||||
h2020Programmes.map(csvProgramme -> {
|
|
||||||
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
|
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
|
||||||
&& !csvProgramme.getCode().equals("H2020-EC"))
|
&& !csvProgramme.getCode().equals("H2020-EC"))
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
|
csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
|
||||||
else
|
else
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode()));
|
csvProgramme.setClassification(map.get(csvProgramme.getCode()));
|
||||||
return OBJECT_MAPPER.writeValueAsString(csvProgramme);
|
});
|
||||||
}).saveAsTextFile(outputPath);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
|
|
|
@ -24,6 +24,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Selects only the relevant information collected with the projects: project grant agreement, project programme code and
|
||||||
|
* project topic code for the projects that are also collected from OpenAIRE.
|
||||||
|
*/
|
||||||
public class PrepareProjects {
|
public class PrepareProjects {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
||||||
|
|
|
@ -3,11 +3,13 @@ package eu.dnetlib.dhp.actionmanager.project;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to store the grant agreement (code) of the collected projects
|
||||||
|
*/
|
||||||
public class ProjectSubset implements Serializable {
|
public class ProjectSubset implements Serializable {
|
||||||
|
|
||||||
private String code;
|
private String code;
|
||||||
private String topiccode;
|
|
||||||
private String topicdescription;
|
|
||||||
|
|
||||||
public String getCode() {
|
public String getCode() {
|
||||||
return code;
|
return code;
|
||||||
|
@ -17,19 +19,4 @@ public class ProjectSubset implements Serializable {
|
||||||
this.code = code;
|
this.code = code;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTopiccode() {
|
|
||||||
return topiccode;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setTopiccode(String topiccode) {
|
|
||||||
this.topiccode = topiccode;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getTopicdescription() {
|
|
||||||
return topicdescription;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setTopicdescription(String topicdescription) {
|
|
||||||
this.topicdescription = topicdescription;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.DbClient;
|
import eu.dnetlib.dhp.common.DbClient;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queries the OpenAIRE database to get the grant agreement of projects collected from corda__h2020. The codes collected
|
||||||
|
* are written on hdfs using the ProjectSubset model
|
||||||
|
*/
|
||||||
public class ReadProjectsFromDB implements Closeable {
|
public class ReadProjectsFromDB implements Closeable {
|
||||||
|
|
||||||
private final DbClient dbClient;
|
private final DbClient dbClient;
|
||||||
|
@ -72,8 +76,6 @@ public class ReadProjectsFromDB implements Closeable {
|
||||||
try {
|
try {
|
||||||
ProjectSubset p = new ProjectSubset();
|
ProjectSubset p = new ProjectSubset();
|
||||||
p.setCode(rs.getString("code"));
|
p.setCode(rs.getString("code"));
|
||||||
// p.setTopiccode(rs.getString("optional1"));
|
|
||||||
// p.setTopicdescription(rs.getString("optional2"));
|
|
||||||
return Arrays.asList(p);
|
return Arrays.asList(p);
|
||||||
|
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
|
|
|
@ -36,6 +36,21 @@ import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that makes the ActionSet. To prepare the AS two joins are needed
|
||||||
|
*
|
||||||
|
* 1. join between the collected project subset and the programme extended with the classification on the grant agreement.
|
||||||
|
* For each entry a
|
||||||
|
* eu.dnetlib.dhp.Project entity is created and the information about H2020Classification is set together with the
|
||||||
|
* h2020topiccode variable
|
||||||
|
* 2. join between the output of the previous step and the topic information on the topic code. Each time a match is
|
||||||
|
* found the h2020topicdescription variable is set.
|
||||||
|
*
|
||||||
|
* To produce one single entry for each project code a step of grouping is needed: each project can be associated to more
|
||||||
|
* than one programme.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
public class SparkAtomicActionJob {
|
public class SparkAtomicActionJob {
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
Loading…
Reference in New Issue