forked from D-Net/dnet-hadoop
Merge branch 'stable_ids' into scholexplorer_model_update
This commit is contained in:
commit
adc317ccb6
|
@ -56,6 +56,7 @@ object ImportDatacite {
|
|||
val hdfsTargetPath = new Path(targetPath)
|
||||
log.info(s"hdfsTargetPath is $hdfsTargetPath")
|
||||
|
||||
val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
|
||||
|
||||
val spkipImport = parser.get("skipImport")
|
||||
log.info(s"skipImport is $spkipImport")
|
||||
|
@ -110,7 +111,7 @@ object ImportDatacite {
|
|||
|
||||
println(s"last Timestamp is $ts")
|
||||
|
||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf)
|
||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
|
||||
println(s"Imported from Datacite API $cnt documents")
|
||||
|
||||
|
@ -137,7 +138,7 @@ object ImportDatacite {
|
|||
}
|
||||
}
|
||||
|
||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = {
|
||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
|
||||
var from:Long = timestamp * 1000
|
||||
val delta:Long = 50000000L
|
||||
var client: DataciteAPIImporter = null
|
||||
|
@ -148,7 +149,7 @@ object ImportDatacite {
|
|||
try {
|
||||
var start: Long = System.currentTimeMillis
|
||||
while (from < now) {
|
||||
client = new DataciteAPIImporter(from, 100, from + delta)
|
||||
client = new DataciteAPIImporter(from, bs, from + delta)
|
||||
var end: Long = 0
|
||||
val key: IntWritable = new IntWritable(i)
|
||||
val value: Text = new Text
|
||||
|
|
|
@ -143,7 +143,6 @@ public class PrepareProgramme {
|
|||
|
||||
JavaRDD<CSVProgramme> h2020Programmes = programme
|
||||
.toJavaRDD()
|
||||
.filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020"))
|
||||
.mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme))
|
||||
.reduceByKey((a, b) -> {
|
||||
if (!a.getLanguage().equals("en")) {
|
||||
|
|
|
@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
|
||||
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
|
@ -32,7 +31,6 @@ public class PrepareProjects {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final HashMap<String, CSVProgramme> programmeMap = new HashMap<>();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
|
|
@ -120,7 +120,6 @@ public class SparkAtomicActionJob {
|
|||
.map((MapFunction<Tuple2<CSVProject, CSVProgramme>, Project>) c -> {
|
||||
|
||||
CSVProject csvProject = c._1();
|
||||
Optional<CSVProgramme> ocsvProgramme = Optional.ofNullable(c._2());
|
||||
|
||||
return Optional
|
||||
.ofNullable(c._2())
|
||||
|
@ -135,9 +134,9 @@ public class SparkAtomicActionJob {
|
|||
H2020Programme pm = new H2020Programme();
|
||||
H2020Classification h2020classification = new H2020Classification();
|
||||
pm.setCode(csvProject.getProgramme());
|
||||
h2020classification.setClassification(ocsvProgramme.get().getClassification());
|
||||
h2020classification.setClassification(csvProgramme.getClassification());
|
||||
h2020classification.setH2020Programme(pm);
|
||||
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
|
||||
setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
|
||||
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
|
||||
pp.setH2020classification(Arrays.asList(h2020classification));
|
||||
|
||||
|
@ -145,10 +144,11 @@ public class SparkAtomicActionJob {
|
|||
})
|
||||
.orElse(null);
|
||||
|
||||
}, Encoders.bean(Project.class));
|
||||
}, Encoders.bean(Project.class))
|
||||
.filter(Objects::nonNull);
|
||||
|
||||
aaproject
|
||||
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")))
|
||||
.joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left")
|
||||
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
|
||||
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
|
||||
Project rp = p._1();
|
||||
|
|
|
@ -7,14 +7,7 @@ import java.io.Serializable;
|
|||
* The model for the programme csv file
|
||||
*/
|
||||
public class CSVProgramme implements Serializable {
|
||||
private String parentProgramme;
|
||||
private String frameworkProgramme;
|
||||
private String startDate;
|
||||
private String endDate;
|
||||
private String objective;
|
||||
private String subjects;
|
||||
private String legalBasis;
|
||||
private String call;
|
||||
|
||||
private String rcn;
|
||||
private String code;
|
||||
|
||||
|
@ -80,67 +73,5 @@ public class CSVProgramme implements Serializable {
|
|||
this.language = language;
|
||||
}
|
||||
|
||||
public String getParentProgramme() {
|
||||
return parentProgramme;
|
||||
}
|
||||
|
||||
public void setParentProgramme(String parentProgramme) {
|
||||
this.parentProgramme = parentProgramme;
|
||||
}
|
||||
|
||||
public String getFrameworkProgramme() {
|
||||
return frameworkProgramme;
|
||||
}
|
||||
|
||||
public void setFrameworkProgramme(String frameworkProgramme) {
|
||||
this.frameworkProgramme = frameworkProgramme;
|
||||
}
|
||||
|
||||
public String getStartDate() {
|
||||
return startDate;
|
||||
}
|
||||
|
||||
public void setStartDate(String startDate) {
|
||||
this.startDate = startDate;
|
||||
}
|
||||
|
||||
public String getEndDate() {
|
||||
return endDate;
|
||||
}
|
||||
|
||||
public void setEndDate(String endDate) {
|
||||
this.endDate = endDate;
|
||||
}
|
||||
|
||||
public String getObjective() {
|
||||
return objective;
|
||||
}
|
||||
|
||||
public void setObjective(String objective) {
|
||||
this.objective = objective;
|
||||
}
|
||||
|
||||
public String getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(String subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public String getLegalBasis() {
|
||||
return legalBasis;
|
||||
}
|
||||
|
||||
public void setLegalBasis(String legalBasis) {
|
||||
this.legalBasis = legalBasis;
|
||||
}
|
||||
|
||||
public String getCall() {
|
||||
return call;
|
||||
}
|
||||
|
||||
public void setCall(String call) {
|
||||
this.call = call;
|
||||
}
|
||||
//
|
||||
}
|
||||
|
|
|
@ -26,7 +26,6 @@ public class EXCELParser {
|
|||
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
|
||||
InvalidFormatException {
|
||||
|
||||
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
|
||||
OPCPackage pkg = OPCPackage.open(file);
|
||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
|
||||
|
@ -58,7 +57,6 @@ public class EXCELParser {
|
|||
|
||||
for (int i = 0; i < headers.size(); i++) {
|
||||
Cell cell = row.getCell(i);
|
||||
String value = dataFormatter.formatCellValue(cell);
|
||||
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
|
||||
|
||||
}
|
||||
|
|
|
@ -18,6 +18,12 @@
|
|||
"paramDescription": "avoid to downlaod new items but apply the previous update",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "bs",
|
||||
"paramLongName": "blocksize",
|
||||
"paramDescription": "define the requests block size",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="H2020Programme" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="H2020Classification" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>projectFileURL</name>
|
||||
|
@ -18,6 +18,10 @@
|
|||
<name>outputPath</name>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sheetName</name>
|
||||
<description>the name of the sheet to read</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
|
@ -31,10 +35,23 @@
|
|||
<delete path='${workingDir}'/>
|
||||
<mkdir path='${workingDir}'/>
|
||||
</fs>
|
||||
<ok to="get_project_file"/>
|
||||
<ok to="fork_get_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<fork name="fork_get_info">
|
||||
<path start="fork_get_projects"/>
|
||||
<path start="get_programme_file"/>
|
||||
<path start="get_topic_file"/>
|
||||
|
||||
</fork>
|
||||
|
||||
<fork name="fork_get_projects">
|
||||
<path start="get_project_file"/>
|
||||
<path start="read_projects"/>
|
||||
</fork>
|
||||
|
||||
<action name="get_project_file">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV</main-class>
|
||||
|
@ -43,7 +60,7 @@
|
|||
<arg>--hdfsPath</arg><arg>${workingDir}/projects</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProject</arg>
|
||||
</java>
|
||||
<ok to="get_programme_file"/>
|
||||
<ok to="wait_projects"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -55,7 +72,7 @@
|
|||
<arg>--hdfsPath</arg><arg>${workingDir}/programme</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme</arg>
|
||||
</java>
|
||||
<ok to="get_topic_file"/>
|
||||
<ok to="prepare_programme"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -68,7 +85,7 @@
|
|||
<arg>--sheetName</arg><arg>${sheetName}</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
|
||||
</java>
|
||||
<ok to="read_projects"/>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -81,7 +98,7 @@
|
|||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="prepare_programme"/>
|
||||
<ok to="wait_projects"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -105,10 +122,15 @@
|
|||
<arg>--programmePath</arg><arg>${workingDir}/programme</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/preparedProgramme</arg>
|
||||
</spark>
|
||||
<ok to="prepare_project"/>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait" to="create_updates"/>
|
||||
|
||||
<join name="wait_projects" to="prepare_project"/>
|
||||
|
||||
|
||||
<action name="prepare_project">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -130,7 +152,7 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/preparedProjects</arg>
|
||||
<arg>--dbProjectPath</arg><arg>${workingDir}/dbProjects</arg>
|
||||
</spark>
|
||||
<ok to="create_updates"/>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
|
|
@ -20,8 +20,8 @@ import eu.dnetlib.dhp.collection.HttpConnector2;
|
|||
public class EXCELParserTest {
|
||||
|
||||
private static Path workingDir;
|
||||
private final HttpConnector2 httpConnector = new HttpConnector2();
|
||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
||||
private HttpConnector2 httpConnector = new HttpConnector2();
|
||||
private static final String URL = "https://cordis.europa.eu/data/reference/cordisref-h2020topics.xlsx";
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
|
@ -35,11 +35,12 @@ public class EXCELParserTest {
|
|||
|
||||
EXCELParser excelParser = new EXCELParser();
|
||||
|
||||
final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic";
|
||||
final String sheetName = "Topics";
|
||||
List<Object> pl = excelParser.parse(httpConnector.getInputSourceAsStream(URL), classForName, sheetName);
|
||||
List<Object> pl = excelParser
|
||||
.parse(
|
||||
httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic",
|
||||
"Topics");
|
||||
|
||||
Assertions.assertEquals(3837, pl.size());
|
||||
Assertions.assertEquals(3878, pl.size());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
|||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
|
@ -144,7 +145,7 @@ public class ConversionUtils {
|
|||
.filter(pid -> pid != null)
|
||||
.filter(pid -> pid.getQualifier() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
|
||||
.map(pid -> pid.getValue())
|
||||
.map(pid -> cleanOrcid(pid))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -93,7 +93,7 @@ public class PublicationToOaf implements Serializable {
|
|||
{
|
||||
put(
|
||||
ModelConstants.ORCID,
|
||||
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||
new Pair<>(ModelConstants.ORCID.toUpperCase(), OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));
|
||||
|
||||
}
|
||||
};
|
||||
|
@ -126,8 +126,6 @@ public class PublicationToOaf implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public static final String PID_TYPES = "dnet:pid_types";
|
||||
|
||||
public Oaf generatePublicationActionsFromJson(final String json) {
|
||||
if (parsedPublications != null) {
|
||||
parsedPublications.add(1);
|
||||
|
|
|
@ -24,8 +24,6 @@ public class Constants {
|
|||
|
||||
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
|
||||
|
||||
public static String ORCID = "orcid";
|
||||
|
||||
static {
|
||||
accessRightsCoarMap.put("OPEN", "c_abf2");
|
||||
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
||||
|
|
|
@ -503,7 +503,7 @@ public class ResultMapper implements Serializable {
|
|||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
|
|
|
@ -35,7 +35,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
|
||||
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
ORCID_PENDING, ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
||||
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ import com.google.common.collect.Lists;
|
|||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
@ -56,7 +57,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
author.setPid(new ArrayList<>());
|
||||
|
||||
if (StringUtils.isNotBlank(pid)) {
|
||||
if (type.startsWith("ORCID")) {
|
||||
if (type.toLowerCase().startsWith(ORCID)) {
|
||||
final String cleanedId = pid
|
||||
.replaceAll("http://orcid.org/", "")
|
||||
.replaceAll("https://orcid.org/", "");
|
||||
|
|
|
@ -13,6 +13,7 @@ import org.dom4j.Node;
|
|||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
@ -85,7 +86,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.replaceAll(" ", "")
|
||||
.replaceAll("_", "");
|
||||
|
||||
if (type.startsWith("ORCID")) {
|
||||
if (type.toLowerCase().startsWith(ORCID)) {
|
||||
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
|
||||
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
||||
} else if (type.startsWith("MAGID")) {
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -730,7 +730,7 @@
|
|||
<mockito-core.version>3.3.3</mockito-core.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
<dhp-schemas.version>[2.4.7]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[2.5.9]</dhp-schemas.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
|
|
Loading…
Reference in New Issue