added dependency version in main pom.xml for orcid no doi
commit
9818e74a70
@ -0,0 +1,82 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.dump.oaf.graph;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* To store information about the classification for the project. The classification depends on the programme. For example
|
||||
* H2020-EU.3.4.5.3 can be classified as
|
||||
* H2020-EU.3. => Societal Challenges (level1)
|
||||
* H2020-EU.3.4. => Transport (level2)
|
||||
* H2020-EU.3.4.5. => CLEANSKY2 (level3)
|
||||
* H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4)
|
||||
*
|
||||
* We decided to explicitly represent up to three levels in the classification.
|
||||
*
|
||||
* H2020Classification has the following parameters:
|
||||
* - private Programme programme to store the information about the programme related to this classification
|
||||
* - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC)
|
||||
 * - private String level2 to store the information about the level2 of the classification (Objectives (?))
|
||||
* - private String level3 to store the information about the level3 of the classification
|
||||
* - private String classification to store the entire classification related to the programme
|
||||
*/
|
||||
public class H2020Classification implements Serializable {
|
||||
private Programme programme;
|
||||
|
||||
private String level1;
|
||||
private String level2;
|
||||
private String level3;
|
||||
|
||||
private String classification;
|
||||
|
||||
public Programme getProgramme() {
|
||||
return programme;
|
||||
}
|
||||
|
||||
public void setProgramme(Programme programme) {
|
||||
this.programme = programme;
|
||||
}
|
||||
|
||||
public String getLevel1() {
|
||||
return level1;
|
||||
}
|
||||
|
||||
public void setLevel1(String level1) {
|
||||
this.level1 = level1;
|
||||
}
|
||||
|
||||
public String getLevel2() {
|
||||
return level2;
|
||||
}
|
||||
|
||||
public void setLevel2(String level2) {
|
||||
this.level2 = level2;
|
||||
}
|
||||
|
||||
public String getLevel3() {
|
||||
return level3;
|
||||
}
|
||||
|
||||
public void setLevel3(String level3) {
|
||||
this.level3 = level3;
|
||||
}
|
||||
|
||||
public String getClassification() {
|
||||
return classification;
|
||||
}
|
||||
|
||||
public void setClassification(String classification) {
|
||||
this.classification = classification;
|
||||
}
|
||||
|
||||
public static H2020Classification newInstance(String programme_code, String programme_description, String level1,
|
||||
String level2, String level3, String classification) {
|
||||
H2020Classification h2020classification = new H2020Classification();
|
||||
h2020classification.programme = Programme.newInstance(programme_code, programme_description);
|
||||
h2020classification.level1 = level1;
|
||||
h2020classification.level2 = level2;
|
||||
h2020classification.level3 = level3;
|
||||
h2020classification.classification = classification;
|
||||
return h2020classification;
|
||||
}
|
||||
}
|
@ -0,0 +1,88 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* To store information about the classification for the project. The classification depends on the programme. For example
|
||||
* H2020-EU.3.4.5.3 can be classified as
|
||||
* H2020-EU.3. => Societal Challenges (level1)
|
||||
* H2020-EU.3.4. => Transport (level2)
|
||||
* H2020-EU.3.4.5. => CLEANSKY2 (level3)
|
||||
* H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4)
|
||||
*
|
||||
* We decided to explicitly represent up to three levels in the classification.
|
||||
*
|
||||
* H2020Classification has the following parameters:
|
||||
* - private Programme programme to store the information about the programme related to this classification
|
||||
* - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC)
|
||||
 * - private String level2 to store the information about the level2 of the classification (Objectives (?))
|
||||
* - private String level3 to store the information about the level3 of the classification
|
||||
* - private String classification to store the entire classification related to the programme
|
||||
*/
|
||||
|
||||
public class H2020Classification implements Serializable {
|
||||
private H2020Programme h2020Programme;
|
||||
private String level1;
|
||||
private String level2;
|
||||
private String level3;
|
||||
|
||||
private String classification;
|
||||
|
||||
public H2020Programme getH2020Programme() {
|
||||
return h2020Programme;
|
||||
}
|
||||
|
||||
public void setH2020Programme(H2020Programme h2020Programme) {
|
||||
this.h2020Programme = h2020Programme;
|
||||
}
|
||||
|
||||
public String getLevel1() {
|
||||
return level1;
|
||||
}
|
||||
|
||||
public void setLevel1(String level1) {
|
||||
this.level1 = level1;
|
||||
}
|
||||
|
||||
public String getLevel2() {
|
||||
return level2;
|
||||
}
|
||||
|
||||
public void setLevel2(String level2) {
|
||||
this.level2 = level2;
|
||||
}
|
||||
|
||||
public String getLevel3() {
|
||||
return level3;
|
||||
}
|
||||
|
||||
public void setLevel3(String level3) {
|
||||
this.level3 = level3;
|
||||
}
|
||||
|
||||
public String getClassification() {
|
||||
return classification;
|
||||
}
|
||||
|
||||
public void setClassification(String classification) {
|
||||
this.classification = classification;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o)
|
||||
return true;
|
||||
if (o == null || getClass() != o.getClass())
|
||||
return false;
|
||||
|
||||
H2020Classification h2020classification = (H2020Classification) o;
|
||||
|
||||
return Objects.equals(level1, h2020classification.level1) &&
|
||||
Objects.equals(level2, h2020classification.level2) &&
|
||||
Objects.equals(level3, h2020classification.level3) &&
|
||||
Objects.equals(classification, h2020classification.classification) &&
|
||||
h2020Programme.equals(h2020classification.h2020Programme);
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.csvutils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class CSVProgramme implements Serializable {
|
||||
private String rcn;
|
||||
private String code;
|
||||
private String title;
|
||||
private String shortTitle;
|
||||
private String language;
|
||||
|
||||
public String getRcn() {
|
||||
return rcn;
|
||||
}
|
||||
|
||||
public void setRcn(String rcn) {
|
||||
this.rcn = rcn;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getShortTitle() {
|
||||
return shortTitle;
|
||||
}
|
||||
|
||||
public void setShortTitle(String shortTitle) {
|
||||
this.shortTitle = shortTitle;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
}
|
@ -0,0 +1,137 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The model for the programme csv file
|
||||
*/
|
||||
public class CSVProgramme implements Serializable {
|
||||
private String parentProgramme;
|
||||
private String frameworkProgramme;
|
||||
private String startDate;
|
||||
private String endDate;
|
||||
private String objective;
|
||||
private String subjects;
|
||||
private String legalBasis;
|
||||
private String call;
|
||||
private String rcn;
|
||||
private String code;
|
||||
|
||||
private String title;
|
||||
private String shortTitle;
|
||||
private String language;
|
||||
private String classification;
|
||||
|
||||
public String getClassification() {
|
||||
return classification;
|
||||
}
|
||||
|
||||
public void setClassification(String classification) {
|
||||
this.classification = classification;
|
||||
}
|
||||
|
||||
public String getRcn() {
|
||||
return rcn;
|
||||
}
|
||||
|
||||
public void setRcn(String rcn) {
|
||||
this.rcn = rcn;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getShortTitle() {
|
||||
return shortTitle;
|
||||
}
|
||||
|
||||
public void setShortTitle(String shortTitle) {
|
||||
this.shortTitle = shortTitle;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getParentProgramme() {
|
||||
return parentProgramme;
|
||||
}
|
||||
|
||||
public void setParentProgramme(String parentProgramme) {
|
||||
this.parentProgramme = parentProgramme;
|
||||
}
|
||||
|
||||
public String getFrameworkProgramme() {
|
||||
return frameworkProgramme;
|
||||
}
|
||||
|
||||
public void setFrameworkProgramme(String frameworkProgramme) {
|
||||
this.frameworkProgramme = frameworkProgramme;
|
||||
}
|
||||
|
||||
public String getStartDate() {
|
||||
return startDate;
|
||||
}
|
||||
|
||||
public void setStartDate(String startDate) {
|
||||
this.startDate = startDate;
|
||||
}
|
||||
|
||||
public String getEndDate() {
|
||||
return endDate;
|
||||
}
|
||||
|
||||
public void setEndDate(String endDate) {
|
||||
this.endDate = endDate;
|
||||
}
|
||||
|
||||
public String getObjective() {
|
||||
return objective;
|
||||
}
|
||||
|
||||
public void setObjective(String objective) {
|
||||
this.objective = objective;
|
||||
}
|
||||
|
||||
public String getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(String subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public String getLegalBasis() {
|
||||
return legalBasis;
|
||||
}
|
||||
|
||||
public void setLegalBasis(String legalBasis) {
|
||||
this.legalBasis = legalBasis;
|
||||
}
|
||||
|
||||
public String getCall() {
|
||||
return call;
|
||||
}
|
||||
|
||||
public void setCall(String call) {
|
||||
this.call = call;
|
||||
}
|
||||
}
|
@ -1,8 +1,11 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.csvutils;
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
 * The model for the projects csv file
|
||||
*/
|
||||
public class CSVProject implements Serializable {
|
||||
private String rcn;
|
||||
private String id;
|
@ -0,0 +1,75 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang.reflect.FieldUtils;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
|
||||
/**
|
||||
* Reads a generic excel file and maps it into classes that mirror its schema
|
||||
*/
|
||||
public class EXCELParser {
|
||||
|
||||
public <R> List<R> parse(InputStream file, String classForName)
|
||||
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
|
||||
InvalidFormatException {
|
||||
|
||||
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
|
||||
OPCPackage pkg = OPCPackage.open(file);
|
||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
|
||||
XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
|
||||
|
||||
List<R> ret = new ArrayList<>();
|
||||
|
||||
DataFormatter dataFormatter = new DataFormatter();
|
||||
Iterator<Row> rowIterator = sheet.rowIterator();
|
||||
List<String> headers = new ArrayList<>();
|
||||
int count = 0;
|
||||
while (rowIterator.hasNext()) {
|
||||
Row row = rowIterator.next();
|
||||
|
||||
if (count == 0) {
|
||||
Iterator<Cell> cellIterator = row.cellIterator();
|
||||
|
||||
while (cellIterator.hasNext()) {
|
||||
Cell cell = cellIterator.next();
|
||||
headers.add(dataFormatter.formatCellValue(cell));
|
||||
}
|
||||
} else {
|
||||
Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
|
||||
final Object cc = clazz.newInstance();
|
||||
|
||||
for (int i = 0; i < headers.size(); i++) {
|
||||
Cell cell = row.getCell(i);
|
||||
String value = dataFormatter.formatCellValue(cell);
|
||||
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
|
||||
|
||||
}
|
||||
|
||||
EXCELTopic et = (EXCELTopic) cc;
|
||||
if (StringUtils.isNotBlank(et.getRcn())) {
|
||||
ret.add((R) cc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,127 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* the model class for the topic excel file
|
||||
*/
|
||||
public class EXCELTopic implements Serializable {
|
||||
private String rcn;
|
||||
private String language;
|
||||
private String code;
|
||||
private String parentProgramme;
|
||||
private String frameworkProgramme;
|
||||
private String startDate;
|
||||
private String endDate;
|
||||
private String title;
|
||||
private String shortTitle;
|
||||
private String objective;
|
||||
private String subjects;
|
||||
private String legalBasis;
|
||||
private String call;
|
||||
|
||||
public String getRcn() {
|
||||
return rcn;
|
||||
}
|
||||
|
||||
public void setRcn(String rcn) {
|
||||
this.rcn = rcn;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
public String getParentProgramme() {
|
||||
return parentProgramme;
|
||||
}
|
||||
|
||||
public void setParentProgramme(String parentProgramme) {
|
||||
this.parentProgramme = parentProgramme;
|
||||
}
|
||||
|
||||
public String getFrameworkProgramme() {
|
||||
return frameworkProgramme;
|
||||
}
|
||||
|
||||
public void setFrameworkProgramme(String frameworkProgramme) {
|
||||
this.frameworkProgramme = frameworkProgramme;
|
||||
}
|
||||
|
||||
public String getStartDate() {
|
||||
return startDate;
|
||||
}
|
||||
|
||||
public void setStartDate(String startDate) {
|
||||
this.startDate = startDate;
|
||||
}
|
||||
|
||||
public String getEndDate() {
|
||||
return endDate;
|
||||
}
|
||||
|
||||
public void setEndDate(String endDate) {
|
||||
this.endDate = endDate;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getShortTitle() {
|
||||
return shortTitle;
|
||||
}
|
||||
|
||||
public void setShortTitle(String shortTitle) {
|
||||
this.shortTitle = shortTitle;
|
||||
}
|
||||
|
||||
public String getObjective() {
|
||||
return objective;
|
||||
}
|
||||
|
||||
public void setObjective(String objective) {
|
||||
this.objective = objective;
|
||||
}
|
||||
|
||||
public String getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(String subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public String getLegalBasis() {
|
||||
return legalBasis;
|
||||
}
|
||||
|
||||
public void setLegalBasis(String legalBasis) {
|
||||
this.legalBasis = legalBasis;
|
||||
}
|
||||
|
||||
public String getCall() {
|
||||
return call;
|
||||
}
|
||||
|
||||
public void setCall(String call) {
|
||||
this.call = call;
|
||||
}
|
||||
}
|
@ -0,0 +1,98 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
/**
|
||||
* Applies the parsing of an excel file and writes the Serialization of it in hdfs
|
||||
*/
|
||||
|
||||
public class ReadExcel implements Closeable {
|
||||
private static final Log log = LogFactory.getLog(ReadCSV.class);
|
||||
private final Configuration conf;
|
||||
private final BufferedWriter writer;
|
||||
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private InputStream excelFile;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
ReadCSV.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String fileURL = parser.get("fileURL");
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
final String classForName = parser.get("classForName");
|
||||
|
||||
try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {
|
||||
|
||||
log.info("Getting Excel file...");
|
||||
readExcel.execute(classForName);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public void execute(final String classForName) throws Exception {
|
||||
EXCELParser excelParser = new EXCELParser();
|
||||
excelParser
|
||||
.parse(excelFile, classForName)
|
||||
.stream()
|
||||
.forEach(p -> write(p));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public ReadExcel(
|
||||
final String hdfsPath,
|
||||
final String hdfsNameNode,
|
||||
final String fileURL)
|
||||
throws Exception {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
HttpConnector httpConnector = new HttpConnector();
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, false);
|
||||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
|
||||
;
|
||||
}
|
||||
|
||||
protected void write(final Object p) {
|
||||
try {
|
||||
writer.write(OBJECT_MAPPER.writeValueAsString(p));
|
||||
writer.newLine();
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.project;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
||||
|
||||
public class EXCELParserTest {
|
||||
|
||||
private static Path workingDir;
|
||||
private HttpConnector httpConnector = new HttpConnector();
|
||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException,
|
||||
IllegalAccessException, InstantiationException {
|
||||
|
||||
EXCELParser excelParser = new EXCELParser();
|
||||
|
||||
List<Object> pl = excelParser
|
||||
.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic");
|
||||
|
||||
Assertions.assertEquals(3837, pl.size());
|
||||
|
||||
}
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,25 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.api.ShortEventMessage;
|
||||
|
||||
public class ShortEventMessageWithGroupId extends ShortEventMessage implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 4704889388757626630L;
|
||||
|
||||
private String group;
|
||||
|
||||
public String getGroup() {
|
||||
return group;
|
||||
}
|
||||
|
||||
public void setGroup(final String group) {
|
||||
this.group = group;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,110 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerEventPayload;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.model.ShortEventMessageWithGroupId;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
|
||||
public class PartitionEventsByDsIdJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
|
||||
private static final String OPENDOAR_NSPREFIX = "opendoar____::";
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
PartitionEventsByDsIdJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
|
||||
log.info("partitionPath: {}", partitionPath);
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
|
||||
.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
|
||||
.limit(10000)
|
||||
.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
|
||||
.coalesce(1)
|
||||
.write()
|
||||
.partitionBy("group")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(partitionPath);
|
||||
|
||||
});
|
||||
renameSubDirs(partitionPath);
|
||||
|
||||
}
|
||||
|
||||
private static void renameSubDirs(final String path) throws IOException {
|
||||
final FileSystem fs = FileSystem.get(new Configuration());
|
||||
|
||||
log.info("** Renaming subdirs of " + path);
|
||||
for (final FileStatus fileStatus : fs.listStatus(new Path(path))) {
|
||||
if (fileStatus.isDirectory()) {
|
||||
final Path oldPath = fileStatus.getPath();
|
||||
final String oldName = oldPath.getName();
|
||||
if (oldName.contains("=")) {
|
||||
final Path newPath = new Path(path + "/" + StringUtils.substringAfter(oldName, "="));
|
||||
log.info(" * " + oldPath.getName() + " -> " + newPath.getName());
|
||||
fs.rename(oldPath, newPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static ShortEventMessageWithGroupId messageFromNotification(final Event e) {
|
||||
final Gson gson = new Gson();
|
||||
|
||||
final OaBrokerEventPayload payload = gson.fromJson(e.getPayload(), OaBrokerEventPayload.class);
|
||||
|
||||
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
|
||||
|
||||
res.setOriginalId(payload.getResult().getOriginalId());
|
||||
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
|
||||
res.setTopic(e.getTopic());
|
||||
res.setTrust(payload.getTrust());
|
||||
res.generateMessageFromObject(payload.getHighlight());
|
||||
res.setGroup(StringUtils.substringAfter(e.getMap().getTargetDatasourceId(), OPENDOAR_NSPREFIX));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,137 @@
|
||||
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the path where the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separated, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceTypeWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separated, - for empty list) of datasource types</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdBlacklist</name>
|
||||
<value>-</value>
|
||||
<description>a black list (comma separated, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esEventIndexName</name>
|
||||
<description>the elasticsearch index name for events</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNotificationsIndexName</name>
|
||||
<description>the elasticsearch index name for notifications</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerApiBaseUrl</name>
|
||||
<description>the url of the broker service api</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="index_notifications"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="index_notifications">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>IndexNotificationsOnESJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.IndexNotificationsJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
@ -0,0 +1,32 @@
|
||||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dburl",
|
||||
"paramLongName": "dbUrl",
|
||||
"paramDescription": "the broker database url",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "dbUser",
|
||||
"paramDescription": "the broker database user",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "dbPassword",
|
||||
"paramDescription": "the broker database password",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "broker",
|
||||
"paramLongName": "brokerApiBaseUrl",
|
||||
"paramDescription": "the url of the broker service api",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,93 @@
|
||||
package eu.dnetlib.doiboost.crossref
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/**
 * Spark job that de-duplicates a dataset of [[CrossrefDT]] records by DOI.
 *
 * Records are grouped by their `doi` field; within each group only the record whose JSON
 * payload carries the greatest `indexed.timestamp` is kept. The result overwrites `targetPath`.
 */
object CrossrefDataset {

  /**
   * Reads the `indexed.timestamp` value from a JSON string.
   *
   * @param input the JSON document to parse
   * @return the value found at `indexed.timestamp`, or 0 when it cannot be extracted as a Long
   */
  def extractTimestamp(input: String): Long = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)

    (json \ "indexed" \ "timestamp").extractOrElse[Long](0)
  }

  /**
   * Picks the record to keep between the one held so far and a new candidate.
   *
   * A null argument yields the other argument. Otherwise the candidate wins only when its
   * timestamp is strictly greater, so on a tie the record held so far is retained.
   *
   * @param current   the record held so far (may be null)
   * @param candidate the record competing with it (may be null)
   * @return the record to keep, or null when both arguments are null
   */
  private def mostRecent(current: CrossrefDT, candidate: CrossrefDT): CrossrefDT = {
    if (current == null) candidate
    else if (candidate == null) current
    else if (extractTimestamp(candidate.json) > extractTimestamp(current.json)) candidate
    else current
  }

  /**
   * Entry point. The arguments are parsed according to
   * `/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json`; the job reads the
   * `master`, `sourcePath` and `targetPath` parameters.
   *
   * @param args command line arguments
   */
  def main(args: Array[String]): Unit = {
    // The logger and the application name refer to this job (they previously pointed at SparkMapDumpIntoOAF).
    val logger: Logger = LoggerFactory.getLogger(CrossrefDataset.getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(CrossrefDataset.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    import spark.implicits._

    // Typed aggregator keeping, for each group, the record with the greatest timestamp.
    val crossrefAggregator = new Aggregator[CrossrefDT, CrossrefDT, CrossrefDT] with Serializable {

      // The empty buffer is null; mostRecent treats null as "nothing held yet".
      override def zero: CrossrefDT = null

      override def reduce(b: CrossrefDT, a: CrossrefDT): CrossrefDT = mostRecent(b, a)

      // Same selection rule as reduce: `a` replaces `b` only when strictly newer.
      override def merge(a: CrossrefDT, b: CrossrefDT): CrossrefDT = mostRecent(b, a)

      override def bufferEncoder: Encoder[CrossrefDT] = implicitly[Encoder[CrossrefDT]]

      override def outputEncoder: Encoder[CrossrefDT] = implicitly[Encoder[CrossrefDT]]

      override def finish(reduction: CrossrefDT): CrossrefDT = reduction
    }

    val sourcePath: String = parser.get("sourcePath")
    val targetPath: String = parser.get("targetPath")
    logger.info(s"sourcePath: $sourcePath")
    logger.info(s"targetPath: $targetPath")

    val ds: Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT]

    ds.groupByKey(_.doi)
      .agg(crossrefAggregator.toColumn)
      .map(s => s._2)
      .write.mode(SaveMode.Overwrite).save(targetPath)
  }

}
|
@ -0,0 +1,6 @@
|
||||
[
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequential file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||
|
||||
]
|
@ -0,0 +1,54 @@
|
||||
package eu.dnetlib.dhp.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods._
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
/**
 * Ad-hoc helpers for inspecting a dataset of [[Publication]] records.
 */
class QueryTest {

  /**
   * Extracts the `payload` field of a JSON document.
   *
   * @param input the JSON document to parse
   * @return the compact JSON rendering of the value found under `payload`
   */
  def extract_payload(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)

    compact(render(json \ "payload"))
  }

  /**
   * Checks the URLs of the publication instances.
   *
   * NOTE(review): the result is the negation of "the instance list is non-empty and every
   * instance has a non-empty url", which looks inverted with respect to the method name.
   * The original polarity is kept on purpose; confirm the intent with the callers.
   *
   * @param p the publication to inspect
   * @return false when the instance list is non-empty and every instance has a non-empty url, true otherwise
   */
  def hasInstanceWithUrl(p: Publication): Boolean = {
    // count (not map(...).size, which always equals the list size) gives the number of matching instances
    val withUrl = p.getInstance.asScala.count(i => i.getUrl != null && !i.getUrl.isEmpty)
    !(!p.getInstance.isEmpty && withUrl == p.getInstance().size)
  }

  /**
   * Checks the access rights of the publication instances.
   *
   * NOTE(review): the result is true when every instance HAS an access right with a non-empty
   * class name, which looks inverted with respect to the method name. The original polarity is
   * kept on purpose; confirm the intent with the callers.
   *
   * @param p the publication to inspect
   * @return true when the instance list is non-empty and every instance has an access right with a non-empty class name
   */
  def hasNullAccessRights(p: Publication): Boolean = {
    // count (not map(...).size) so that instances without access rights are actually detected;
    // a null class name is treated as empty instead of raising a NullPointerException
    val withAccessRight = p.getInstance.asScala.count(i =>
      i.getAccessright != null && i.getAccessright.getClassname != null && i.getAccessright.getClassname.nonEmpty)
    !p.getInstance.isEmpty && withAccessRight == p.getInstance().size()
  }

  /**
   * Loads the publications stored under `/tmp/p` and counts those having a best access right
   * with a non-empty class name. The count is computed but not returned.
   *
   * @param spark the Spark session used to read the dataset
   * @param sc    the Spark context (not used by this method)
   */
  def myQuery(spark: SparkSession, sc: SparkContext): Unit = {
    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]

    val ds: Dataset[Publication] = spark.read.load("/tmp/p").as[Publication]

    ds.filter(p =>
      p.getBestaccessright != null
        && p.getBestaccessright.getClassname != null
        && p.getBestaccessright.getClassname.nonEmpty).count()
  }

}
|
@ -0,0 +1,204 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.nonNull(o.getCountry())) {
|
||||
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
|
||||
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
|
||||
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
|
||||
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
|
||||
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
for (Instance i : r.getInstance()) {
|
||||
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
|
||||
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
r.getAuthor().forEach(a -> {
|
||||
if (Objects.nonNull(a.getPid())) {
|
||||
a.getPid().forEach(p -> {
|
||||
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
protected static <T extends Oaf> T fixDefaults(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
||||
r.setPublisher(null);
|
||||
}
|
||||
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
||||
r
|
||||
.setLanguage(
|
||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
r
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
r
|
||||
.setResourcetype(
|
||||
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
}
|
||||
if (Objects.isNull(i.getRefereed())) {
|
||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
|
||||
if (Objects.isNull(bestaccessrights)) {
|
||||
r
|
||||
.setBestaccessright(
|
||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
} else {
|
||||
r.setBestaccessright(bestaccessrights);
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
boolean nullRank = r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.anyMatch(a -> Objects.isNull(a.getRank()));
|
||||
if (nullRank) {
|
||||
int i = 1;
|
||||
for (Author author : r.getAuthor()) {
|
||||
author.setRank(i++);
|
||||
}
|
||||
}
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
} else {
|
||||
a
|
||||
.setPid(
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||
return p;
|
||||
})
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
|
||||
LinkedHashMap::new))
|
||||
.values()
|
||||
.stream()
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
||||
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
|
||||
q.setSchemeid(vocabularyName);
|
||||
q.setSchemename(vocabularyName);
|
||||
}
|
||||
}
|
||||
|
||||
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
||||
return OafMapperUtils
|
||||
.qualifier(
|
||||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,97 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.merge;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
public class DatasourceCompatibilityComparator implements Comparator<Qualifier> {
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals("openaire-cris_1.1"))
|
||||
return -1;
|
||||
if (rClass.equals("openaire-cris_1.1"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("openaire4.0"))
|
||||
return -1;
|
||||
if (rClass.equals("openaire4.0"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("driver-openaire2.0"))
|
||||
return -1;
|
||||
if (rClass.equals("driver-openaire2.0"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("driver"))
|
||||
return -1;
|
||||
if (rClass.equals("driver"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("openaire2.0"))
|
||||
return -1;
|
||||
if (rClass.equals("openaire2.0"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("openaire3.0"))
|
||||
return -1;
|
||||
if (rClass.equals("openaire3.0"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("openaire2.0_data"))
|
||||
return -1;
|
||||
if (rClass.equals("openaire2.0_data"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("native"))
|
||||
return -1;
|
||||
if (rClass.equals("native"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("hostedBy"))
|
||||
return -1;
|
||||
if (rClass.equals("hostedBy"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("notCompatible"))
|
||||
return -1;
|
||||
if (rClass.equals("notCompatible"))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN"))
|
||||
return -1;
|
||||
if (rClass.equals("UNKNOWN"))
|
||||
return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
/*
|
||||
* CASE WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY
|
||||
* ['openaire-cris_1.1']) THEN 'openaire-cris_1.1@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT
|
||||
* COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire4.0']) THEN
|
||||
* 'openaire4.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override,
|
||||
* a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0']) THEN
|
||||
* 'driver-openaire2.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE
|
||||
* (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver']) THEN
|
||||
* 'driver@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override,
|
||||
* a.compatibility) :: TEXT) @> ARRAY ['openaire2.0']) THEN 'openaire2.0@@@dnet:datasourceCompatibilityLevel' WHEN
|
||||
* (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0']) THEN
|
||||
* 'openaire3.0@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override,
|
||||
* a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data']) THEN
|
||||
* 'openaire2.0_data@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE
|
||||
* (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native']) THEN
|
||||
* 'native@@@dnet:datasourceCompatibilityLevel' WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override,
|
||||
* a.compatibility) :: TEXT) @> ARRAY ['hostedBy']) THEN 'hostedBy@@@dnet:datasourceCompatibilityLevel' WHEN
|
||||
* (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
|
||||
* THEN 'notCompatible@@@dnet:datasourceCompatibilityLevel' ELSE 'UNKNOWN@@@dnet:datasourceCompatibilityLevel' END
|
||||
*/
|
||||
}
|
@ -1,10 +1,10 @@
|
||||
DROP VIEW IF EXISTS ${hiveDbName}.result;
|
||||
|
||||
CREATE VIEW IF NOT EXISTS result as
|
||||
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p
|
||||
CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.publication p
|
||||
union all
|
||||
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.dataset d
|
||||
union all
|
||||
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.software s
|
||||
union all
|
||||
select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o;
|
||||
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance from ${hiveDbName}.otherresearchproduct o;
|
||||
|
@ -0,0 +1,84 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.merge;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
|
||||
public class MergeGraphSparkJobTest {
|
||||
|
||||
private ObjectMapper mapper;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergeDatasources() throws IOException {
|
||||
assertEquals(
|
||||
"openaire-cris_1.1",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_cris.json"),
|
||||
d("datasource_UNKNOWN.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
assertEquals(
|
||||
"openaire-cris_1.1",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_UNKNOWN.json"),
|
||||
d("datasource_cris.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
assertEquals(
|
||||
"driver-openaire2.0",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_native.json"),
|
||||
d("datasource_driver-openaire2.0.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
assertEquals(
|
||||
"driver-openaire2.0",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_driver-openaire2.0.json"),
|
||||
d("datasource_native.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
assertEquals(
|
||||
"openaire4.0",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_notCompatible.json"),
|
||||
d("datasource_openaire4.0.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
assertEquals(
|
||||
"notCompatible",
|
||||
MergeGraphSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_notCompatible.json"),
|
||||
d("datasource_UNKNOWN.json"))
|
||||
.getOpenairecompatibility()
|
||||
.getClassid());
|
||||
}
|
||||
|
||||
private Optional<Datasource> d(String file) throws IOException {
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream(file));
|
||||
return Optional.of(mapper.readValue(json, Datasource.class));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication
|
||||
import eu.dnetlib.dhp.sx.ebi.EBIAggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/**
 * Checks that aggregating DLIPublication records by id keeps their authors.
 */
class SparkScholexplorerAggregationTest {

  /**
   * Parses the publications stored one per line in `publication.json`, aggregates them by id
   * with the publication aggregator provided by EBIAggregator, and verifies that both the
   * input records and the first aggregated record have a non-empty author list.
   */
  @Test
  def testFunderRelationshipsMapping(): Unit = {
    val rawPublications = Source.fromInputStream(getClass.getResourceAsStream("publication.json")).mkString

    val jsonMapper: ObjectMapper = new ObjectMapper()
    jsonMapper.enable(SerializationFeature.INDENT_OUTPUT)

    // every parsed record is prepended, so the list ends up in reverse file order
    val parsed: List[DLIPublication] =
      rawPublications.lines.foldLeft(List[DLIPublication]()) { (acc, line) =>
        jsonMapper.readValue(line, classOf[DLIPublication]) :: acc
      }

    implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
    val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate()

    val inputRdd = spark.sparkContext.parallelize(parsed)
    val ds: Dataset[DLIPublication] = spark.createDataset(inputRdd).as[DLIPublication]

    val keyed = ds.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
    val aggregated = keyed
      .groupByKey(_._1)(Encoders.STRING)
      .agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
      .map(p => p._2)

    val firstAggregated: DLIPublication = aggregated.first()

    parsed.foreach(pp => assertFalse(pp.getAuthor.isEmpty))

    assertNotNull(firstAggregated.getAuthor)
    assertFalse(firstAggregated.getAuthor.isEmpty)
  }

}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "UNKNOWN" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "driver-openaire2.0" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "hostedBy" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "native" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "notCompatible" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0_data" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire3.0" }}
|
@ -0,0 +1 @@
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire4.0" }}
|
@ -0,0 +1,68 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://namespace.openaire.eu/">
|
||||
<dri:objIdentifier>userclaim___::7f0f7807f17db50e5c2b5c452ccaf06d</dri:objIdentifier>
|
||||
<dri:recordIdentifier>userclaim___::7f0f7807f17db50e5c2b5c452ccaf06d</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2020-08-06T07:04:09.62Z</dri:dateOfCollection>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection/>
|
||||
<dr:dateOfTransformation>2020-08-06T07:20:57.911Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>openaire____</oaf:datasourceprefix>
|
||||
</header>
|
||||
<metadata xmlns="http://namespace.openaire.eu/">
|
||||
<dc:title>A case report of serious haemolysis in a glucose-6-phosphate dehydrogenase-deficient COVID-19 patient receiving hydroxychloroquine</dc:title>
|
||||
<dc:creator>Maillart, E.</dc:creator>
|
||||
<dc:creator>Leemans, S.</dc:creator>
|
||||
<dc:creator>Van Noten, H.</dc:creator>
|
||||
<dc:creator>Vandergraesen, T.</dc:creator>
|
||||
<dc:creator>Mahadeb, B.</dc:creator>
|
||||
<dc:creator>Salaouatchi, M. T.</dc:creator>
|
||||
<dc:creator>De Bels, D.</dc:creator>
|
||||
<dc:creator>Clevenbergh, P.</dc:creator>
|
||||
<dc:date/>
|
||||
<dc:identifier>http://dx.doi.org/10.1080/23744235.2020.1774644</dc:identifier>
|
||||
<dc:language/>
|
||||
<dc:publisher>Informa UK Limited</dc:publisher>
|
||||
<dc:source>Crossref</dc:source>
|
||||
<dc:source>Infectious Diseases</dc:source>
|
||||
<dc:subject>Microbiology (medical)</dc:subject>
|
||||
<dc:subject>General Immunology and Microbiology</dc:subject>
|
||||
<dc:subject>Infectious Diseases</dc:subject>
|
||||
<dc:subject>General Medicine</dc:subject>
|
||||
<dc:type>journal-article</dc:type>
|
||||
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2020-06-04</oaf:dateAccepted>
|
||||
<oaf:projectid/>
|
||||
<oaf:accessrights>UNKNOWN</oaf:accessrights>
|
||||
<oaf:hostedBy
|
||||
id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18" name="Unknown Repository"/>
|
||||
<oaf:collectedFrom id="openaire____::crossref" name="Crossref"/>
|
||||
<oaf:identifier identifierType="doi">10.1080/23744235.2020.1774644</oaf:identifier>
|
||||
<oaf:journal eissn="2374-4243" ep="3" iss="" issn="2374-4235" sp="1" vol="">Infectious Diseases</oaf:journal>
|
||||
</metadata>
|
||||
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2020-08-06T07:04:09.62Z">
|
||||
<baseURL>file%3A%2F%2F%2Fsrv%2Fclaims%2Frecords%2Fpublication%2Fcrossref</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="user:claim" classname="user:claim"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
@ -0,0 +1,102 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<oai:header xmlns="http://namespace.openaire.eu/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
|
||||
<dri:objIdentifier>r3a507cdacc5::03b31980d9bb3c4609e6005c4a3baba6</dri:objIdentifier>
|
||||
<dri:recordIdentifier>oai:lindat.mff.cuni.cz:11372/LRT-1844</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2020-09-04T14:36:48.411Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>r3a507cdacc5</oaf:datasourceprefix>
|
||||
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:lindat.mff.cuni.cz:11372/LRT-1844</identifier>
|
||||
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2016-12-07T11:10:30Z</datestamp>
|
||||
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">hdl_11858_00-097C-0000-0007-710A-A</setSpec>
|
||||
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">hdl_11858_00-097C-0000-0007-710B-8</setSpec>
|
||||
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">openaire_data</setSpec>
|
||||
<dr:dateOfTransformation>2020-09-04T14:39:16.458Z</dr:dateOfTransformation>
|
||||
</oai:header>
|
||||
<metadata>
|
||||
<resource xmlns="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
|
||||
<identifier identifierType="Handle">11372/LRT-1844</identifier>
|
||||
<alternateIdentifiers>
|
||||
<alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/11372/LRT-1844</alternateIdentifier>
|
||||
</alternateIdentifiers>
|
||||
<creators>
|
||||
<creator>
|
||||
<creatorName>Hercig, Tomáš</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Brychcín, Tomáš</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Svoboda, Lukáš</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Konkol, Michal</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Steinberger, Josef</creatorName>
|
||||
</creator>
|
||||
</creators>
|
||||
<titles>
|
||||
<title>Restaurant Reviews CZ ABSA corpus v2</title>
|
||||
</titles>
|
||||
<publisher>University of West Bohemia, Department of Computer Science and Engineering</publisher>
|
||||
<publicationYear>2016</publicationYear>
|
||||
<contributors>
|
||||
<contributor contributorType="Funder">
|
||||
<contributorName>European Commission</contributorName>
|
||||
<nameIdentifier nameIdentifierScheme="info">info:eu-repo/grantAgreement/EC/FP7/630786</nameIdentifier>
|
||||
</contributor>
|
||||
</contributors>
|
||||
<dates>
|
||||
<date dateType="Issued">2016</date>
|
||||
<date dateType="Accepted">2016-12-07T11:10:30Z</date>
|
||||
<date dateType="Available">2016-12-07T11:10:30Z</date>
|
||||
</dates>
|
||||
<resourceType resourceTypeGeneral="Dataset">corpus</resourceType>
|
||||
<rightsList>
|
||||
<rights rightsURI="info:eu-repo/semantics/openAccess"/>
|
||||
<rights rightsURI="http://creativecommons.org/licenses/by-nc-sa/4.0/"/>
|
||||
</rightsList>
|
||||
<descriptions>
|
||||
<description descriptionType="Abstract">Restaurant Reviews CZ ABSA - 2.15k reviews with their related target and category
|
||||
|
||||
The work done is described in the paper: https://doi.org/10.13053/CyS-20-3-2469</description>
|
||||
</descriptions>
|
||||
</resource>
|
||||
<oaf:identifier identifierType="handle">11372/LRT-1844</oaf:identifier>
|
||||
<oaf:embargoenddate>2016-12-07</oaf:embargoenddate>
|
||||
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2016-01-01</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>http://creativecommons.org/licenses/by-nc-sa/4.0/</oaf:license>
|
||||
<oaf:language>und</oaf:language>
|
||||
<oaf:projectid>corda_______::630786</oaf:projectid>
|
||||
<oaf:hostedBy id="re3data_____::r3d100010386" name="LINDAT/CLARIN repository"/>
|
||||
<oaf:collectedFrom id="re3data_____::r3d100010386" name="LINDAT/CLARIN repository"/>
|
||||
</metadata>
|
||||
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2020-09-04T14:36:48.411Z">
|
||||
<baseURL>https%3A%2F%2Flindat.mff.cuni.cz%2Frepository%2Foai%2Fopenaire_data</baseURL>
|
||||
<identifier>oai:lindat.mff.cuni.cz:11372/LRT-1844</identifier>
|
||||
<datestamp>2016-12-07T11:10:30Z</datestamp>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||
classname="sysimport:crosswalk:datasetarchive"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
@ -0,0 +1,10 @@
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale 
climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters...","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale 
climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in 
Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale 
climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in 
Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale 
climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in 
Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["10.3390/w11050916"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao, Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan, Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson, Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu, Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao, Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao, Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-01","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":"In terms of climate change and precipitation, there is large interest in how large-scale 
climatic features affect regional rainfall amount and rainfall occurrence. Large-scale climate elements need to be downscaled to the regional level for hydrologic applications. Here, a new Nonhomogeneous Hidden Markov Model (NHMM) called the Bayesian-NHMM is presented for downscaling and predicting of multisite daily rainfall during rainy season over the Huaihe River Basin (HRB). The Bayesian-NHMM provides a Bayesian method for parameters estimation. The model avoids the risk to have no solutions for parameter estimation, which often occurs in the traditional NHMM that uses point estimates of parameters. The Bayesian-NHMM accurately captures seasonality and interannual variability of rainfall amount and wet days during the rainy season. The model establishes a link between large-scale meteorological characteristics and local precipitation patterns. It also provides a more stable and efficient method to estimate parameters in the model. These results suggest that prediction of daily precipitation could be improved by the suggested new Bayesian-NHMM method, which can be helpful for water resources management and research on climate change.","dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[{"license":null,"accessright":null,"instancetype":null,"hostedby":{"key":"openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18","value":"Unknown Repository","dataInfo":null},"url":["10.3390/w11050916"],"distributionlocation":null,"collectedfrom":null,"dateofacceptance":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null}],"journal":null,"originalObjIdentifier":"datacite____::100bb045f34ea2da81433d0b9ae3afa1","dlicollectedfrom":[{"id":"dli_________::datacite","name":"Datasets in 
Datacite","completionStatus":"complete","collectionMode":null}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-10-04T14:16:06.105Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI 
AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-09-27T11:39:38.835Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI 
AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-30T11:48:49.809Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI 
AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-14T14:25:55.176Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI 
AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"}
|
||||
{"collectedfrom":[{"key":"dli_________::crossref","value":"Crossref","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"id":"50|1307198540d2264d839dfd8c9a19f4a7","originalId":["1307198540d2264d839dfd8c9a19f4a7"],"pid":[{"value":"10.3390/w11050916","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":null}],"dateofcollection":"2020-08-09T11:35:23.526Z","dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"author":[{"fullname":"Cao Qing","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Zhenchun","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Yuan Feifei","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Berndtsson Ronny","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Xu Shijie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Gao Huibin","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null},{"fullname":"Hao Jie","name":null,"surname":null,"rank":null,"pid":null,"affiliation":null}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"publication","schemename":"publication"},"language":null,"country":null,"subject":[],"title":[{"value":"On the Predictability of Daily Rainfall during Rainy Season over the Huaihe River Basin","qualifier":{"classid":"main title","classname":null,"schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":null}],"relevantdate":[{"value":"2019-05-02T07:15:22Z","qualifier":{"classid":"date","classname":"date","schemeid":"dnet::date","schemename":"dnet::date"},"dataInfo":null}],"description":[{"value":null,"dataInfo":null}],"dateofacceptance":null,"publisher":{"value":"MDPI 
AG","dataInfo":null},"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":[],"journal":null,"originalObjIdentifier":"dli_resolver::1307198540d2264d839dfd8c9a19f4a7","dlicollectedfrom":[{"id":"dli_________::crossref","name":"Crossref","completionStatus":"complete","collectionMode":"resolved"}],"completionStatus":"complete"}
|
@ -0,0 +1,111 @@
|
||||
|
||||
package eu.dnetlib.dhp.export.zenodo;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class MakeTar implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
MakeTar.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String outputPath = parser.get("targetPath");
|
||||
log.info("hdfsPath: {}", outputPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", hdfsNameNode);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("input path : {}", inputPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
makeTArArchive(fileSystem, inputPath, outputPath);
|
||||
|
||||
}
|
||||
|
||||
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
|
||||
|
||||
while (dir_iterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = dir_iterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
|
||||
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
|
||||
throws IOException {
|
||||
|
||||
Path hdfsWritePath = new Path(outputPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, true);
|
||||
|
||||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
|
||||
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
new Path(inputPath), true);
|
||||
|
||||
while (fileStatusListIterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
|
||||
entry.setSize(fileStatus.getLen());
|
||||
ar.putArchiveEntry(entry);
|
||||
|
||||
InputStream is = fileSystem.open(fileStatus.getPath());
|
||||
|
||||
BufferedInputStream bis = new BufferedInputStream(is);
|
||||
|
||||
int count;
|
||||
byte data[] = new byte[1024];
|
||||
while ((count = bis.read(data, 0, data.length)) != -1) {
|
||||
ar.write(data, 0, count);
|
||||
}
|
||||
bis.close();
|
||||
ar.closeArchiveEntry();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ar.close();
|
||||
}
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue