Merge branch 'beta' into clean_country

This commit is contained in:
Claudio Atzori 2022-09-09 10:38:17 +02:00
commit 14dc909a14
22 changed files with 1178 additions and 173 deletions

View File

@ -5,13 +5,71 @@ import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.Serializable; import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTarArchive implements Serializable { public class MakeTarArchive implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTarArchive.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dirIterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dirIterator.hasNext()) {
LocatedFileStatus fileStatus = dirIterator.next();
Path p = fileStatus.getPath();
String pathString = p.toString();
String entity = pathString.substring(pathString.lastIndexOf("/") + 1);
MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit);
}
}
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException { private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath); Path hdfsWritePath = new Path(outputPath);
if (fileSystem.exists(hdfsWritePath)) { if (fileSystem.exists(hdfsWritePath)) {
@ -21,7 +79,7 @@ public class MakeTarArchive implements Serializable {
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream()); return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
} }
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName)
throws IOException { throws IOException {
Path hdfsWritePath = new Path(outputPath); Path hdfsWritePath = new Path(outputPath);
@ -37,7 +95,7 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true); new Path(inputPath), true);
while (iterator.hasNext()) { while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, iterator, ar, 0); writeCurrentFile(fileSystem, dirName, iterator, ar, 0);
} }
} }
@ -59,32 +117,30 @@ public class MakeTarArchive implements Serializable {
new Path(inputPath), true); new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext(); boolean next = fileStatusListIterator.hasNext();
while (next) { while (next) {
TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar"); try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) {
long current_size = 0; long currentSize = 0;
while (next && current_size < bytesPerSplit) { while (next && currentSize < bytesPerSplit) {
current_size = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, current_size); currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize);
next = fileStatusListIterator.hasNext(); next = fileStatusListIterator.hasNext();
}
partNum += 1;
} }
partNum += 1;
ar.close();
} }
} }
} }
private static long writeCurrentFile(FileSystem fileSystem, String dir_name, private static long writeCurrentFile(FileSystem fileSystem, String dirName,
RemoteIterator<LocatedFileStatus> fileStatusListIterator, RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long current_size) throws IOException { TarArchiveOutputStream ar, long currentSize) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next(); LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath(); Path p = fileStatus.getPath();
String p_string = p.toString(); String pString = p.toString();
if (!p_string.endsWith("_SUCCESS")) { if (!pString.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1); String name = pString.substring(pString.lastIndexOf("/") + 1);
if (name.startsWith("part-") & name.length() > 10) { if (name.startsWith("part-") & name.length() > 10) {
String tmp = name.substring(0, 10); String tmp = name.substring(0, 10);
if (name.contains(".")) { if (name.contains(".")) {
@ -92,9 +148,9 @@ public class MakeTarArchive implements Serializable {
} }
name = tmp; name = tmp;
} }
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name); TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name);
entry.setSize(fileStatus.getLen()); entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen(); currentSize += fileStatus.getLen();
ar.putArchiveEntry(entry); ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath()); InputStream is = fileSystem.open(fileStatus.getPath());
@ -110,7 +166,7 @@ public class MakeTarArchive implements Serializable {
ar.closeArchiveEntry(); ar.closeArchiveEntry();
} }
return current_size; return currentSize;
} }
} }

View File

@ -0,0 +1,30 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "hdp",
"paramLongName": "hdfsPath",
"paramDescription": "the path used to store the output archive",
"paramRequired": true
},
{
"paramName":"nn",
"paramLongName":"nameNode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName":"ss",
"paramLongName":"splitSize",
"paramDescription": "the maximum size of the archive",
"paramRequired": false
}
]

View File

@ -3,6 +3,8 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.util.*; import java.util.*;
@ -13,6 +15,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function;
@ -20,6 +23,7 @@ import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.gson.JsonElement; import com.google.gson.JsonElement;
import com.google.gson.JsonParser; import com.google.gson.JsonParser;
@ -42,6 +46,7 @@ public class SparkDownloadOrcidWorks {
public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
.ofPattern(ORCID_XML_DATETIME_FORMAT); .ofPattern(ORCID_XML_DATETIME_FORMAT);
public static final String DOWNLOAD_WORKS_REQUEST_SEPARATOR = ",";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -56,7 +61,6 @@ public class SparkDownloadOrcidWorks {
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingPath = parser.get("workingPath");
logger.info("workingPath: {}", workingPath); logger.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
@ -69,32 +73,22 @@ public class SparkDownloadOrcidWorks {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt"); final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
logger.info("lastUpdateValue: ", lastUpdateValue);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors"); LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors"); LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works"); LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works"); LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
LongAccumulator maxModifiedWorksLimitAcc = spark
.sparkContext()
.longAccumulator("max_modified_works_limit");
LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found"); LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
LongAccumulator errorLoadingJsonFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_json_found");
LongAccumulator errorLoadingXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_xml_found");
LongAccumulator errorParsingXMLFoundAcc = spark LongAccumulator errorParsingXMLFoundAcc = spark
.sparkContext() .sparkContext()
.longAccumulator("error_parsing_xml_found"); .longAccumulator("error_parsing_xml_found");
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class); .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
updatedAuthorsAcc.setValue(updatedAuthorsRDD.count()); long authorsCount = updatedAuthorsRDD.count();
updatedAuthorsAcc.setValue(authorsCount);
FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> { FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
String orcidId = data._1().toString(); String orcidId = data._1().toString();
@ -106,11 +100,10 @@ public class SparkDownloadOrcidWorks {
if (statusCode.equals("200")) { if (statusCode.equals("200")) {
String compressedData = getJsonValue(jElement, "compressedData"); String compressedData = getJsonValue(jElement, "compressedData");
if (StringUtils.isEmpty(compressedData)) { if (StringUtils.isEmpty(compressedData)) {
errorLoadingJsonFoundAcc.add(1);
} else { } else {
String authorSummary = ArgumentApplicationParser.decompressValue(compressedData); String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
if (StringUtils.isEmpty(authorSummary)) { if (StringUtils.isEmpty(authorSummary)) {
errorLoadingXMLFoundAcc.add(1);
} else { } else {
try { try {
workIdLastModifiedDate = XMLRecordParser workIdLastModifiedDate = XMLRecordParser
@ -125,22 +118,38 @@ public class SparkDownloadOrcidWorks {
errorCodeFoundAcc.add(1); errorCodeFoundAcc.add(1);
} }
parsedAuthorsAcc.add(1); parsedAuthorsAcc.add(1);
workIdLastModifiedDate.forEach((k, v) -> { workIdLastModifiedDate.forEach((k, v) -> {
parsedWorksAcc.add(1); parsedWorksAcc.add(1);
if (isModified(orcidId, v, lastUpdateValue)) { if (isModified(orcidId, v, lastUpdateValue)) {
modifiedWorksAcc.add(1); modifiedWorksAcc.add(1);
workIds.add(orcidId.concat("/work/").concat(k)); workIds.add(k);
} }
}); });
if (workIdLastModifiedDate.size() > 50) { if (workIds.isEmpty()) {
maxModifiedWorksLimitAcc.add(1); return new ArrayList<String>().iterator();
} }
return workIds.iterator(); List<String> worksDownloadUrls = new ArrayList<>();
// Creation of url for reading multiple works (up to 100) with ORCID API
// see this https://github.com/ORCID/ORCID-Source/blob/development/orcid-api-web/tutorial/works.md
List<List<String>> partitionedWorks = Lists.partition(workIds, 100);
partitionedWorks.stream().forEach(p -> {
String worksDownloadUrl = orcidId.concat("/works/");
final StringBuffer buffer = new StringBuffer(worksDownloadUrl);
p.forEach(id -> {
buffer.append(id).append(DOWNLOAD_WORKS_REQUEST_SEPARATOR);
});
String finalUrl = buffer.substring(0, buffer.lastIndexOf(DOWNLOAD_WORKS_REQUEST_SEPARATOR));
worksDownloadUrls.add(finalUrl);
});
return worksDownloadUrls.iterator();
}; };
Function<String, Tuple2<String, String>> downloadWorkFunction = data -> { Function<String, Tuple2<String, String>> downloadWorksFunction = data -> {
String relativeWorkUrl = data; String relativeWorksUrl = data;
String orcidId = relativeWorkUrl.split("/")[0]; String orcidId = relativeWorksUrl.split("/")[0];
final DownloadedRecordData downloaded = new DownloadedRecordData(); final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId); downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastUpdateValue); downloaded.setLastModifiedDate(lastUpdateValue);
@ -149,7 +158,7 @@ public class SparkDownloadOrcidWorks {
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
httpConnector.setAuthToken(token); httpConnector.setAuthToken(token);
String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl; String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorksUrl;
DownloadsReport report = new DownloadsReport(); DownloadsReport report = new DownloadsReport();
long startReq = System.currentTimeMillis(); long startReq = System.currentTimeMillis();
boolean downloadCompleted = false; boolean downloadCompleted = false;
@ -167,7 +176,6 @@ public class SparkDownloadOrcidWorks {
} else { } else {
downloaded.setStatusCode(-4); downloaded.setStatusCode(-4);
} }
errorsAcc.add(1);
} }
long endReq = System.currentTimeMillis(); long endReq = System.currentTimeMillis();
long reqTime = endReq - startReq; long reqTime = endReq - startReq;
@ -176,7 +184,6 @@ public class SparkDownloadOrcidWorks {
} }
if (downloadCompleted) { if (downloadCompleted) {
downloaded.setStatusCode(200); downloaded.setStatusCode(200);
downloadedRecordsAcc.add(1);
downloaded downloaded
.setCompressedData( .setCompressedData(
ArgumentApplicationParser ArgumentApplicationParser
@ -185,24 +192,69 @@ public class SparkDownloadOrcidWorks {
return downloaded.toTuple2(); return downloaded.toTuple2();
}; };
FlatMapFunction<Tuple2<String, String>, Tuple2<String, String>> splitWorksFunction = data -> {
List<Tuple2<String, String>> splittedDownloadedWorks = new ArrayList<>();
String jsonData = data._2().toString();
JsonElement jElement = new JsonParser().parse(jsonData);
String orcidId = data._1().toString();
String statusCode = getJsonValue(jElement, "statusCode");
String lastModifiedDate = getJsonValue(jElement, "lastModifiedDate");
String compressedData = getJsonValue(jElement, "compressedData");
String errorMessage = getJsonValue(jElement, "errorMessage");
String works = ArgumentApplicationParser.decompressValue(compressedData);
// split a single xml containing multiple works into multiple xml (a single work for each xml)
List<String> splittedWorks = null;
try {
splittedWorks = XMLRecordParser
.splitWorks(orcidId, works.getBytes(StandardCharsets.UTF_8));
} catch (Throwable t) {
final DownloadedRecordData errDownloaded = new DownloadedRecordData();
errDownloaded.setOrcidId(orcidId);
errDownloaded.setLastModifiedDate(lastModifiedDate);
errDownloaded.setStatusCode(-10);
errDownloaded.setErrorMessage(t.getMessage());
splittedDownloadedWorks.add(errDownloaded.toTuple2());
errorParsingXMLFoundAcc.add(1);
return splittedDownloadedWorks.iterator();
}
splittedWorks.forEach(w -> {
final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastModifiedDate);
downloaded.setStatusCode(Integer.parseInt(statusCode));
downloaded.setErrorMessage(errorMessage);
try {
downloaded
.setCompressedData(
ArgumentApplicationParser
.compressArgument(w));
} catch (Throwable t) {
downloaded.setStatusCode(-11);
downloaded.setErrorMessage(t.getMessage());
}
splittedDownloadedWorks.add(downloaded.toTuple2());
downloadedRecordsAcc.add(1);
});
return splittedDownloadedWorks.iterator();
};
updatedAuthorsRDD updatedAuthorsRDD
.flatMap(retrieveWorkUrlFunction) .flatMap(retrieveWorkUrlFunction)
.repartition(100) .repartition(100)
.map(downloadWorkFunction) .map(downloadWorksFunction)
.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) .flatMap(splitWorksFunction)
.mapToPair(w -> new Tuple2<>(new Text(w._1()), new Text(w._2())))
.saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class); .saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value()); logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value());
logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value()); logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value());
logger.info("parsedWorksAcc: {}", parsedWorksAcc.value()); logger.info("parsedWorksAcc: {}", parsedWorksAcc.value());
logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value()); logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value());
logger.info("maxModifiedWorksLimitAcc: {}", maxModifiedWorksLimitAcc.value());
logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value()); logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value());
logger.info("errorLoadingJsonFoundAcc: {}", errorLoadingJsonFoundAcc.value());
logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value()); logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value()); logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
logger.info("errorsAcc: {}", errorsAcc.value());
}); });
} }

View File

@ -83,8 +83,6 @@ public class MultiAttemptsHttpConnector {
throw new CollectorException(msg); throw new CollectorException(msg);
} }
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null; InputStream input = null;
try { try {
@ -104,9 +102,9 @@ public class MultiAttemptsHttpConnector {
urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken())); urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken()));
} }
if (log.isDebugEnabled()) { // if (log.isDebugEnabled()) {
logHeaderFields(urlConn); // logHeaderFields(urlConn);
} // }
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (is2xx(urlConn.getResponseCode())) { if (is2xx(urlConn.getResponseCode())) {

View File

@ -1,7 +1,11 @@
package eu.dnetlib.doiboost.orcid.xml; package eu.dnetlib.doiboost.orcid.xml;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.mortbay.log.Log; import org.mortbay.log.Log;
@ -34,6 +38,33 @@ public class XMLRecordParser {
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_HISTORY = "history"; private static final String NS_HISTORY = "history";
private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history"; private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
private static final String NS_BULK_URL = "http://www.orcid.org/ns/bulk";
private static final String NS_BULK = "bulk";
private static final String namespaceList = " xmlns:internal=\"http://www.orcid.org/ns/internal\"\n" +
" xmlns:education=\"http://www.orcid.org/ns/education\"\n" +
" xmlns:distinction=\"http://www.orcid.org/ns/distinction\"\n" +
" xmlns:deprecated=\"http://www.orcid.org/ns/deprecated\"\n" +
" xmlns:other-name=\"http://www.orcid.org/ns/other-name\"\n" +
" xmlns:membership=\"http://www.orcid.org/ns/membership\"\n" +
" xmlns:error=\"http://www.orcid.org/ns/error\" xmlns:common=\"http://www.orcid.org/ns/common\"\n" +
" xmlns:record=\"http://www.orcid.org/ns/record\"\n" +
" xmlns:personal-details=\"http://www.orcid.org/ns/personal-details\"\n" +
" xmlns:keyword=\"http://www.orcid.org/ns/keyword\" xmlns:email=\"http://www.orcid.org/ns/email\"\n" +
" xmlns:external-identifier=\"http://www.orcid.org/ns/external-identifier\"\n" +
" xmlns:funding=\"http://www.orcid.org/ns/funding\"\n" +
" xmlns:preferences=\"http://www.orcid.org/ns/preferences\"\n" +
" xmlns:address=\"http://www.orcid.org/ns/address\"\n" +
" xmlns:invited-position=\"http://www.orcid.org/ns/invited-position\"\n" +
" xmlns:work=\"http://www.orcid.org/ns/work\" xmlns:history=\"http://www.orcid.org/ns/history\"\n" +
" xmlns:employment=\"http://www.orcid.org/ns/employment\"\n" +
" xmlns:qualification=\"http://www.orcid.org/ns/qualification\"\n" +
" xmlns:service=\"http://www.orcid.org/ns/service\" xmlns:person=\"http://www.orcid.org/ns/person\"\n" +
" xmlns:activities=\"http://www.orcid.org/ns/activities\"\n" +
" xmlns:researcher-url=\"http://www.orcid.org/ns/researcher-url\"\n" +
" xmlns:peer-review=\"http://www.orcid.org/ns/peer-review\"\n" +
" xmlns:bulk=\"http://www.orcid.org/ns/bulk\"\n" +
" xmlns:research-resource=\"http://www.orcid.org/ns/research-resource\"";
private static final String NS_ERROR = "error"; private static final String NS_ERROR = "error";
@ -307,4 +338,65 @@ public class XMLRecordParser {
} }
return authorHistory; return authorHistory;
} }
public static List<String> splitWorks(String orcidId, byte[] bytes)
throws ParseException, XPathParseException, NavException, XPathEvalException, VtdException, ModifyException,
IOException, TranscodeException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
ap.declareXPathNameSpace(NS_BULK, NS_BULK_URL);
List<String> works = new ArrayList<>();
try {
ap.selectXPath("//work:work");
while (ap.evalXPath() != -1) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
long l = vn.getElementFragment();
String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>";
bos.write(xmlHeader.getBytes(StandardCharsets.UTF_8));
bos.write(vn.getXML().getBytes(), (int) l, (int) (l >> 32));
works.add(bos.toString());
bos.close();
}
} catch (Exception e) {
throw new VtdException(e);
}
List<VTDGen> vgModifiers = Arrays.asList(new VTDGen());
List<XMLModifier> xmModifiers = Arrays.asList(new XMLModifier());
List<ByteArrayOutputStream> buffer = Arrays.asList(new ByteArrayOutputStream());
List<String> updatedWorks = works.stream().map(work -> {
vgModifiers.get(0).setDoc(work.getBytes());
try {
vgModifiers.get(0).parse(false);
final VTDNav vnModifier = vgModifiers.get(0).getNav();
xmModifiers.get(0).bind(vnModifier);
vnModifier.toElement(VTDNav.ROOT);
int attr = vnModifier.getAttrVal("put-code");
if (attr > -1) {
xmModifiers
.get(0)
.insertAttribute(
" path=\"/" + orcidId + "/work/" + vnModifier.toNormalizedString(attr) + "\""
+ " " + namespaceList);
}
buffer.set(0, new ByteArrayOutputStream());
xmModifiers.get(0).output(buffer.get(0));
buffer.get(0).close();
return buffer.get(0).toString();
} catch (NavException | ModifyException | IOException | TranscodeException | ParseException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}).collect(Collectors.toList());
return updatedWorks;
}
} }

View File

@ -161,13 +161,11 @@ public class OrcidClientTest {
@Test @Test
@Disabled @Disabled
void testReadBase64CompressedRecord() throws Exception { void testReadBase64CompressedWork() throws Exception {
final String base64CompressedRecord = IOUtils final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile); logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
assertEquals(recordFromSeqFile, downloadedRecord);
} }
@Test @Test
@ -337,7 +335,7 @@ public class OrcidClientTest {
@Ignore @Ignore
void testUpdatedRecord() throws Exception { void testUpdatedRecord() throws Exception {
final String base64CompressedRecord = IOUtils final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord); final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\nrecord updated \n\n" + record); logToFile(testPath, "\n\nrecord updated \n\n" + record);
} }

View File

@ -108,4 +108,12 @@ public class XMLRecordParserTest {
work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml)); work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
OrcidClientTest.logToFile(testPath, JsonWriter.create(work)); OrcidClientTest.logToFile(testPath, JsonWriter.create(work));
} }
@Test
void testWorksSplit() throws Exception {
String xml = IOUtils
.toString(
this.getClass().getResourceAsStream("multiple_downloaded_works.xml"));
XMLRecordParser.splitWorks("0000-0001-7291-3210", xml.getBytes());
}
} }

View File

@ -0,0 +1 @@
H4sIAAAAAAAAAN1Y23LbNhB971dg+NAnkiKpSJZUS2luTerESSd22pm+QSQkISEBFiAlqxn9exe8CaRExkmc4Uw9Y9rEnrO72MUCC14+votCtCVCUs7mhms7BiLM5wFl67nx4fY3a2IgmWAW4JAzMjf2RBqPF5c7Lj7N1APFONnMjYEDPxb8utaFN3Gt8dAZD5R8MHWHk+nF1DUQAlNMzihLiGA4nBubJIlng8Fut7O58GkAz/WAyUGJMH5CqGSRIPVxkjnZRqsgOi+gMqHM72ZqoBqXxIKAShJ0UCuMzuTJhgiL4Yi0M48YnRmRaAnZ2NC4nXnE1CIkBBcd0VFio8D6PIq6ApLLde0wSS464pDLdUYMLnLIohWQBNNQtnObSF3LJ7LfdRouAOXMSAQaOqKgxLWo3eVrzaIBYQldUdIVw1OwrmuVsrxu2vgFoBYlQVZEQMmRrgAdQToXB4EgsoNXAHQOZVsKi9WKuaTdRdFE6lpUZbczlbTMxwZKi4t9O7gA1HISxSHfRxDirkSWGJ35T4pDSMuXdooaTOdLIrbU7yjaAmDU1viXVnYtZ7DLQFxpV7qPmHoFSoKFrzaNVHQs8TquXpEwJsiWkl2XyxVI5y7TsCPjSnrOV1AkeSq6InoCNVCcJhYcQUA6Hh5bKumShjSBpRSny5D6xiIzqH4u8/1q5guidmIrgOfCczzXcoaWN711vdnQm7mPbGfs/X05OIc+0RVimVgRHIRQ5UeNnuWMLce9dUDdaOY59tgdHjWe4ZzozSd5HD+VWX5IYV3DJlNH6chU0IWKqISQHsOZE6uz2LNG04lnTaaTYeWiIrZqVWf5ooudAVrpGy6TReVNRcqG6/Md3GvCjbCoo3Jx4/M4lchCL0KpFqlo6spQZ9VgCdWrKt7igq6p+uN/fYzPNDrfENxz7IcO7n3m2xqbLIxXXG5SjJ7idL1pV1uPeCMfmiDrGROahC35yUXPOHR/UcwFFnskU9hutziEnjSIOfSFcoaeMFQ0iMoJkEG5rVJJ1KigTFIfxaCDMoLWIeURRoKs4ZBR6pI02FcONly5HJxzMPf6I8xFnfu58C1JBbfeQZsc8vW+4NUhDb5Pk8zbxsRrMivZx2SxpMuE3BU666IuLsQoJYtfMSTGD8nnLGOe416YmTtojj7/8LgezCIEylo9RAdzD3u8Glc+HcwtD9Mo88qdHkyWqnZWvcFLjNdEZhLvYmq53sQ5mDhNNlzkk4BLyN5EtzaCKwl6gxkx0ZP85SlMnoTSRB+Kd56uViQx0Yv8/SUPgwgzE90UZHBpr95e2MXIb1yQDPHWfp2P/IH9T0SY6L19VSgVnFHpq7HC7DWEB6Ztoiu7MHSzoRsTPbOtQu2zDUDwOo1iHGITXeejr6COcBhWc3nJkwSLgCvrL/Oh5xseYkGB86rg8NUqc/BNqRln4XhaRgCyrhzJ2RzeMvT7asJ+Ji7YVxBLqch/ltNPQxzQysO/sICe00Svy4ldc/aRKPHh0Fyg+fpr1tLpsi82AbWcy4Ip1mxZfrWVXu2d2Ymfm6ofqzpKLbKFWmFViWcjp1tTu7pSldbpy/PGNET7pq2B8hoOOK28OBHeS00eadexXWc6HDCScuYPGL9znYuzmhuZ6VLNuIigMf6XBCgRGCo+68ATkRLjKwwetdzPqiBhlgl1n11IEq7Oaq2hzp93rRn5vpQRGjxIyjxLerZjTUbO0L2YjkfjRz8yX/e09n9LFpWSPUyBjbzhaDIeI/jHm4zcH1tcYMxS1h4+RzFsrxZ/2DSdk8rTPRRunwvt1iezzt0G4YCyHRx1xTcjG3CPocjmp0v2ZxzFv6gZMCJ+fz6/fju5fffk/Y3Wb4cnnRZX3coyTbhobtxN+Zlo5hBBAprkbe2x4SiPNE3YCFm3/m8yXzY4vRjXGqp+7B8buF7saw1jP8nXG9RePKg1xL14oDfg/SxCveHvxYPaBaMXD7QLTS/2Ty5QvXihXdh62o70C2IvLugX0n5ycLwA97QSywt3TydyccHvJ/vaB4W+DsTyA0Yv9rUPJj0dx9UHml7s6x+E+jkKyw9Q32P9VFZcFAqBeiz+A4MY5OQYIQAA

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<bulk:bulk xmlns:internal="http://www.orcid.org/ns/internal" xmlns:education="http://www.orcid.org/ns/education" xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:deprecated="http://www.orcid.org/ns/deprecated" xmlns:other-name="http://www.orcid.org/ns/other-name" xmlns:membership="http://www.orcid.org/ns/membership" xmlns:error="http://www.orcid.org/ns/error" xmlns:common="http://www.orcid.org/ns/common" xmlns:record="http://www.orcid.org/ns/record" xmlns:personal-details="http://www.orcid.org/ns/personal-details" xmlns:keyword="http://www.orcid.org/ns/keyword" xmlns:email="http://www.orcid.org/ns/email" xmlns:external-identifier="http://www.orcid.org/ns/external-identifier" xmlns:funding="http://www.orcid.org/ns/funding" xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:address="http://www.orcid.org/ns/address" xmlns:invited-position="http://www.orcid.org/ns/invited-position" xmlns:work="http://www.orcid.org/ns/work" xmlns:history="http://www.orcid.org/ns/history" xmlns:employment="http://www.orcid.org/ns/employment" xmlns:qualification="http://www.orcid.org/ns/qualification" xmlns:service="http://www.orcid.org/ns/service" xmlns:person="http://www.orcid.org/ns/person" xmlns:activities="http://www.orcid.org/ns/activities" xmlns:researcher-url="http://www.orcid.org/ns/researcher-url" xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:research-resource="http://www.orcid.org/ns/research-resource">
<work:work put-code="16639612" visibility="public">
<common:created-date>2015-05-23T18:56:52.486Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.454Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL</common:title>
</work:title>
<work:journal-title>5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002}, Ee= {http://www.db.ucsd.edu/webdb2002/papers/43.pdf}, Pages= {13-18}, Title= {The Query Language TQL}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
<work:work put-code="16639628" visibility="public">
<common:created-date>2015-05-23T18:58:18.492Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.455Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL - Demo Presentation</common:title>
</work:title>
<work:journal-title>X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002Demo, Address= {Portoferraio, Italy}, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)}, Month= {June}, Pages= {427-431}, Title= {The Query Language TQL - Demo Presentation}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
</bulk:bulk>

View File

@ -7,5 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout. # A1 uses PatternLayout.
log4j.logger.org = ERROR log4j.logger.org = ERROR
log4j.logger.eu.dnetlib = DEBUG log4j.logger.eu.dnetlib = DEBUG
log4j.logger.eu.dnetlib.doiboost.orcid = INFO
log4j.appender.A1.layout=org.apache.log4j.PatternLayout log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@ -2,5 +2,6 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"r", "paramLongName":"relationPath", "paramDescription": "the relation resolved Path", "paramRequired": true}, {"paramName":"r", "paramLongName":"relationPath", "paramDescription": "the relation resolved Path", "paramRequired": true},
{"paramName":"s", "paramLongName":"summaryPath", "paramDescription": "the summary Path", "paramRequired": true}, {"paramName":"s", "paramLongName":"summaryPath", "paramDescription": "the summary Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target base path of the scholix", "paramRequired": true} {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target base path of the scholix", "paramRequired": true},
{"paramName":"dc", "paramLongName":"dumpCitations", "paramDescription": "should dump citation relations", "paramRequired": false}
] ]

View File

@ -16,7 +16,11 @@
<name>maxNumberOfPid</name> <name>maxNumberOfPid</name>
<description>filter relation with at least #maxNumberOfPid</description> <description>filter relation with at least #maxNumberOfPid</description>
</property> </property>
<property>
<name>dumpCitations</name>
<value>false</value>
<description>should dump citation relations</description>
</property>
</parameters> </parameters>
<start to="ImportDatasetEntities"/> <start to="ImportDatasetEntities"/>
@ -98,6 +102,7 @@
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg> <arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg> <arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
<arg>--relationPath</arg><arg>${targetPath}/relation</arg> <arg>--relationPath</arg><arg>${targetPath}/relation</arg>
<arg>--dumpCitations</arg><arg>${dumpCitations}</arg>
</spark> </spark>
<ok to="DropJSONPath"/> <ok to="DropJSONPath"/>
<error to="Kill"/> <error to="Kill"/>
@ -135,11 +140,21 @@
<arg>--objectType</arg><arg>scholix</arg> <arg>--objectType</arg><arg>scholix</arg>
<arg>--maxPidNumberFilter</arg><arg>maxNumberOfPid</arg> <arg>--maxPidNumberFilter</arg><arg>maxNumberOfPid</arg>
</spark> </spark>
<ok to="make_tar"/>
<error to="Kill"/>
</action>
<action name="make_tar">
<java>
<main-class>eu.dnetlib.dhp.common.MakeTarArchive</main-class>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--hdfsPath</arg><arg>${targetPath}/tar</arg>
<arg>--sourcePath</arg><arg>${targetPath}/json</arg>
</java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,7 +1,10 @@
package eu.dnetlib.dhp.sx.graph package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
@ -9,7 +12,8 @@ import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.reflect.ClassTag
import scala.util.Try
object SparkConvertRDDtoDataset { object SparkConvertRDDtoDataset {
@ -36,11 +40,12 @@ object SparkConvertRDDtoDataset {
val t = parser.get("targetPath") val t = parser.get("targetPath")
log.info(s"targetPath -> $t") log.info(s"targetPath -> $t")
val filterRelation = parser.get("filterRelation") val subRelTypeFilter = parser.get("filterRelation")
log.info(s"filterRelation -> $filterRelation") log.info(s"filterRelation -> $subRelTypeFilter")
val entityPath = s"$t/entities" val entityPath = s"$t/entities"
val relPath = s"$t/relation" val relPath = s"$t/relation"
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset]) implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication]) implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
@ -99,44 +104,66 @@ object SparkConvertRDDtoDataset {
log.info("Converting Relation") log.info("Converting Relation")
if (filterRelation != null && StringUtils.isNoneBlank(filterRelation)) { val relClassFilter = List(
ModelConstants.MERGES,
ModelConstants.IS_MERGED_IN,
ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
)
val rddRelation = spark.sparkContext val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation") .textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation])) .map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false) .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations .filter(r => filterRelations(subRelTypeFilter, relClassFilter, r))
.filter(r => //filter OpenCitations relations
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => .filter(r =>
"opencitations".equalsIgnoreCase(k.getValue) r.getDataInfo.getProvenanceaction != null &&
) !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
)
.filter(r => r.getSubRelType != null && r.getSubRelType.equalsIgnoreCase(filterRelation))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} else {
val relationSemanticFilter = List(
"merges",
"ismergedin",
"HasAmongTopNSimilarDocuments",
"IsAmongTopNSimilarDocuments"
) )
val rddRelation = spark.sparkContext spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations
.filter(r =>
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
"opencitations".equalsIgnoreCase(k.getValue)
)
)
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
}
} }
private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = {
if (StringUtils.isNotBlank(subRelTypeFilter)) {
subRelTypeFilter.equalsIgnoreCase(r.getSubRelType)
} else {
!relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))
}
}
/*
//TODO: finalise implementation
private def processResult[T<: Result](
implicit ct: ClassTag[T],
log: Logger,
spark: SparkSession,
sourcePath: String,
entityPath: String,
clazz: Class[T]
): Unit = {
val entityType = clazz.getSimpleName.toLowerCase
log.info(s"Converting $entityType")
val mapper = new ObjectMapper() with ScalaObjectMapper
mapper.registerModule(DefaultScalaModule)
val rdd = spark.sparkContext
.textFile(s"$sourcePath/$entityType")
.map(s => mapper.readValue(s, clazz))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference);
implicit val encoder: Encoder[T] = Encoders.kryo(clazz)
spark
.createDataset(rdd)
.as[T]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/$entityType")
}
*/
} }

View File

@ -12,6 +12,8 @@ import org.apache.spark.sql.functions.count
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.util.Try
object SparkCreateScholix { object SparkCreateScholix {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
@ -37,6 +39,8 @@ object SparkCreateScholix {
log.info(s"summaryPath -> $summaryPath") log.info(s"summaryPath -> $summaryPath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
val dumpCitations = Try(parser.get("dumpCitations").toBoolean).getOrElse(false)
log.info(s"dumpCitations -> $dumpCitations")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation] implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
@ -138,7 +142,7 @@ object SparkCreateScholix {
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read
.load(s"$targetPath/related_entities") .load(s"$targetPath/related_entities")
.as[RelatedEntities] .as[RelatedEntities]
.filter(r => r.relatedPublication > 0 || r.relatedDataset > 0) .filter(r => dumpCitations || r.relatedPublication > 0 || r.relatedDataset > 0)
relatedEntitiesDS relatedEntitiesDS
.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner") .joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner")

View File

@ -926,6 +926,24 @@ class MappersTest {
// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); // assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
} }
@Test
void testROHub2() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
}
@Test
void testRiunet() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("riunet.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(list));
System.out.println("***************");
}
private void assertValidId(final String id) { private void assertValidId(final String id) {
// System.out.println(id); // System.out.println(id);

View File

@ -1049,6 +1049,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier
dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier
dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle
dnet:pid_types @=@ dnet:pid_types @=@ data.europa.eu @=@ EU Persistent URL dnet:pid_types @=@ dnet:pid_types @=@ data.europa.eu @=@ EU Persistent URL
dnet:pid_types @=@ dnet:pid_types @=@ w3id @=@ w3id.org
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications

View File

@ -0,0 +1,93 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="HTTP://www.openarchives.org/OAI/2.0/">
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:riunet.upv.es:10251/178464</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2022-05-10T09:12:14Z</datestamp>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">com_10251_3822</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">col_10251_169956</setSpec>
<dr:dateOfTransformation>2022-09-01T07:51:12.657Z</dr:dateOfTransformation>
<dri:objIdentifier>od______1560::8f7a139735f493882bb0f4abceb6e200</dri:objIdentifier>
<dri:recordIdentifier>od______1560::8f7a139735f493882bb0f4abceb6e200</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>riunet________</oaf:datasourceprefix>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="DOI">10.4995/ijpme.2020.12944</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="Handle">10251/148537</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers/>
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
<datacite:version>VoR</datacite:version>
<datacite:rightsList>
<datacite:rights rightsURI=" http://creativecommons.org/licenses/by-nc-nd/4.0/">http://creativecommons.org/licenses/by-nc-nd/4.0/</datacite:rights>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:rightsList>
<datacite:titles>
<datacite:title>Production planning in 3D printing factories</datacite:title>
</datacite:titles>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">[EN] Production planning in 3D printing factories brings new challenges among which the scheduling of parts to be produced stands out. A main issue is to increase the efficiency of the plant and 3D printers productivity. Planning, scheduling, and nesting in 3D printing are recurrent problems in the search for new techniques to promote the development of this technology. In this work, we address the problem for the suppliers that have to schedule their daily production. This problem is part of the LONJA3D model, a managed 3D printing market where the parts ordered by the customers are reorganized into new batches so that suppliers can optimize their production capacity. In this paper, we propose a method derived from the design of combinatorial auctions to solve the nesting problem in 3D printing. First, we propose the use of a heuristic to create potential manufacturing batches. Then, we compute the expected return for each batch. The selected batch should generate the highest income. Several experiments have been tested to validate the process. This method is a first approach to the planning problem in 3D printing and further research is proposed to improve the procedure.</datacite:description>
<datacite:description descriptionType="Abstract">This research has been partially financed by the project: “Lonja de Impresión 3D para la Industria 4.0 y la Empresa Digital (LONJA3D)” funded by the Regional Government of Castile and Leon and the European Regional Development Fund (ERDF, FEDER) with grant VA049P17.</datacite:description>
</datacite:descriptions>
<datacite:language>eng</datacite:language>
<datacite:publisher>Universitat Politècnica de València</datacite:publisher>
<datacite:formats>
<datacite:format>application/pdf</datacite:format>
<datacite:format>716912</datacite:format>
</datacite:formats>
<datacite:fundingReferences>
<datacite:fundingReference>
<datacite:funderName>Junta de Castilla y León</datacite:funderName>
<datacite:funderIdentifier funderIdentifierType="Crossref Funder ID">http://dx.doi.org/10.13039/501100014180</datacite:funderIdentifier>
<datacite:awardNumber>VA049P17</datacite:awardNumber>
</datacite:fundingReference>
</datacite:fundingReferences>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>De Antón, J.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Senovilla, J.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>González, J.M.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Acebes, F.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Pajares, J.</datacite:creatorName>
</datacite:creator>
</datacite:creators>
<datacite:contributors/>
<datacite:dates>
<datacite:date dateType="Issued">2020-07-18</datacite:date>
</datacite:dates>
<datacite:subjects>
<datacite:subject>Additive manufacturing</datacite:subject>
<datacite:subject>Production planning</datacite:subject>
<datacite:subject>Packing problem</datacite:subject>
<datacite:subject>Optimization</datacite:subject>
<datacite:subject>Nesting</datacite:subject>
</datacite:subjects>
</datacite:resource>
<oaf:identifier identifierType="DOI">10.4995/ijpme.2020.12944</oaf:identifier>
<oaf:identifier identifierType="Handle">10251/148537</oaf:identifier>
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
<oaf:dateAccepted>2020-07-18</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>eng</oaf:language>
<oaf:hostedBy name="RiuNet" id="opendoar____::1560"/>
<oaf:collectedFrom name="RiuNet" id="opendoar____::1560"/>
</metadata>
</record>

View File

@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</identifier>
<datestamp>2022-09-02T09:55:35Z</datestamp>
<setSpec>rohub_data</setSpec>
<setSpec>ro-crate_data</setSpec>
<dri:objIdentifier>fsh_____4119::afc7592914ae190a50570db90f55f9c2</dri:objIdentifier>
<dri:recordIdentifier>https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>fsh_____4119</oaf:datasourceprefix>
<dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="w3id">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</datacite:identifier>
<datacite:alternateIdentifiers/>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">
https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce
</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">
https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d
</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_1843">RO-crate</datacite:resourceType>
<datacite:rightsList>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:rightsList>
<datacite:titles>
<datacite:title>Using biological effects tools to define Good Environmental Status under the European Union Marine Strategy Framework Directive</datacite:title>
</datacite:titles>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved.</datacite:description>
</datacite:descriptions>
<datacite:publisher>Poznań Supercomputing and Networking Center</datacite:publisher>
<contributors xmlns="http://datacite.org/schema/kernel-4">
<contributor>
<contributor contributorType="Researcher">
<contributorName>Generation Service</contributorName>
</contributor>
</contributor>
</contributors>
<creators xmlns="http://datacite.org/schema/kernel-4">
<creator>
<creator>
<creatorName>CNR-ISMAR</creatorName>
</creator>
</creator>
</creators>
<dates xmlns="http://datacite.org/schema/kernel-4">
<date dateType="Created">2018-06-20T11:21:46Z</date>
</dates>
<dc:descriptions>
<dc:description descriptionType="Abstract">The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved.</dc:description>
</dc:descriptions>
<dc:publicationYear>2018</dc:publicationYear>
<rightsList xmlns="http://datacite.org/schema/kernel-4">
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
</rightsList>
<sizes xmlns="http://datacite.org/schema/kernel-4">
<size>3.866 KB</size>
</sizes>
<subjects xmlns="http://datacite.org/schema/kernel-4">
<subject>Ecology</subject>
<subject>EOSC::RO-crate</subject>
</subjects>
</datacite:resource>
<oaf:identifier identifierType="w3id">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</oaf:identifier>
<dr:CobjCategory type="other research product">other research product</dr:CobjCategory>
<oaf:dateAccepted/>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language/>
<oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
<oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
</metadata>
</record>

View File

@ -1,103 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<record xmlns:datacite="http://datacite.org/schema/kernel-4" <record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="http://www.openarchives.org/OAI/2.0/"> <header xmlns="http://www.openarchives.org/OAI/2.0/">
<dri:objIdentifier>eosca5322f5f::4dd1aaf93ae136b65dc9ee4e6f76eac9</dri:objIdentifier> <identifier>https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</identifier>
<dri:recordIdentifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</dri:recordIdentifier> <datestamp>2022-09-02T09:55:35Z</datestamp>
<dri:dateOfCollection>2022-05-25T15:35:48.262Z</dri:dateOfCollection>
<oaf:datasourceprefix>eosca5322f5f</oaf:datasourceprefix>
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
<datestamp>2022-05-25T15:35:38Z</datestamp>
<setSpec>rohub_data</setSpec> <setSpec>rohub_data</setSpec>
<setSpec>ro-crate_data</setSpec> <setSpec>ro-crate_data</setSpec>
<dr:dateOfTransformation>2022-05-25T15:36:11.094Z</dr:dateOfTransformation> <dri:objIdentifier>fsh_____4119::afc7592914ae190a50570db90f55f9c2</dri:objIdentifier>
<dri:recordIdentifier>https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>fsh_____4119</oaf:datasourceprefix>
<dr:dateOfTransformation>2019-04-17T16:04:20.586Z</dr:dateOfTransformation>
</header> </header>
<metadata> <metadata>
<oaire:resource xmlns="http://namespace.openaire.eu/schema/oaire/"> <datacite:resource>
<datacite:identifier identifierType="landingPage">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</datacite:identifier> <datacite:identifier identifierType="URL">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</datacite:identifier>
<datacite:alternateIdentifiers> <datacite:alternateIdentifiers/>
<datacite:alternateIdentifier alternateIdentifierType="URL">http://api.rohub.org/api/ros/53aa90bf-c593-4e6d-923f-d4711ac4b0e1/</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers> <datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="" relationType=""> <datacite:relatedIdentifier relatedIdentifierType="" relationType="">
https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/24fae96f-f986-46e1-bfd0-a21ca20ff0ce
</datacite:relatedIdentifier> </datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="" relationType=""> <datacite:relatedIdentifier relatedIdentifierType="" relationType="">
https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca/resources/6d3427a8-352e-49f4-9796-f618c44dc16d
</datacite:relatedIdentifier> </datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html</datacite:relatedIdentifier>
</datacite:relatedIdentifiers> </datacite:relatedIdentifiers>
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_1843">RO-crate</datacite:resourceType>
<datacite:rightsList>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:rightsList>
<datacite:titles>
<datacite:title>Using biological effects tools to define Good Environmental Status under the European Union Marine Strategy Framework Directive</datacite:title>
</datacite:titles>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved.</datacite:description>
</datacite:descriptions>
<datacite:publisher>Poznań Supercomputing and Networking Center</datacite:publisher>
<contributors xmlns="http://datacite.org/schema/kernel-4">
<contributor>
<contributor contributorType="Researcher">
<contributorName>Generation Service</contributorName>
</contributor>
</contributor>
</contributors>
<creators xmlns="http://datacite.org/schema/kernel-4"> <creators xmlns="http://datacite.org/schema/kernel-4">
<creator> <creator>
<creator> <creator>
<creatorName>Anne Fouilloux</creatorName> <creatorName>CNR-ISMAR</creatorName>
</creator> </creator>
</creator> </creator>
</creators> </creators>
<dates xmlns="http://datacite.org/schema/kernel-4"> <dates xmlns="http://datacite.org/schema/kernel-4">
<date dateType="Created">2021-12-19T21:18:33Z</date> <date dateType="Created">2018-06-20T11:21:46Z</date>
</dates> </dates>
<dc:descriptions> <dc:descriptions>
<dc:description descriptionType="Abstract">The COVID-19 pandemic has led to significant reductions in economic activity, especially during lockdowns. Several studies has shown that the concentration of nitrogen dioxyde and particulate matter levels have reduced during lockdown events. Reductions in transportation sector emissions are most likely largely responsible for the NO2 anomalies. In this study, we analyze the impact of lockdown events on the air quality using data from Copernicus Atmosphere Monitoring Service over Europe and at selected locations.</dc:description> <dc:description descriptionType="Abstract">The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that "Concentrations of contaminants are at levels not giving rise to pollution effects" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved.</dc:description>
</dc:descriptions> </dc:descriptions>
<oaire:fundingReferences> <dc:publicationYear>2018</dc:publicationYear>
<oaire:fundingReference>
<oaire:funderName>European Commission</oaire:funderName>
<oaire:funderIdentifier funderIdentifierType="Crossref Funder ID">10.13039/501100000781</oaire:funderIdentifier>
<oaire:awardNumber awardURI="">101017502</oaire:awardNumber>
<oaire:awardTitle>Research Lifecycle Management for Earth Science Communities and Copernicus Users</oaire:awardTitle>
</oaire:fundingReference>
</oaire:fundingReferences>
<oaire:licenseCondition uri="https://opensource.org/licenses/MIT">MIT License</oaire:licenseCondition>
<dc:publisher>University of Oslo</dc:publisher>
<dc:publicationYear>2021</dc:publicationYear>
<oaire:resourceType resourceTypeGeneral="other research product" uri="http://purl.org/coar/resource_type/c_1843">RO-crate</oaire:resourceType>
<rightsList xmlns="http://datacite.org/schema/kernel-4"> <rightsList xmlns="http://datacite.org/schema/kernel-4">
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights> <rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
</rightsList> </rightsList>
<sizes xmlns="http://datacite.org/schema/kernel-4"> <sizes xmlns="http://datacite.org/schema/kernel-4">
<size>11.971 MB</size> <size>3.866 KB</size>
</sizes> </sizes>
<subjects xmlns="http://datacite.org/schema/kernel-4"> <subjects xmlns="http://datacite.org/schema/kernel-4">
<subject>Applied sciences</subject> <subject>Ecology</subject>
<subject>Meteorology</subject>
<subject>EOSC::RO-crate</subject> <subject>EOSC::RO-crate</subject>
</subjects> </subjects>
<titles xmlns="http://datacite.org/schema/kernel-4"> </datacite:resource>
<title>Impact of the Covid-19 Lockdown on Air quality over Europe</title> <oaf:identifier identifierType="URL">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</oaf:identifier>
</titles> <dr:CobjCategory type="other research product">other research product</dr:CobjCategory>
</oaire:resource>
<oaf:identifier identifierType="URL">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</oaf:identifier>
<dr:CobjCategory type="other">0048</dr:CobjCategory>
<oaf:dateAccepted/> <oaf:dateAccepted/>
<oaf:accessrights>OPEN</oaf:accessrights> <oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://opensource.org/licenses/MIT</oaf:license> <oaf:language/>
<oaf:language>und</oaf:language> <oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
<oaf:hostedBy id="eosc________::psnc::psnc.rohub" name="ROHub"/> <oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
<oaf:collectedFrom id="eosc________::psnc::psnc.rohub" name="ROHub"/>
</metadata> </metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2022-05-25T15:35:48.262Z">
<baseURL>https%3A%2F%2Fapi.rohub.org%2Fapi%2Foai2d%2F</baseURL>
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
<datestamp>2022-05-25T15:35:38Z</datestamp>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk"
classname="Harvested" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record> </record>

View File

@ -81,6 +81,21 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
@Test
public void testRiunet() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
final Publication p = load("riunet.json", Publication.class);
final JoinedEntity je = new JoinedEntity<>(p);
final String record = xmlRecordFactory.build(je);
assertNotNull(record);
testRecordTransformation(record);
}
@Test @Test
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException { public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml")); final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
@ -114,6 +129,8 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
@Test @Test
void testDoiUrlNormalization() throws MalformedURLException { void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -0,0 +1,470 @@
{
"collectedfrom": [
{
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
}
],
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
},
"lastupdatetimestamp": 1662543204165,
"id": "50|od______1560::8f7a139735f493882bb0f4abceb6e200",
"originalId": [
"50|od______1560::8f7a139735f493882bb0f4abceb6e200",
"oai:riunet.upv.es:10251/178464"
],
"pid": [
{
"value": "10251/178464",
"qualifier": {
"classid": "handle",
"classname": "Handle",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofcollection": "2019-03-27T15:15:22.22Z",
"dateoftransformation": "2022-09-01T07:51:12.657Z",
"extraInfo": [],
"oaiprovenance": null,
"processingchargeamount": null,
"processingchargecurrency": null,
"measures": null,
"author": [
{
"fullname": "Nieto Nieto, Justo",
"name": "Justo",
"surname": "Nieto Nieto",
"rank": 1,
"pid": [],
"affiliation": []
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"language": {
"classid": "spa",
"classname": "Spanish; Castilian",
"schemeid": "dnet:languages",
"schemename": "dnet:languages"
},
"country": [],
"subject": [
{
"value": "Justo Nieto Nieto (Discursos)",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Universitat Politècnica de València (UPV)",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Presentación inaugural",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Curso académico 1990-91",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Discurso inaugural",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Inaugural speech",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Inaugural presentation",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"title": [
{
"value": "Discurso de inauguración del curso academico 1990-1991 de la Universitat Politècnica de València",
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"relevantdate": [
{
"value": "1991",
"qualifier": {
"classid": "Issued",
"classname": "Issued",
"schemeid": "dnet:dataCite_date",
"schemename": "dnet:dataCite_date"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"description": [
{
"value": "[ES] Discurso de Justo Nieto en el acto de inauguración del curso académico 1990-1991",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "[EN] Inaugural speech by Justo Nieto at the opening ceremony of the 1990-1991 academic year",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1991-01-01",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"publisher": null,
"embargoenddate": null,
"source": [],
"fulltext": [],
"format": [
{
"value": "application/pdf",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "5055377",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"contributor": [],
"resourcetype": {
"classid": "lecture",
"classname": "lecture",
"schemeid": "dnet:dataCite_resource",
"schemename": "dnet:dataCite_resource"
},
"coverage": [],
"bestaccessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"context": [],
"externalReference": [],
"instance": [
{
"license": null,
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes",
"openAccessRoute": null
},
"instancetype": {
"classid": "0038",
"classname": "Other literature type",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"hostedby": {
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
},
"url": null,
"distributionlocation": null,
"collectedfrom": {
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
},
"pid": [
{
"value": "10251/178464",
"qualifier": {
"classid": "handle",
"classname": "Handle",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"alternateIdentifier": [],
"dateofacceptance": {
"value": "1991-01-01",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"processingchargeamount": null,
"processingchargecurrency": null,
"refereed": {
"classid": "UNKNOWN",
"classname": "Unknown",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"measures": null
}
]
}