Merge pull request 'orcid_multipleworks_download' (#242) from enrico.ottonello/dnet-hadoop:orcid_multipleworks_download into beta
Reviewed-on: D-Net/dnet-hadoop#242
commit a431e01383
@@ -3,6 +3,8 @@ package eu.dnetlib.doiboost.orcid;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.time.LocalDate;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
@@ -13,6 +15,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
@@ -20,6 +23,7 @@ import org.apache.spark.util.LongAccumulator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
 import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 
@@ -42,6 +46,7 @@ public class SparkDownloadOrcidWorks {
     public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
     public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
         .ofPattern(ORCID_XML_DATETIME_FORMAT);
+    public static final String DOWNLOAD_WORKS_REQUEST_SEPARATOR = ",";
 
     public static void main(String[] args) throws Exception {
 
@@ -56,7 +61,6 @@ public class SparkDownloadOrcidWorks {
             .ofNullable(parser.get("isSparkSessionManaged"))
             .map(Boolean::valueOf)
             .orElse(Boolean.TRUE);
-        logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
         final String workingPath = parser.get("workingPath");
         logger.info("workingPath: {}", workingPath);
         final String outputPath = parser.get("outputPath");
@@ -69,32 +73,22 @@ public class SparkDownloadOrcidWorks {
             isSparkSessionManaged,
             spark -> {
                 final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
-                logger.info("lastUpdateValue: ", lastUpdateValue);
 
                 JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
                 LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
                 LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
                 LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
                 LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
-                LongAccumulator maxModifiedWorksLimitAcc = spark
-                    .sparkContext()
-                    .longAccumulator("max_modified_works_limit");
                 LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
-                LongAccumulator errorLoadingJsonFoundAcc = spark
-                    .sparkContext()
-                    .longAccumulator("error_loading_json_found");
-                LongAccumulator errorLoadingXMLFoundAcc = spark
-                    .sparkContext()
-                    .longAccumulator("error_loading_xml_found");
                 LongAccumulator errorParsingXMLFoundAcc = spark
                     .sparkContext()
                     .longAccumulator("error_parsing_xml_found");
                 LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
-                LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
 
                 JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
                     .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
-                updatedAuthorsAcc.setValue(updatedAuthorsRDD.count());
+                long authorsCount = updatedAuthorsRDD.count();
+                updatedAuthorsAcc.setValue(authorsCount);
 
                 FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
                     String orcidId = data._1().toString();
@@ -106,11 +100,10 @@ public class SparkDownloadOrcidWorks {
                     if (statusCode.equals("200")) {
                         String compressedData = getJsonValue(jElement, "compressedData");
                         if (StringUtils.isEmpty(compressedData)) {
-                            errorLoadingJsonFoundAcc.add(1);
                         } else {
                             String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
                             if (StringUtils.isEmpty(authorSummary)) {
-                                errorLoadingXMLFoundAcc.add(1);
                             } else {
                                 try {
                                     workIdLastModifiedDate = XMLRecordParser
@@ -125,22 +118,38 @@ public class SparkDownloadOrcidWorks {
                             errorCodeFoundAcc.add(1);
                         }
                         parsedAuthorsAcc.add(1);
+
                         workIdLastModifiedDate.forEach((k, v) -> {
                             parsedWorksAcc.add(1);
                             if (isModified(orcidId, v, lastUpdateValue)) {
                                 modifiedWorksAcc.add(1);
-                                workIds.add(orcidId.concat("/work/").concat(k));
+                                workIds.add(k);
                             }
                         });
-                        if (workIdLastModifiedDate.size() > 50) {
-                            maxModifiedWorksLimitAcc.add(1);
+                        if (workIds.isEmpty()) {
+                            return new ArrayList<String>().iterator();
                         }
-                        return workIds.iterator();
+                        List<String> worksDownloadUrls = new ArrayList<>();
+
+                        // Creation of url for reading multiple works (up to 100) with ORCID API
+                        // see this https://github.com/ORCID/ORCID-Source/blob/development/orcid-api-web/tutorial/works.md
+
+                        List<List<String>> partitionedWorks = Lists.partition(workIds, 100);
+                        partitionedWorks.stream().forEach(p -> {
+                            String worksDownloadUrl = orcidId.concat("/works/");
+                            final StringBuffer buffer = new StringBuffer(worksDownloadUrl);
+                            p.forEach(id -> {
+                                buffer.append(id).append(DOWNLOAD_WORKS_REQUEST_SEPARATOR);
+                            });
+                            String finalUrl = buffer.substring(0, buffer.lastIndexOf(DOWNLOAD_WORKS_REQUEST_SEPARATOR));
+                            worksDownloadUrls.add(finalUrl);
+                        });
+                        return worksDownloadUrls.iterator();
                     };
 
-                Function<String, Tuple2<String, String>> downloadWorkFunction = data -> {
-                    String relativeWorkUrl = data;
-                    String orcidId = relativeWorkUrl.split("/")[0];
+                Function<String, Tuple2<String, String>> downloadWorksFunction = data -> {
+                    String relativeWorksUrl = data;
+                    String orcidId = relativeWorksUrl.split("/")[0];
                     final DownloadedRecordData downloaded = new DownloadedRecordData();
                     downloaded.setOrcidId(orcidId);
                     downloaded.setLastModifiedDate(lastUpdateValue);
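
Note: the new retrieveWorkUrlFunction no longer emits one relative URL per work; it batches the modified put-codes into comma-separated bulk URLs. Below is a minimal, self-contained sketch of the same construction outside the Spark job, with hypothetical put-codes; the Guava Lists.partition call, the "," separator, and the 100-works-per-request limit come straight from the diff above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.google.common.collect.Lists;

public class BulkWorksUrlSketch {
    public static void main(String[] args) {
        String orcidId = "0000-0001-7291-3210"; // hypothetical ORCID iD
        List<String> workIds = Arrays.asList("16639612", "16639628"); // hypothetical put-codes

        List<String> worksDownloadUrls = new ArrayList<>();
        // at most 100 put-codes per request, the limit of the ORCID v3.0 bulk works endpoint
        for (List<String> p : Lists.partition(workIds, 100)) {
            // relative URL shape: {orcid}/works/{putCode1},{putCode2},...
            worksDownloadUrls.add(orcidId + "/works/" + String.join(",", p));
        }
        // prints: 0000-0001-7291-3210/works/16639612,16639628
        worksDownloadUrls.forEach(System.out::println);
    }
}

With one request per 100 works instead of one per work, the number of API calls per author drops by up to two orders of magnitude, which is the point of this PR.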
@@ -149,7 +158,7 @@ public class SparkDownloadOrcidWorks {
                     httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
                     httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
                     httpConnector.setAuthToken(token);
-                    String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl;
+                    String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorksUrl;
                     DownloadsReport report = new DownloadsReport();
                     long startReq = System.currentTimeMillis();
                     boolean downloadCompleted = false;
@@ -167,7 +176,6 @@ public class SparkDownloadOrcidWorks {
                         } else {
                             downloaded.setStatusCode(-4);
                         }
-                        errorsAcc.add(1);
                     }
                     long endReq = System.currentTimeMillis();
                     long reqTime = endReq - startReq;
@@ -176,7 +184,6 @@ public class SparkDownloadOrcidWorks {
                     }
                     if (downloadCompleted) {
                         downloaded.setStatusCode(200);
-                        downloadedRecordsAcc.add(1);
                         downloaded
                             .setCompressedData(
                                 ArgumentApplicationParser
@@ -185,24 +192,69 @@ public class SparkDownloadOrcidWorks {
                     return downloaded.toTuple2();
                 };
 
+                FlatMapFunction<Tuple2<String, String>, Tuple2<String, String>> splitWorksFunction = data -> {
+                    List<Tuple2<String, String>> splittedDownloadedWorks = new ArrayList<>();
+                    String jsonData = data._2().toString();
+                    JsonElement jElement = new JsonParser().parse(jsonData);
+                    String orcidId = data._1().toString();
+                    String statusCode = getJsonValue(jElement, "statusCode");
+                    String lastModifiedDate = getJsonValue(jElement, "lastModifiedDate");
+                    String compressedData = getJsonValue(jElement, "compressedData");
+                    String errorMessage = getJsonValue(jElement, "errorMessage");
+                    String works = ArgumentApplicationParser.decompressValue(compressedData);
+
+                    // split a single xml containing multiple works into multiple xml (a single work for each xml)
+                    List<String> splittedWorks = null;
+                    try {
+                        splittedWorks = XMLRecordParser
+                            .splitWorks(orcidId, works.getBytes(StandardCharsets.UTF_8));
+                    } catch (Throwable t) {
+                        final DownloadedRecordData errDownloaded = new DownloadedRecordData();
+                        errDownloaded.setOrcidId(orcidId);
+                        errDownloaded.setLastModifiedDate(lastModifiedDate);
+                        errDownloaded.setStatusCode(-10);
+                        errDownloaded.setErrorMessage(t.getMessage());
+                        splittedDownloadedWorks.add(errDownloaded.toTuple2());
+                        errorParsingXMLFoundAcc.add(1);
+                        return splittedDownloadedWorks.iterator();
+                    }
+                    splittedWorks.forEach(w -> {
+                        final DownloadedRecordData downloaded = new DownloadedRecordData();
+                        downloaded.setOrcidId(orcidId);
+                        downloaded.setLastModifiedDate(lastModifiedDate);
+                        downloaded.setStatusCode(Integer.parseInt(statusCode));
+                        downloaded.setErrorMessage(errorMessage);
+                        try {
+                            downloaded
+                                .setCompressedData(
+                                    ArgumentApplicationParser
+                                        .compressArgument(w));
+                        } catch (Throwable t) {
+                            downloaded.setStatusCode(-11);
+                            downloaded.setErrorMessage(t.getMessage());
+                        }
+                        splittedDownloadedWorks.add(downloaded.toTuple2());
+                        downloadedRecordsAcc.add(1);
+                    });
+
+                    return splittedDownloadedWorks.iterator();
+                };
+
                 updatedAuthorsRDD
                     .flatMap(retrieveWorkUrlFunction)
                     .repartition(100)
-                    .map(downloadWorkFunction)
-                    .mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
+                    .map(downloadWorksFunction)
+                    .flatMap(splitWorksFunction)
+                    .mapToPair(w -> new Tuple2<>(new Text(w._1()), new Text(w._2())))
                     .saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
 
                 logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value());
                 logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value());
                 logger.info("parsedWorksAcc: {}", parsedWorksAcc.value());
                 logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value());
-                logger.info("maxModifiedWorksLimitAcc: {}", maxModifiedWorksLimitAcc.value());
                 logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value());
-                logger.info("errorLoadingJsonFoundAcc: {}", errorLoadingJsonFoundAcc.value());
-                logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
                 logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
                 logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
-                logger.info("errorsAcc: {}", errorsAcc.value());
             });
 
     }
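
Note: splitWorksFunction reads back the JSON envelope that downloadWorksFunction serializes through DownloadedRecordData.toTuple2(). The getJsonValue helper it calls is referenced but not shown in this diff; the sketch below is its assumed behavior only (a null-safe string-field lookup with Gson, matching the calls above), not the repository's confirmed implementation.

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

public class GetJsonValueSketch {
    // assumed contract of the getJsonValue(...) helper referenced in the diff:
    // return the string value of a field, or the empty string when absent/null
    private static String getJsonValue(JsonElement jElement, String property) {
        if (jElement.getAsJsonObject().has(property)) {
            JsonElement field = jElement.getAsJsonObject().get(property);
            if (field != null && !field.isJsonNull()) {
                return field.getAsString();
            }
        }
        return "";
    }

    public static void main(String[] args) {
        // hypothetical envelope with the fields splitWorksFunction extracts
        String json = "{\"statusCode\":\"200\",\"lastModifiedDate\":\"2021-01-01\",\"errorMessage\":\"\"}";
        JsonElement jElement = new JsonParser().parse(json);
        System.out.println(getJsonValue(jElement, "statusCode")); // 200
        System.out.println(getJsonValue(jElement, "compressedData").isEmpty()); // true
    }
}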
@@ -83,8 +83,6 @@ public class MultiAttemptsHttpConnector {
             throw new CollectorException(msg);
         }
 
-        log.info("Request attempt {} [{}]", retryNumber, requestUrl);
-
         InputStream input = null;
 
         try {
@@ -104,9 +102,9 @@ public class MultiAttemptsHttpConnector {
                 urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken()));
             }
 
-            if (log.isDebugEnabled()) {
-                logHeaderFields(urlConn);
-            }
+            // if (log.isDebugEnabled()) {
+            // logHeaderFields(urlConn);
+            // }
 
             int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
             if (is2xx(urlConn.getResponseCode())) {
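
Note: obtainRetryAfter, visible as context in the hunk above, is not defined in this diff. The sketch below shows its assumed contract only (case-insensitive header lookup, numeric seconds, -1 sentinel when absent); the helper name is real but this body is an illustration, not the repository's code.

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class RetryAfterSketch {
    // assumed contract of obtainRetryAfter(...): return the Retry-After value in
    // seconds when the header is present and numeric, otherwise -1
    static int obtainRetryAfter(Map<String, List<String>> headerFields) {
        for (String key : headerFields.keySet()) {
            if (key != null && key.equalsIgnoreCase("Retry-After")) {
                List<String> values = headerFields.get(key);
                if (!values.isEmpty() && values.get(0).matches("\\d+")) {
                    return Integer.parseInt(values.get(0));
                }
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        // hypothetical response headers as returned by URLConnection.getHeaderFields()
        Map<String, List<String>> headers = Collections
            .singletonMap("Retry-After", Collections.singletonList("30"));
        System.out.println(obtainRetryAfter(headers)); // 30
    }
}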
@@ -1,7 +1,11 @@
 
 package eu.dnetlib.doiboost.orcid.xml;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
+import java.util.stream.Collectors;
 
 import org.apache.commons.lang3.StringUtils;
 import org.mortbay.log.Log;
@@ -34,6 +38,33 @@ public class XMLRecordParser {
     private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
     private static final String NS_HISTORY = "history";
     private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
+    private static final String NS_BULK_URL = "http://www.orcid.org/ns/bulk";
+    private static final String NS_BULK = "bulk";
+
+    private static final String namespaceList = " xmlns:internal=\"http://www.orcid.org/ns/internal\"\n" +
+        " xmlns:education=\"http://www.orcid.org/ns/education\"\n" +
+        " xmlns:distinction=\"http://www.orcid.org/ns/distinction\"\n" +
+        " xmlns:deprecated=\"http://www.orcid.org/ns/deprecated\"\n" +
+        " xmlns:other-name=\"http://www.orcid.org/ns/other-name\"\n" +
+        " xmlns:membership=\"http://www.orcid.org/ns/membership\"\n" +
+        " xmlns:error=\"http://www.orcid.org/ns/error\" xmlns:common=\"http://www.orcid.org/ns/common\"\n" +
+        " xmlns:record=\"http://www.orcid.org/ns/record\"\n" +
+        " xmlns:personal-details=\"http://www.orcid.org/ns/personal-details\"\n" +
+        " xmlns:keyword=\"http://www.orcid.org/ns/keyword\" xmlns:email=\"http://www.orcid.org/ns/email\"\n" +
+        " xmlns:external-identifier=\"http://www.orcid.org/ns/external-identifier\"\n" +
+        " xmlns:funding=\"http://www.orcid.org/ns/funding\"\n" +
+        " xmlns:preferences=\"http://www.orcid.org/ns/preferences\"\n" +
+        " xmlns:address=\"http://www.orcid.org/ns/address\"\n" +
+        " xmlns:invited-position=\"http://www.orcid.org/ns/invited-position\"\n" +
+        " xmlns:work=\"http://www.orcid.org/ns/work\" xmlns:history=\"http://www.orcid.org/ns/history\"\n" +
+        " xmlns:employment=\"http://www.orcid.org/ns/employment\"\n" +
+        " xmlns:qualification=\"http://www.orcid.org/ns/qualification\"\n" +
+        " xmlns:service=\"http://www.orcid.org/ns/service\" xmlns:person=\"http://www.orcid.org/ns/person\"\n" +
+        " xmlns:activities=\"http://www.orcid.org/ns/activities\"\n" +
+        " xmlns:researcher-url=\"http://www.orcid.org/ns/researcher-url\"\n" +
+        " xmlns:peer-review=\"http://www.orcid.org/ns/peer-review\"\n" +
+        " xmlns:bulk=\"http://www.orcid.org/ns/bulk\"\n" +
+        " xmlns:research-resource=\"http://www.orcid.org/ns/research-resource\"";
 
     private static final String NS_ERROR = "error";
 
@@ -307,4 +338,65 @@ public class XMLRecordParser {
         }
         return authorHistory;
     }
+
+    public static List<String> splitWorks(String orcidId, byte[] bytes)
+        throws ParseException, XPathParseException, NavException, XPathEvalException, VtdException, ModifyException,
+        IOException, TranscodeException {
+
+        final VTDGen vg = new VTDGen();
+        vg.setDoc(bytes);
+        vg.parse(true);
+        final VTDNav vn = vg.getNav();
+        final AutoPilot ap = new AutoPilot(vn);
+        ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
+        ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
+        ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
+        ap.declareXPathNameSpace(NS_BULK, NS_BULK_URL);
+
+        List<String> works = new ArrayList<>();
+        try {
+            ap.selectXPath("//work:work");
+            while (ap.evalXPath() != -1) {
+                ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                long l = vn.getElementFragment();
+                String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>";
+                bos.write(xmlHeader.getBytes(StandardCharsets.UTF_8));
+                bos.write(vn.getXML().getBytes(), (int) l, (int) (l >> 32));
+                works.add(bos.toString());
+                bos.close();
+            }
+        } catch (Exception e) {
+            throw new VtdException(e);
+        }
+
+        List<VTDGen> vgModifiers = Arrays.asList(new VTDGen());
+        List<XMLModifier> xmModifiers = Arrays.asList(new XMLModifier());
+        List<ByteArrayOutputStream> buffer = Arrays.asList(new ByteArrayOutputStream());
+        List<String> updatedWorks = works.stream().map(work -> {
+            vgModifiers.get(0).setDoc(work.getBytes());
+            try {
+                vgModifiers.get(0).parse(false);
+                final VTDNav vnModifier = vgModifiers.get(0).getNav();
+                xmModifiers.get(0).bind(vnModifier);
+                vnModifier.toElement(VTDNav.ROOT);
+                int attr = vnModifier.getAttrVal("put-code");
+                if (attr > -1) {
+                    xmModifiers
+                        .get(0)
+                        .insertAttribute(
+                            " path=\"/" + orcidId + "/work/" + vnModifier.toNormalizedString(attr) + "\""
+                                + " " + namespaceList);
+                }
+                buffer.set(0, new ByteArrayOutputStream());
+                xmModifiers.get(0).output(buffer.get(0));
+                buffer.get(0).close();
+                return buffer.get(0).toString();
+            } catch (NavException | ModifyException | IOException | TranscodeException | ParseException e) {
+                e.printStackTrace();
+                throw new RuntimeException(e);
+            }
+        }).collect(Collectors.toList());
+
+        return updatedWorks;
+    }
 }
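
Note: a short usage sketch of the new splitWorks method, mirroring the testWorksSplit test added further below; the local file path is hypothetical. Each returned string is a standalone work:work document with an injected path="/{orcid}/work/{put-code}" attribute plus the full namespace list, so downstream code can treat it like a single-work API response.

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

public class SplitWorksUsage {
    public static void main(String[] args) throws Exception {
        // hypothetical local copy of a bulk /works response, e.g. multiple_downloaded_works.xml
        byte[] bulkXml = Files.readAllBytes(Paths.get("multiple_downloaded_works.xml"));
        List<String> works = XMLRecordParser.splitWorks("0000-0001-7291-3210", bulkXml);
        // one self-contained XML document per <work:work> element
        works.forEach(w -> System.out.println(w.substring(0, Math.min(80, w.length()))));
    }
}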
@@ -161,13 +161,11 @@ public class OrcidClientTest {
 
     @Test
     @Disabled
-    void testReadBase64CompressedRecord() throws Exception {
+    void testReadBase64CompressedWork() throws Exception {
         final String base64CompressedRecord = IOUtils
-            .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
+            .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
         final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
         logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
-        final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
-        assertEquals(recordFromSeqFile, downloadedRecord);
     }
 
     @Test
@@ -337,7 +335,7 @@ public class OrcidClientTest {
     @Ignore
     void testUpdatedRecord() throws Exception {
         final String base64CompressedRecord = IOUtils
-            .toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
+            .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
         final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
         logToFile(testPath, "\n\nrecord updated \n\n" + record);
     }
@@ -108,4 +108,12 @@ public class XMLRecordParserTest {
         work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
         OrcidClientTest.logToFile(testPath, JsonWriter.create(work));
     }
+
+    @Test
+    void testWorksSplit() throws Exception {
+        String xml = IOUtils
+            .toString(
+                this.getClass().getResourceAsStream("multiple_downloaded_works.xml"));
+        XMLRecordParser.splitWorks("0000-0001-7291-3210", xml.getBytes());
+    }
 }
@@ -0,0 +1 @@
+H4sIAAAAAAAAAN1Y23LbNhB971dg+NAnkiKpSJZUS2luTerESSd22pm+QSQkISEBFiAlqxn9exe8CaRExkmc4Uw9Y9rEnrO72MUCC14+votCtCVCUs7mhms7BiLM5wFl67nx4fY3a2IgmWAW4JAzMjf2RBqPF5c7Lj7N1APFONnMjYEDPxb8utaFN3Gt8dAZD5R8MHWHk+nF1DUQAlNMzihLiGA4nBubJIlng8Fut7O58GkAz/WAyUGJMH5CqGSRIPVxkjnZRqsgOi+gMqHM72ZqoBqXxIKAShJ0UCuMzuTJhgiL4Yi0M48YnRmRaAnZ2NC4nXnE1CIkBBcd0VFio8D6PIq6ApLLde0wSS464pDLdUYMLnLIohWQBNNQtnObSF3LJ7LfdRouAOXMSAQaOqKgxLWo3eVrzaIBYQldUdIVw1OwrmuVsrxu2vgFoBYlQVZEQMmRrgAdQToXB4EgsoNXAHQOZVsKi9WKuaTdRdFE6lpUZbczlbTMxwZKi4t9O7gA1HISxSHfRxDirkSWGJ35T4pDSMuXdooaTOdLIrbU7yjaAmDU1viXVnYtZ7DLQFxpV7qPmHoFSoKFrzaNVHQs8TquXpEwJsiWkl2XyxVI5y7TsCPjSnrOV1AkeSq6InoCNVCcJhYcQUA6Hh5bKumShjSBpRSny5D6xiIzqH4u8/1q5guidmIrgOfCczzXcoaWN711vdnQm7mPbGfs/X05OIc+0RVimVgRHIRQ5UeNnuWMLce9dUDdaOY59tgdHjWe4ZzozSd5HD+VWX5IYV3DJlNH6chU0IWKqISQHsOZE6uz2LNG04lnTaaTYeWiIrZqVWf5ooudAVrpGy6TReVNRcqG6/Md3GvCjbCoo3Jx4/M4lchCL0KpFqlo6spQZ9VgCdWrKt7igq6p+uN/fYzPNDrfENxz7IcO7n3m2xqbLIxXXG5SjJ7idL1pV1uPeCMfmiDrGROahC35yUXPOHR/UcwFFnskU9hutziEnjSIOfSFcoaeMFQ0iMoJkEG5rVJJ1KigTFIfxaCDMoLWIeURRoKs4ZBR6pI02FcONly5HJxzMPf6I8xFnfu58C1JBbfeQZsc8vW+4NUhDb5Pk8zbxsRrMivZx2SxpMuE3BU666IuLsQoJYtfMSTGD8nnLGOe416YmTtojj7/8LgezCIEylo9RAdzD3u8Glc+HcwtD9Mo88qdHkyWqnZWvcFLjNdEZhLvYmq53sQ5mDhNNlzkk4BLyN5EtzaCKwl6gxkx0ZP85SlMnoTSRB+Kd56uViQx0Yv8/SUPgwgzE90UZHBpr95e2MXIb1yQDPHWfp2P/IH9T0SY6L19VSgVnFHpq7HC7DWEB6Ztoiu7MHSzoRsTPbOtQu2zDUDwOo1iHGITXeejr6COcBhWc3nJkwSLgCvrL/Oh5xseYkGB86rg8NUqc/BNqRln4XhaRgCyrhzJ2RzeMvT7asJ+Ji7YVxBLqch/ltNPQxzQysO/sICe00Svy4ldc/aRKPHh0Fyg+fpr1tLpsi82AbWcy4Ip1mxZfrWVXu2d2Ymfm6ofqzpKLbKFWmFViWcjp1tTu7pSldbpy/PGNET7pq2B8hoOOK28OBHeS00eadexXWc6HDCScuYPGL9znYuzmhuZ6VLNuIigMf6XBCgRGCo+68ATkRLjKwwetdzPqiBhlgl1n11IEq7Oaq2hzp93rRn5vpQRGjxIyjxLerZjTUbO0L2YjkfjRz8yX/e09n9LFpWSPUyBjbzhaDIeI/jHm4zcH1tcYMxS1h4+RzFsrxZ/2DSdk8rTPRRunwvt1iezzt0G4YCyHRx1xTcjG3CPocjmp0v2ZxzFv6gZMCJ+fz6/fju5fffk/Y3Wb4cnnRZX3coyTbhobtxN+Zlo5hBBAprkbe2x4SiPNE3YCFm3/m8yXzY4vRjXGqp+7B8buF7saw1jP8nXG9RePKg1xL14oDfg/SxCveHvxYPaBaMXD7QLTS/2Ty5QvXihXdh62o70C2IvLugX0n5ycLwA97QSywt3TydyccHvJ/vaB4W+DsTyA0Yv9rUPJj0dx9UHml7s6x+E+jkKyw9Q32P9VFZcFAqBeiz+A4MY5OQYIQAA

File diff suppressed because one or more lines are too long
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<bulk:bulk xmlns:internal="http://www.orcid.org/ns/internal" xmlns:education="http://www.orcid.org/ns/education" xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:deprecated="http://www.orcid.org/ns/deprecated" xmlns:other-name="http://www.orcid.org/ns/other-name" xmlns:membership="http://www.orcid.org/ns/membership" xmlns:error="http://www.orcid.org/ns/error" xmlns:common="http://www.orcid.org/ns/common" xmlns:record="http://www.orcid.org/ns/record" xmlns:personal-details="http://www.orcid.org/ns/personal-details" xmlns:keyword="http://www.orcid.org/ns/keyword" xmlns:email="http://www.orcid.org/ns/email" xmlns:external-identifier="http://www.orcid.org/ns/external-identifier" xmlns:funding="http://www.orcid.org/ns/funding" xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:address="http://www.orcid.org/ns/address" xmlns:invited-position="http://www.orcid.org/ns/invited-position" xmlns:work="http://www.orcid.org/ns/work" xmlns:history="http://www.orcid.org/ns/history" xmlns:employment="http://www.orcid.org/ns/employment" xmlns:qualification="http://www.orcid.org/ns/qualification" xmlns:service="http://www.orcid.org/ns/service" xmlns:person="http://www.orcid.org/ns/person" xmlns:activities="http://www.orcid.org/ns/activities" xmlns:researcher-url="http://www.orcid.org/ns/researcher-url" xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:research-resource="http://www.orcid.org/ns/research-resource">
+    <work:work put-code="16639612" visibility="public">
+        <common:created-date>2015-05-23T18:56:52.486Z</common:created-date>
+        <common:last-modified-date>2017-02-28T08:22:12.454Z</common:last-modified-date>
+        <common:source>
+            <common:source-orcid>
+                <common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
+                <common:path>0000-0001-7291-3210</common:path>
+                <common:host>orcid.org</common:host>
+            </common:source-orcid>
+            <common:source-name>Paolo Manghi</common:source-name>
+        </common:source>
+        <work:title>
+            <common:title>The Query Language TQL</common:title>
+        </work:title>
+        <work:journal-title>5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002</work:journal-title>
+        <work:citation>
+            <work:citation-type>bibtex</work:citation-type>
+            <work:citation-value>@inproceedings{Conforti2002, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002}, Ee= {http://www.db.ucsd.edu/webdb2002/papers/43.pdf}, Pages= {13-18}, Title= {The Query Language TQL}, Year= {2002}}
+
+            </work:citation-value>
+        </work:citation>
+        <work:type>conference-paper</work:type>
+        <common:publication-date>
+            <common:year>2002</common:year>
+        </common:publication-date>
+        <common:external-ids/>
+    </work:work>
+    <work:work put-code="16639628" visibility="public">
+        <common:created-date>2015-05-23T18:58:18.492Z</common:created-date>
+        <common:last-modified-date>2017-02-28T08:22:12.455Z</common:last-modified-date>
+        <common:source>
+            <common:source-orcid>
+                <common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
+                <common:path>0000-0001-7291-3210</common:path>
+                <common:host>orcid.org</common:host>
+            </common:source-orcid>
+            <common:source-name>Paolo Manghi</common:source-name>
+        </common:source>
+        <work:title>
+            <common:title>The Query Language TQL - Demo Presentation</common:title>
+        </work:title>
+        <work:journal-title>X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)</work:journal-title>
+        <work:citation>
+            <work:citation-type>bibtex</work:citation-type>
+            <work:citation-value>@inproceedings{Conforti2002Demo, Address= {Portoferraio, Italy}, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)}, Month= {June}, Pages= {427-431}, Title= {The Query Language TQL - Demo Presentation}, Year= {2002}}
+
+            </work:citation-value>
+        </work:citation>
+        <work:type>conference-paper</work:type>
+        <common:publication-date>
+            <common:year>2002</common:year>
+        </common:publication-date>
+        <common:external-ids/>
+    </work:work>
+</bulk:bulk>
@@ -7,5 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
 # A1 uses PatternLayout.
 log4j.logger.org = ERROR
 log4j.logger.eu.dnetlib = DEBUG
+log4j.logger.eu.dnetlib.doiboost.orcid = INFO
 log4j.appender.A1.layout=org.apache.log4j.PatternLayout
 log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n