Merge branch 'handle_as_instance_urls' of https://code-repo.d4science.org/D-Net/dnet-hadoop into handle_as_instance_urls

This commit is contained in:
Alessia Bardi 2022-09-09 12:17:19 +02:00
commit f14107ad77
9 changed files with 248 additions and 42 deletions

View File

@ -3,6 +3,8 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate; import java.time.LocalDate;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.util.*; import java.util.*;
@ -13,6 +15,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function;
@ -20,6 +23,7 @@ import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.gson.JsonElement; import com.google.gson.JsonElement;
import com.google.gson.JsonParser; import com.google.gson.JsonParser;
@ -42,6 +46,7 @@ public class SparkDownloadOrcidWorks {
public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
.ofPattern(ORCID_XML_DATETIME_FORMAT); .ofPattern(ORCID_XML_DATETIME_FORMAT);
public static final String DOWNLOAD_WORKS_REQUEST_SEPARATOR = ",";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -56,7 +61,6 @@ public class SparkDownloadOrcidWorks {
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath"); final String workingPath = parser.get("workingPath");
logger.info("workingPath: {}", workingPath); logger.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
@ -69,32 +73,22 @@ public class SparkDownloadOrcidWorks {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt"); final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
logger.info("lastUpdateValue: ", lastUpdateValue);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors"); LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors"); LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works"); LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works"); LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
LongAccumulator maxModifiedWorksLimitAcc = spark
.sparkContext()
.longAccumulator("max_modified_works_limit");
LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found"); LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
LongAccumulator errorLoadingJsonFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_json_found");
LongAccumulator errorLoadingXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_xml_found");
LongAccumulator errorParsingXMLFoundAcc = spark LongAccumulator errorParsingXMLFoundAcc = spark
.sparkContext() .sparkContext()
.longAccumulator("error_parsing_xml_found"); .longAccumulator("error_parsing_xml_found");
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records"); LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class); .sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
updatedAuthorsAcc.setValue(updatedAuthorsRDD.count()); long authorsCount = updatedAuthorsRDD.count();
updatedAuthorsAcc.setValue(authorsCount);
FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> { FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
String orcidId = data._1().toString(); String orcidId = data._1().toString();
@ -106,11 +100,10 @@ public class SparkDownloadOrcidWorks {
if (statusCode.equals("200")) { if (statusCode.equals("200")) {
String compressedData = getJsonValue(jElement, "compressedData"); String compressedData = getJsonValue(jElement, "compressedData");
if (StringUtils.isEmpty(compressedData)) { if (StringUtils.isEmpty(compressedData)) {
errorLoadingJsonFoundAcc.add(1);
} else { } else {
String authorSummary = ArgumentApplicationParser.decompressValue(compressedData); String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
if (StringUtils.isEmpty(authorSummary)) { if (StringUtils.isEmpty(authorSummary)) {
errorLoadingXMLFoundAcc.add(1);
} else { } else {
try { try {
workIdLastModifiedDate = XMLRecordParser workIdLastModifiedDate = XMLRecordParser
@ -125,22 +118,38 @@ public class SparkDownloadOrcidWorks {
errorCodeFoundAcc.add(1); errorCodeFoundAcc.add(1);
} }
parsedAuthorsAcc.add(1); parsedAuthorsAcc.add(1);
workIdLastModifiedDate.forEach((k, v) -> { workIdLastModifiedDate.forEach((k, v) -> {
parsedWorksAcc.add(1); parsedWorksAcc.add(1);
if (isModified(orcidId, v, lastUpdateValue)) { if (isModified(orcidId, v, lastUpdateValue)) {
modifiedWorksAcc.add(1); modifiedWorksAcc.add(1);
workIds.add(orcidId.concat("/work/").concat(k)); workIds.add(k);
} }
}); });
if (workIdLastModifiedDate.size() > 50) { if (workIds.isEmpty()) {
maxModifiedWorksLimitAcc.add(1); return new ArrayList<String>().iterator();
} }
return workIds.iterator(); List<String> worksDownloadUrls = new ArrayList<>();
// Creation of url for reading multiple works (up to 100) with ORCID API
// see this https://github.com/ORCID/ORCID-Source/blob/development/orcid-api-web/tutorial/works.md
List<List<String>> partitionedWorks = Lists.partition(workIds, 100);
partitionedWorks.stream().forEach(p -> {
String worksDownloadUrl = orcidId.concat("/works/");
final StringBuffer buffer = new StringBuffer(worksDownloadUrl);
p.forEach(id -> {
buffer.append(id).append(DOWNLOAD_WORKS_REQUEST_SEPARATOR);
});
String finalUrl = buffer.substring(0, buffer.lastIndexOf(DOWNLOAD_WORKS_REQUEST_SEPARATOR));
worksDownloadUrls.add(finalUrl);
});
return worksDownloadUrls.iterator();
}; };
Function<String, Tuple2<String, String>> downloadWorkFunction = data -> { Function<String, Tuple2<String, String>> downloadWorksFunction = data -> {
String relativeWorkUrl = data; String relativeWorksUrl = data;
String orcidId = relativeWorkUrl.split("/")[0]; String orcidId = relativeWorksUrl.split("/")[0];
final DownloadedRecordData downloaded = new DownloadedRecordData(); final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId); downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastUpdateValue); downloaded.setLastModifiedDate(lastUpdateValue);
@ -149,7 +158,7 @@ public class SparkDownloadOrcidWorks {
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER); httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml"); httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
httpConnector.setAuthToken(token); httpConnector.setAuthToken(token);
String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl; String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorksUrl;
DownloadsReport report = new DownloadsReport(); DownloadsReport report = new DownloadsReport();
long startReq = System.currentTimeMillis(); long startReq = System.currentTimeMillis();
boolean downloadCompleted = false; boolean downloadCompleted = false;
@ -167,7 +176,6 @@ public class SparkDownloadOrcidWorks {
} else { } else {
downloaded.setStatusCode(-4); downloaded.setStatusCode(-4);
} }
errorsAcc.add(1);
} }
long endReq = System.currentTimeMillis(); long endReq = System.currentTimeMillis();
long reqTime = endReq - startReq; long reqTime = endReq - startReq;
@ -176,7 +184,6 @@ public class SparkDownloadOrcidWorks {
} }
if (downloadCompleted) { if (downloadCompleted) {
downloaded.setStatusCode(200); downloaded.setStatusCode(200);
downloadedRecordsAcc.add(1);
downloaded downloaded
.setCompressedData( .setCompressedData(
ArgumentApplicationParser ArgumentApplicationParser
@ -185,24 +192,69 @@ public class SparkDownloadOrcidWorks {
return downloaded.toTuple2(); return downloaded.toTuple2();
}; };
// Turns one downloaded record (orcidId, JSON-serialized download result), whose compressed payload
// holds an XML document with multiple works, into one record per work.
// Error handling:
//  - if the payload cannot be decompressed or split, a single record with status code -10 is emitted
//    and errorParsingXMLFoundAcc is incremented;
//  - if a single work cannot be re-compressed, that work is emitted with status code -11.
FlatMapFunction<Tuple2<String, String>, Tuple2<String, String>> splitWorksFunction = data -> {
	final List<Tuple2<String, String>> splittedDownloadedWorks = new ArrayList<>();
	final String orcidId = data._1().toString();
	final JsonElement jElement = new JsonParser().parse(data._2().toString());
	final String statusCode = getJsonValue(jElement, "statusCode");
	final String lastModifiedDate = getJsonValue(jElement, "lastModifiedDate");
	final String compressedData = getJsonValue(jElement, "compressedData");
	final String errorMessage = getJsonValue(jElement, "errorMessage");

	// split a single xml containing multiple works into multiple xml (a single work for each xml)
	final List<String> splittedWorks;
	try {
		// Decompression is done inside the guarded region on purpose: a payload that cannot be
		// decompressed (e.g. the record of a failed download) must yield an error record like any
		// other unparsable payload, instead of escaping this function and failing the Spark task.
		final String works = ArgumentApplicationParser.decompressValue(compressedData);
		splittedWorks = XMLRecordParser
			.splitWorks(orcidId, works.getBytes(StandardCharsets.UTF_8));
	} catch (Throwable t) {
		final DownloadedRecordData errDownloaded = new DownloadedRecordData();
		errDownloaded.setOrcidId(orcidId);
		errDownloaded.setLastModifiedDate(lastModifiedDate);
		errDownloaded.setStatusCode(-10);
		errDownloaded.setErrorMessage(t.getMessage());
		splittedDownloadedWorks.add(errDownloaded.toTuple2());
		errorParsingXMLFoundAcc.add(1);
		return splittedDownloadedWorks.iterator();
	}

	for (String w : splittedWorks) {
		// every split work inherits orcid id, date, status and error message of the bulk download
		final DownloadedRecordData downloaded = new DownloadedRecordData();
		downloaded.setOrcidId(orcidId);
		downloaded.setLastModifiedDate(lastModifiedDate);
		downloaded.setStatusCode(Integer.parseInt(statusCode));
		downloaded.setErrorMessage(errorMessage);
		try {
			downloaded
				.setCompressedData(
					ArgumentApplicationParser
						.compressArgument(w));
		} catch (Throwable t) {
			downloaded.setStatusCode(-11);
			downloaded.setErrorMessage(t.getMessage());
		}
		splittedDownloadedWorks.add(downloaded.toTuple2());
		// counted for every emitted work, including those marked with -11
		downloadedRecordsAcc.add(1);
	}

	return splittedDownloadedWorks.iterator();
};
updatedAuthorsRDD updatedAuthorsRDD
.flatMap(retrieveWorkUrlFunction) .flatMap(retrieveWorkUrlFunction)
.repartition(100) .repartition(100)
.map(downloadWorkFunction) .map(downloadWorksFunction)
.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) .flatMap(splitWorksFunction)
.mapToPair(w -> new Tuple2<>(new Text(w._1()), new Text(w._2())))
.saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class); .saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value()); logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value());
logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value()); logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value());
logger.info("parsedWorksAcc: {}", parsedWorksAcc.value()); logger.info("parsedWorksAcc: {}", parsedWorksAcc.value());
logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value()); logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value());
logger.info("maxModifiedWorksLimitAcc: {}", maxModifiedWorksLimitAcc.value());
logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value()); logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value());
logger.info("errorLoadingJsonFoundAcc: {}", errorLoadingJsonFoundAcc.value());
logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value()); logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value()); logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
logger.info("errorsAcc: {}", errorsAcc.value());
}); });
} }

View File

@ -83,8 +83,6 @@ public class MultiAttemptsHttpConnector {
throw new CollectorException(msg); throw new CollectorException(msg);
} }
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null; InputStream input = null;
try { try {
@ -104,9 +102,9 @@ public class MultiAttemptsHttpConnector {
urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken())); urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken()));
} }
if (log.isDebugEnabled()) { // if (log.isDebugEnabled()) {
logHeaderFields(urlConn); // logHeaderFields(urlConn);
} // }
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (is2xx(urlConn.getResponseCode())) { if (is2xx(urlConn.getResponseCode())) {

View File

@ -1,7 +1,11 @@
package eu.dnetlib.doiboost.orcid.xml; package eu.dnetlib.doiboost.orcid.xml;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.mortbay.log.Log; import org.mortbay.log.Log;
@ -34,6 +38,33 @@ public class XMLRecordParser {
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work"; private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_HISTORY = "history"; private static final String NS_HISTORY = "history";
private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history"; private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
private static final String NS_BULK_URL = "http://www.orcid.org/ns/bulk";
private static final String NS_BULK = "bulk";
private static final String namespaceList = " xmlns:internal=\"http://www.orcid.org/ns/internal\"\n" +
" xmlns:education=\"http://www.orcid.org/ns/education\"\n" +
" xmlns:distinction=\"http://www.orcid.org/ns/distinction\"\n" +
" xmlns:deprecated=\"http://www.orcid.org/ns/deprecated\"\n" +
" xmlns:other-name=\"http://www.orcid.org/ns/other-name\"\n" +
" xmlns:membership=\"http://www.orcid.org/ns/membership\"\n" +
" xmlns:error=\"http://www.orcid.org/ns/error\" xmlns:common=\"http://www.orcid.org/ns/common\"\n" +
" xmlns:record=\"http://www.orcid.org/ns/record\"\n" +
" xmlns:personal-details=\"http://www.orcid.org/ns/personal-details\"\n" +
" xmlns:keyword=\"http://www.orcid.org/ns/keyword\" xmlns:email=\"http://www.orcid.org/ns/email\"\n" +
" xmlns:external-identifier=\"http://www.orcid.org/ns/external-identifier\"\n" +
" xmlns:funding=\"http://www.orcid.org/ns/funding\"\n" +
" xmlns:preferences=\"http://www.orcid.org/ns/preferences\"\n" +
" xmlns:address=\"http://www.orcid.org/ns/address\"\n" +
" xmlns:invited-position=\"http://www.orcid.org/ns/invited-position\"\n" +
" xmlns:work=\"http://www.orcid.org/ns/work\" xmlns:history=\"http://www.orcid.org/ns/history\"\n" +
" xmlns:employment=\"http://www.orcid.org/ns/employment\"\n" +
" xmlns:qualification=\"http://www.orcid.org/ns/qualification\"\n" +
" xmlns:service=\"http://www.orcid.org/ns/service\" xmlns:person=\"http://www.orcid.org/ns/person\"\n" +
" xmlns:activities=\"http://www.orcid.org/ns/activities\"\n" +
" xmlns:researcher-url=\"http://www.orcid.org/ns/researcher-url\"\n" +
" xmlns:peer-review=\"http://www.orcid.org/ns/peer-review\"\n" +
" xmlns:bulk=\"http://www.orcid.org/ns/bulk\"\n" +
" xmlns:research-resource=\"http://www.orcid.org/ns/research-resource\"";
private static final String NS_ERROR = "error"; private static final String NS_ERROR = "error";
@ -307,4 +338,65 @@ public class XMLRecordParser {
} }
return authorHistory; return authorHistory;
} }
/**
 * Splits an XML document containing multiple {@code work:work} elements into one standalone XML
 * document per work.
 * <p>
 * Each resulting document starts with an XML declaration (UTF-8) followed by the work element. When
 * the work element has a {@code put-code} attribute, a {@code path} attribute of the form
 * {@code /<orcidId>/work/<put-code>} and the ORCID namespace declarations are added to it; a work
 * without {@code put-code} is returned without those additions.
 *
 * @param orcidId the ORCID identifier used to build the {@code path} attribute of each work
 * @param bytes   the bytes of the XML document to split; they are decoded as UTF-8, in agreement
 *                with the declaration written in front of each extracted work
 * @return the list of single-work XML documents, in document order (empty if no work is found)
 * @throws ParseException   if the input document cannot be parsed
 * @throws VtdException     if the extraction of the work elements fails
 * @throws RuntimeException if one of the extracted works cannot be parsed or modified
 */
public static List<String> splitWorks(String orcidId, byte[] bytes)
	throws ParseException, XPathParseException, NavException, XPathEvalException, VtdException, ModifyException,
	IOException, TranscodeException {

	final VTDGen vg = new VTDGen();
	vg.setDoc(bytes);
	vg.parse(true);
	final VTDNav vn = vg.getNav();
	final AutoPilot ap = new AutoPilot(vn);
	ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
	ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
	ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
	ap.declareXPathNameSpace(NS_BULK, NS_BULK_URL);

	// Step 1: cut every work element out of the document, prefixing it with an XML declaration.
	final byte[] xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"
		.getBytes(StandardCharsets.UTF_8);
	final List<String> works = new ArrayList<>();
	try {
		ap.selectXPath("//work:work");
		while (ap.evalXPath() != -1) {
			// getElementFragment packs the fragment offset in the lower 32 bits and its length in the upper 32
			final long fragment = vn.getElementFragment();
			final ByteArrayOutputStream bos = new ByteArrayOutputStream();
			bos.write(xmlHeader);
			bos.write(vn.getXML().getBytes(), (int) fragment, (int) (fragment >> 32));
			// explicit charset: the platform default must not be used for text declared as UTF-8
			works.add(new String(bos.toByteArray(), StandardCharsets.UTF_8));
		}
	} catch (Exception e) {
		throw new VtdException(e);
	}

	// Step 2: enrich the root element of every extracted work. The fragments no longer carry the
	// namespace declarations of the enclosing document, hence they are parsed without namespace
	// awareness and the declarations are re-added together with the path attribute.
	final VTDGen workGen = new VTDGen();
	final XMLModifier modifier = new XMLModifier();
	final List<String> updatedWorks = new ArrayList<>(works.size());
	for (String work : works) {
		try {
			workGen.setDoc(work.getBytes(StandardCharsets.UTF_8));
			workGen.parse(false);
			final VTDNav workNav = workGen.getNav();
			modifier.bind(workNav);
			workNav.toElement(VTDNav.ROOT);
			final int attr = workNav.getAttrVal("put-code");
			if (attr > -1) {
				modifier
					.insertAttribute(
						" path=\"/" + orcidId + "/work/" + workNav.toNormalizedString(attr) + "\""
							+ " " + namespaceList);
			}
			final ByteArrayOutputStream out = new ByteArrayOutputStream();
			modifier.output(out);
			updatedWorks.add(new String(out.toByteArray(), StandardCharsets.UTF_8));
		} catch (NavException | ModifyException | IOException | TranscodeException | ParseException e) {
			// the cause is preserved, so the failure is reported once by whoever handles the exception
			throw new RuntimeException("Error adding the path attribute to a work of " + orcidId, e);
		}
	}
	return updatedWorks;
}
} }

View File

@ -161,13 +161,11 @@ public class OrcidClientTest {
@Test @Test
@Disabled @Disabled
void testReadBase64CompressedRecord() throws Exception { void testReadBase64CompressedWork() throws Exception {
final String base64CompressedRecord = IOUtils final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile); logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
assertEquals(recordFromSeqFile, downloadedRecord);
} }
@Test @Test
@ -337,7 +335,7 @@ public class OrcidClientTest {
@Ignore @Ignore
void testUpdatedRecord() throws Exception { void testUpdatedRecord() throws Exception {
final String base64CompressedRecord = IOUtils final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64")); .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord); final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\nrecord updated \n\n" + record); logToFile(testPath, "\n\nrecord updated \n\n" + record);
} }

View File

@ -108,4 +108,12 @@ public class XMLRecordParserTest {
work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml)); work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
OrcidClientTest.logToFile(testPath, JsonWriter.create(work)); OrcidClientTest.logToFile(testPath, JsonWriter.create(work));
} }
/**
 * Checks that a document containing multiple works can be split into single works.
 */
@Test
void testWorksSplit() throws Exception {
	// NOTE(review): IOUtils.toString(InputStream) decodes with the platform default charset;
	// consider the overload taking an explicit charset if the IOUtils in use provides it.
	String xml = IOUtils
		.toString(
			this.getClass().getResourceAsStream("multiple_downloaded_works.xml"));
	// bytes are produced as UTF-8, the encoding splitWorks is given by its production caller
	boolean noWorkFound = XMLRecordParser
		.splitWorks("0000-0001-7291-3210", xml.getBytes(java.nio.charset.StandardCharsets.UTF_8))
		.isEmpty();
	// the resource is expected to contain work elements, so an empty result means the split failed
	if (noWorkFound) {
		throw new AssertionError("no work extracted from multiple_downloaded_works.xml");
	}
}
} }

View File

@ -0,0 +1 @@
H4sIAAAAAAAAAN1Y23LbNhB971dg+NAnkiKpSJZUS2luTerESSd22pm+QSQkISEBFiAlqxn9exe8CaRExkmc4Uw9Y9rEnrO72MUCC14+votCtCVCUs7mhms7BiLM5wFl67nx4fY3a2IgmWAW4JAzMjf2RBqPF5c7Lj7N1APFONnMjYEDPxb8utaFN3Gt8dAZD5R8MHWHk+nF1DUQAlNMzihLiGA4nBubJIlng8Fut7O58GkAz/WAyUGJMH5CqGSRIPVxkjnZRqsgOi+gMqHM72ZqoBqXxIKAShJ0UCuMzuTJhgiL4Yi0M48YnRmRaAnZ2NC4nXnE1CIkBBcd0VFio8D6PIq6ApLLde0wSS464pDLdUYMLnLIohWQBNNQtnObSF3LJ7LfdRouAOXMSAQaOqKgxLWo3eVrzaIBYQldUdIVw1OwrmuVsrxu2vgFoBYlQVZEQMmRrgAdQToXB4EgsoNXAHQOZVsKi9WKuaTdRdFE6lpUZbczlbTMxwZKi4t9O7gA1HISxSHfRxDirkSWGJ35T4pDSMuXdooaTOdLIrbU7yjaAmDU1viXVnYtZ7DLQFxpV7qPmHoFSoKFrzaNVHQs8TquXpEwJsiWkl2XyxVI5y7TsCPjSnrOV1AkeSq6InoCNVCcJhYcQUA6Hh5bKumShjSBpRSny5D6xiIzqH4u8/1q5guidmIrgOfCczzXcoaWN711vdnQm7mPbGfs/X05OIc+0RVimVgRHIRQ5UeNnuWMLce9dUDdaOY59tgdHjWe4ZzozSd5HD+VWX5IYV3DJlNH6chU0IWKqISQHsOZE6uz2LNG04lnTaaTYeWiIrZqVWf5ooudAVrpGy6TReVNRcqG6/Md3GvCjbCoo3Jx4/M4lchCL0KpFqlo6spQZ9VgCdWrKt7igq6p+uN/fYzPNDrfENxz7IcO7n3m2xqbLIxXXG5SjJ7idL1pV1uPeCMfmiDrGROahC35yUXPOHR/UcwFFnskU9hutziEnjSIOfSFcoaeMFQ0iMoJkEG5rVJJ1KigTFIfxaCDMoLWIeURRoKs4ZBR6pI02FcONly5HJxzMPf6I8xFnfu58C1JBbfeQZsc8vW+4NUhDb5Pk8zbxsRrMivZx2SxpMuE3BU666IuLsQoJYtfMSTGD8nnLGOe416YmTtojj7/8LgezCIEylo9RAdzD3u8Glc+HcwtD9Mo88qdHkyWqnZWvcFLjNdEZhLvYmq53sQ5mDhNNlzkk4BLyN5EtzaCKwl6gxkx0ZP85SlMnoTSRB+Kd56uViQx0Yv8/SUPgwgzE90UZHBpr95e2MXIb1yQDPHWfp2P/IH9T0SY6L19VSgVnFHpq7HC7DWEB6Ztoiu7MHSzoRsTPbOtQu2zDUDwOo1iHGITXeejr6COcBhWc3nJkwSLgCvrL/Oh5xseYkGB86rg8NUqc/BNqRln4XhaRgCyrhzJ2RzeMvT7asJ+Ji7YVxBLqch/ltNPQxzQysO/sICe00Svy4ldc/aRKPHh0Fyg+fpr1tLpsi82AbWcy4Ip1mxZfrWVXu2d2Ymfm6ofqzpKLbKFWmFViWcjp1tTu7pSldbpy/PGNET7pq2B8hoOOK28OBHeS00eadexXWc6HDCScuYPGL9znYuzmhuZ6VLNuIigMf6XBCgRGCo+68ATkRLjKwwetdzPqiBhlgl1n11IEq7Oaq2hzp93rRn5vpQRGjxIyjxLerZjTUbO0L2YjkfjRz8yX/e09n9LFpWSPUyBjbzhaDIeI/jHm4zcH1tcYMxS1h4+RzFsrxZ/2DSdk8rTPRRunwvt1iezzt0G4YCyHRx1xTcjG3CPocjmp0v2ZxzFv6gZMCJ+fz6/fju5fffk/Y3Wb4cnnRZX3coyTbhobtxN+Zlo5hBBAprkbe2x4SiPNE3YCFm3/m8yXzY4vRjXGqp+7B8buF7saw1jP8nXG9RePKg1xL14oDfg/SxCveHvxYPaBaMXD7QLTS/2Ty5QvXihXdh62o70C2Iv
LugX0n5ycLwA97QSywt3TydyccHvJ/vaB4W+DsTyA0Yv9rUPJj0dx9UHml7s6x+E+jkKyw9Q32P9VFZcFAqBeiz+A4MY5OQYIQAA

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<bulk:bulk xmlns:internal="http://www.orcid.org/ns/internal" xmlns:education="http://www.orcid.org/ns/education" xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:deprecated="http://www.orcid.org/ns/deprecated" xmlns:other-name="http://www.orcid.org/ns/other-name" xmlns:membership="http://www.orcid.org/ns/membership" xmlns:error="http://www.orcid.org/ns/error" xmlns:common="http://www.orcid.org/ns/common" xmlns:record="http://www.orcid.org/ns/record" xmlns:personal-details="http://www.orcid.org/ns/personal-details" xmlns:keyword="http://www.orcid.org/ns/keyword" xmlns:email="http://www.orcid.org/ns/email" xmlns:external-identifier="http://www.orcid.org/ns/external-identifier" xmlns:funding="http://www.orcid.org/ns/funding" xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:address="http://www.orcid.org/ns/address" xmlns:invited-position="http://www.orcid.org/ns/invited-position" xmlns:work="http://www.orcid.org/ns/work" xmlns:history="http://www.orcid.org/ns/history" xmlns:employment="http://www.orcid.org/ns/employment" xmlns:qualification="http://www.orcid.org/ns/qualification" xmlns:service="http://www.orcid.org/ns/service" xmlns:person="http://www.orcid.org/ns/person" xmlns:activities="http://www.orcid.org/ns/activities" xmlns:researcher-url="http://www.orcid.org/ns/researcher-url" xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:research-resource="http://www.orcid.org/ns/research-resource">
<work:work put-code="16639612" visibility="public">
<common:created-date>2015-05-23T18:56:52.486Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.454Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL</common:title>
</work:title>
<work:journal-title>5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002}, Ee= {http://www.db.ucsd.edu/webdb2002/papers/43.pdf}, Pages= {13-18}, Title= {The Query Language TQL}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
<work:work put-code="16639628" visibility="public">
<common:created-date>2015-05-23T18:58:18.492Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.455Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL - Demo Presentation</common:title>
</work:title>
<work:journal-title>X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002Demo, Address= {Portoferraio, Italy}, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)}, Month= {June}, Pages= {427-431}, Title= {The Query Language TQL - Demo Presentation}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
</bulk:bulk>

View File

@ -7,5 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout. # A1 uses PatternLayout.
log4j.logger.org = ERROR log4j.logger.org = ERROR
log4j.logger.eu.dnetlib = DEBUG log4j.logger.eu.dnetlib = DEBUG
log4j.logger.eu.dnetlib.doiboost.orcid = INFO
log4j.appender.A1.layout=org.apache.log4j.PatternLayout log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n