This commit is contained in:
Claudio Atzori 2022-09-16 15:56:28 +02:00
commit dbb567251a
14 changed files with 829 additions and 45 deletions

View File

@ -3,6 +3,8 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.*;
@ -13,6 +15,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
@ -20,6 +23,7 @@ import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
@ -42,6 +46,7 @@ public class SparkDownloadOrcidWorks {
public static final String ORCID_XML_DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
public static final DateTimeFormatter ORCID_XML_DATETIMEFORMATTER = DateTimeFormatter
.ofPattern(ORCID_XML_DATETIME_FORMAT);
public static final String DOWNLOAD_WORKS_REQUEST_SEPARATOR = ",";
public static void main(String[] args) throws Exception {
@ -56,7 +61,6 @@ public class SparkDownloadOrcidWorks {
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
@ -69,32 +73,22 @@ public class SparkDownloadOrcidWorks {
isSparkSessionManaged,
spark -> {
final String lastUpdateValue = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
logger.info("lastUpdateValue: ", lastUpdateValue);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator updatedAuthorsAcc = spark.sparkContext().longAccumulator("updated_authors");
LongAccumulator parsedAuthorsAcc = spark.sparkContext().longAccumulator("parsed_authors");
LongAccumulator parsedWorksAcc = spark.sparkContext().longAccumulator("parsed_works");
LongAccumulator modifiedWorksAcc = spark.sparkContext().longAccumulator("modified_works");
LongAccumulator maxModifiedWorksLimitAcc = spark
.sparkContext()
.longAccumulator("max_modified_works_limit");
LongAccumulator errorCodeFoundAcc = spark.sparkContext().longAccumulator("error_code_found");
LongAccumulator errorLoadingJsonFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_json_found");
LongAccumulator errorLoadingXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_loading_xml_found");
LongAccumulator errorParsingXMLFoundAcc = spark
.sparkContext()
.longAccumulator("error_parsing_xml_found");
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
LongAccumulator errorsAcc = spark.sparkContext().longAccumulator("errors");
JavaPairRDD<Text, Text> updatedAuthorsRDD = sc
.sequenceFile(workingPath + "downloads/updated_authors/*", Text.class, Text.class);
updatedAuthorsAcc.setValue(updatedAuthorsRDD.count());
long authorsCount = updatedAuthorsRDD.count();
updatedAuthorsAcc.setValue(authorsCount);
FlatMapFunction<Tuple2<Text, Text>, String> retrieveWorkUrlFunction = data -> {
String orcidId = data._1().toString();
@ -106,11 +100,10 @@ public class SparkDownloadOrcidWorks {
if (statusCode.equals("200")) {
String compressedData = getJsonValue(jElement, "compressedData");
if (StringUtils.isEmpty(compressedData)) {
errorLoadingJsonFoundAcc.add(1);
} else {
String authorSummary = ArgumentApplicationParser.decompressValue(compressedData);
if (StringUtils.isEmpty(authorSummary)) {
errorLoadingXMLFoundAcc.add(1);
} else {
try {
workIdLastModifiedDate = XMLRecordParser
@ -125,22 +118,38 @@ public class SparkDownloadOrcidWorks {
errorCodeFoundAcc.add(1);
}
parsedAuthorsAcc.add(1);
workIdLastModifiedDate.forEach((k, v) -> {
parsedWorksAcc.add(1);
if (isModified(orcidId, v, lastUpdateValue)) {
modifiedWorksAcc.add(1);
workIds.add(orcidId.concat("/work/").concat(k));
workIds.add(k);
}
});
if (workIdLastModifiedDate.size() > 50) {
maxModifiedWorksLimitAcc.add(1);
if (workIds.isEmpty()) {
return new ArrayList<String>().iterator();
}
return workIds.iterator();
List<String> worksDownloadUrls = new ArrayList<>();
// Creation of url for reading multiple works (up to 100) with ORCID API
// see this https://github.com/ORCID/ORCID-Source/blob/development/orcid-api-web/tutorial/works.md
List<List<String>> partitionedWorks = Lists.partition(workIds, 100);
partitionedWorks.stream().forEach(p -> {
String worksDownloadUrl = orcidId.concat("/works/");
final StringBuffer buffer = new StringBuffer(worksDownloadUrl);
p.forEach(id -> {
buffer.append(id).append(DOWNLOAD_WORKS_REQUEST_SEPARATOR);
});
String finalUrl = buffer.substring(0, buffer.lastIndexOf(DOWNLOAD_WORKS_REQUEST_SEPARATOR));
worksDownloadUrls.add(finalUrl);
});
return worksDownloadUrls.iterator();
};
Function<String, Tuple2<String, String>> downloadWorkFunction = data -> {
String relativeWorkUrl = data;
String orcidId = relativeWorkUrl.split("/")[0];
Function<String, Tuple2<String, String>> downloadWorksFunction = data -> {
String relativeWorksUrl = data;
String orcidId = relativeWorksUrl.split("/")[0];
final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastUpdateValue);
@ -149,7 +158,7 @@ public class SparkDownloadOrcidWorks {
httpConnector.setAuthMethod(MultiAttemptsHttpConnector.BEARER);
httpConnector.setAcceptHeaderValue("application/vnd.orcid+xml");
httpConnector.setAuthToken(token);
String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorkUrl;
String apiUrl = "https://api.orcid.org/v3.0/" + relativeWorksUrl;
DownloadsReport report = new DownloadsReport();
long startReq = System.currentTimeMillis();
boolean downloadCompleted = false;
@ -167,7 +176,6 @@ public class SparkDownloadOrcidWorks {
} else {
downloaded.setStatusCode(-4);
}
errorsAcc.add(1);
}
long endReq = System.currentTimeMillis();
long reqTime = endReq - startReq;
@ -176,7 +184,6 @@ public class SparkDownloadOrcidWorks {
}
if (downloadCompleted) {
downloaded.setStatusCode(200);
downloadedRecordsAcc.add(1);
downloaded
.setCompressedData(
ArgumentApplicationParser
@ -185,24 +192,69 @@ public class SparkDownloadOrcidWorks {
return downloaded.toTuple2();
};
// Turns one downloaded bulk record (orcidId, json) into one record per work.
// The json is expected to carry statusCode, lastModifiedDate, compressedData and errorMessage;
// compressedData holds the compressed XML returned by the bulk works request.
// When the payload cannot be decompressed or split, a single error record with status -10 is emitted.
FlatMapFunction<Tuple2<String, String>, Tuple2<String, String>> splitWorksFunction = data -> {
	List<Tuple2<String, String>> splittedDownloadedWorks = new ArrayList<>();
	String jsonData = data._2().toString();
	JsonElement jElement = new JsonParser().parse(jsonData);
	String orcidId = data._1().toString();
	String statusCode = getJsonValue(jElement, "statusCode");
	String lastModifiedDate = getJsonValue(jElement, "lastModifiedDate");
	String compressedData = getJsonValue(jElement, "compressedData");
	String errorMessage = getJsonValue(jElement, "errorMessage");
	// split a single xml containing multiple works into multiple xml (a single work for each xml)
	List<String> splittedWorks;
	try {
		// Decompression is done inside the try block on purpose: a missing or corrupt payload
		// must end up as an error record, not as an exception escaping the function.
		String works = ArgumentApplicationParser.decompressValue(compressedData);
		splittedWorks = XMLRecordParser
			.splitWorks(orcidId, works.getBytes(StandardCharsets.UTF_8));
	} catch (Throwable t) {
		final DownloadedRecordData errDownloaded = new DownloadedRecordData();
		errDownloaded.setOrcidId(orcidId);
		errDownloaded.setLastModifiedDate(lastModifiedDate);
		errDownloaded.setStatusCode(-10);
		errDownloaded.setErrorMessage(t.getMessage());
		splittedDownloadedWorks.add(errDownloaded.toTuple2());
		errorParsingXMLFoundAcc.add(1);
		return splittedDownloadedWorks.iterator();
	}
	splittedWorks.forEach(w -> {
		// every work inherits the metadata of the bulk download it was extracted from
		final DownloadedRecordData downloaded = new DownloadedRecordData();
		downloaded.setOrcidId(orcidId);
		downloaded.setLastModifiedDate(lastModifiedDate);
		downloaded.setStatusCode(Integer.parseInt(statusCode));
		downloaded.setErrorMessage(errorMessage);
		try {
			downloaded
				.setCompressedData(
					ArgumentApplicationParser
						.compressArgument(w));
		} catch (Throwable t) {
			// the record is still emitted, flagged with status -11 and the failure message
			downloaded.setStatusCode(-11);
			downloaded.setErrorMessage(t.getMessage());
		}
		splittedDownloadedWorks.add(downloaded.toTuple2());
		downloadedRecordsAcc.add(1);
	});
	return splittedDownloadedWorks.iterator();
};
updatedAuthorsRDD
.flatMap(retrieveWorkUrlFunction)
.repartition(100)
.map(downloadWorkFunction)
.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
.map(downloadWorksFunction)
.flatMap(splitWorksFunction)
.mapToPair(w -> new Tuple2<>(new Text(w._1()), new Text(w._2())))
.saveAsTextFile(workingPath.concat(outputPath), GzipCodec.class);
logger.info("updatedAuthorsAcc: {}", updatedAuthorsAcc.value());
logger.info("parsedAuthorsAcc: {}", parsedAuthorsAcc.value());
logger.info("parsedWorksAcc: {}", parsedWorksAcc.value());
logger.info("modifiedWorksAcc: {}", modifiedWorksAcc.value());
logger.info("maxModifiedWorksLimitAcc: {}", maxModifiedWorksLimitAcc.value());
logger.info("errorCodeFoundAcc: {}", errorCodeFoundAcc.value());
logger.info("errorLoadingJsonFoundAcc: {}", errorLoadingJsonFoundAcc.value());
logger.info("errorLoadingXMLFoundAcc: {}", errorLoadingXMLFoundAcc.value());
logger.info("errorParsingXMLFoundAcc: {}", errorParsingXMLFoundAcc.value());
logger.info("downloadedRecordsAcc: {}", downloadedRecordsAcc.value());
logger.info("errorsAcc: {}", errorsAcc.value());
});
}

View File

@ -83,8 +83,6 @@ public class MultiAttemptsHttpConnector {
throw new CollectorException(msg);
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null;
try {
@ -104,9 +102,9 @@ public class MultiAttemptsHttpConnector {
urlConn.addRequestProperty(HttpHeaders.AUTHORIZATION, String.format("Bearer %s", getAuthToken()));
}
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
// if (log.isDebugEnabled()) {
// logHeaderFields(urlConn);
// }
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (is2xx(urlConn.getResponseCode())) {

View File

@ -1,7 +1,11 @@
package eu.dnetlib.doiboost.orcid.xml;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.mortbay.log.Log;
@ -34,6 +38,33 @@ public class XMLRecordParser {
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_HISTORY = "history";
private static final String NS_HISTORY_URL = "http://www.orcid.org/ns/history";
private static final String NS_BULK_URL = "http://www.orcid.org/ns/bulk";
private static final String NS_BULK = "bulk";
/**
 * Block of {@code xmlns:*} declarations for the ORCID namespaces. It is appended to the root element of every
 * work extracted by {@code splitWorks}.
 */
private static final String namespaceList = " xmlns:internal=\"http://www.orcid.org/ns/internal\"\n" +
" xmlns:education=\"http://www.orcid.org/ns/education\"\n" +
" xmlns:distinction=\"http://www.orcid.org/ns/distinction\"\n" +
" xmlns:deprecated=\"http://www.orcid.org/ns/deprecated\"\n" +
" xmlns:other-name=\"http://www.orcid.org/ns/other-name\"\n" +
" xmlns:membership=\"http://www.orcid.org/ns/membership\"\n" +
" xmlns:error=\"http://www.orcid.org/ns/error\" xmlns:common=\"http://www.orcid.org/ns/common\"\n" +
" xmlns:record=\"http://www.orcid.org/ns/record\"\n" +
" xmlns:personal-details=\"http://www.orcid.org/ns/personal-details\"\n" +
" xmlns:keyword=\"http://www.orcid.org/ns/keyword\" xmlns:email=\"http://www.orcid.org/ns/email\"\n" +
" xmlns:external-identifier=\"http://www.orcid.org/ns/external-identifier\"\n" +
" xmlns:funding=\"http://www.orcid.org/ns/funding\"\n" +
" xmlns:preferences=\"http://www.orcid.org/ns/preferences\"\n" +
" xmlns:address=\"http://www.orcid.org/ns/address\"\n" +
" xmlns:invited-position=\"http://www.orcid.org/ns/invited-position\"\n" +
" xmlns:work=\"http://www.orcid.org/ns/work\" xmlns:history=\"http://www.orcid.org/ns/history\"\n" +
" xmlns:employment=\"http://www.orcid.org/ns/employment\"\n" +
" xmlns:qualification=\"http://www.orcid.org/ns/qualification\"\n" +
" xmlns:service=\"http://www.orcid.org/ns/service\" xmlns:person=\"http://www.orcid.org/ns/person\"\n" +
" xmlns:activities=\"http://www.orcid.org/ns/activities\"\n" +
" xmlns:researcher-url=\"http://www.orcid.org/ns/researcher-url\"\n" +
" xmlns:peer-review=\"http://www.orcid.org/ns/peer-review\"\n" +
" xmlns:bulk=\"http://www.orcid.org/ns/bulk\"\n" +
" xmlns:research-resource=\"http://www.orcid.org/ns/research-resource\"";
private static final String NS_ERROR = "error";
@ -307,4 +338,65 @@ public class XMLRecordParser {
}
return authorHistory;
}
/**
 * Splits an ORCID bulk-works XML document into one standalone XML document per work.
 * <p>
 * Every {@code work:work} element of the input is copied out and prefixed with an XML declaration. Its root
 * element is then enriched with a {@code path} attribute ({@code /<orcidId>/work/<put-code>}) followed by the
 * namespace declarations held in {@code namespaceList}. A work without a {@code put-code} attribute is returned
 * without those additions.
 * <p>
 * Extracted fragments are decoded and re-encoded as UTF-8, matching the encoding announced by the XML declaration
 * that is prepended to every work.
 *
 * @param orcidId the ORCID identifier used to build the {@code path} attribute of each work
 * @param bytes the bulk XML document
 * @return the standalone work documents, in the order in which they are selected
 * @throws VtdException if selecting or extracting the {@code work:work} elements fails
 * @throws RuntimeException wrapping the underlying cause, if an extracted work cannot be re-parsed or rewritten
 */
public static List<String> splitWorks(String orcidId, byte[] bytes)
	throws ParseException, XPathParseException, NavException, XPathEvalException, VtdException, ModifyException,
	IOException, TranscodeException {
	final VTDGen vg = new VTDGen();
	vg.setDoc(bytes);
	vg.parse(true);
	final VTDNav vn = vg.getNav();
	final AutoPilot ap = new AutoPilot(vn);
	ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
	ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
	ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
	ap.declareXPathNameSpace(NS_BULK, NS_BULK_URL);

	// Step 1: cut every work:work element out of the bulk document.
	final String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>";
	final List<String> works = new ArrayList<>();
	try {
		// the raw document bytes do not change while navigating, fetch them once
		final byte[] document = vn.getXML().getBytes();
		ap.selectXPath("//work:work");
		while (ap.evalXPath() != -1) {
			// getElementFragment packs the fragment offset in the lower 32 bits and its length in the upper 32
			final long fragment = vn.getElementFragment();
			final int offset = (int) fragment;
			final int length = (int) (fragment >> 32);
			works.add(xmlHeader + new String(document, offset, length, StandardCharsets.UTF_8));
		}
	} catch (Exception e) {
		throw new VtdException(e);
	}

	// Step 2: re-parse every work on its own and decorate its root element.
	final VTDGen workGen = new VTDGen();
	final XMLModifier modifier = new XMLModifier();
	final List<String> updatedWorks = new ArrayList<>(works.size());
	try {
		for (final String work : works) {
			workGen.setDoc(work.getBytes(StandardCharsets.UTF_8));
			workGen.parse(false);
			final VTDNav workNav = workGen.getNav();
			modifier.bind(workNav);
			workNav.toElement(VTDNav.ROOT);
			final int attr = workNav.getAttrVal("put-code");
			if (attr > -1) {
				modifier
					.insertAttribute(
						" path=\"/" + orcidId + "/work/" + workNav.toNormalizedString(attr) + "\""
							+ " " + namespaceList);
			}
			final ByteArrayOutputStream out = new ByteArrayOutputStream();
			modifier.output(out);
			updatedWorks.add(new String(out.toByteArray(), StandardCharsets.UTF_8));
		}
	} catch (NavException | ModifyException | IOException | TranscodeException | ParseException e) {
		// kept unchecked, as before; the cause carries the full stack trace
		throw new RuntimeException(e);
	}
	return updatedWorks;
}
}

View File

@ -161,13 +161,11 @@ public class OrcidClientTest {
@Test
@Disabled
void testReadBase64CompressedRecord() throws Exception {
void testReadBase64CompressedWork() throws Exception {
final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
.toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\ndownloaded \n\n" + recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161", REQUEST_TYPE_RECORD);
assertEquals(recordFromSeqFile, downloadedRecord);
}
@Test
@ -337,7 +335,7 @@ public class OrcidClientTest {
@Ignore
void testUpdatedRecord() throws Exception {
final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
.toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
final String record = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
logToFile(testPath, "\n\nrecord updated \n\n" + record);
}

View File

@ -108,4 +108,12 @@ public class XMLRecordParserTest {
work.setBase64CompressData(ArgumentApplicationParser.compressArgument(xml));
OrcidClientTest.logToFile(testPath, JsonWriter.create(work));
}
/**
 * Splits the bulk fixture {@code multiple_downloaded_works.xml} and checks the number of works produced.
 */
@Test
void testWorksSplit() throws Exception {
	// read and encode explicitly as UTF-8 so the test does not depend on the platform charset
	final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
	final String xml = IOUtils
		.toString(
			this.getClass().getResourceAsStream("multiple_downloaded_works.xml"), utf8);
	final int worksCount = XMLRecordParser.splitWorks("0000-0001-7291-3210", xml.getBytes(utf8)).size();
	// NOTE(review): 2 assumes the fixture is the bulk document with two work:work elements - confirm
	if (worksCount != 2) {
		throw new AssertionError("expected 2 works after the split but found " + worksCount);
	}
}
}

View File

@ -0,0 +1 @@
H4sIAAAAAAAAAN1Y23LbNhB971dg+NAnkiKpSJZUS2luTerESSd22pm+QSQkISEBFiAlqxn9exe8CaRExkmc4Uw9Y9rEnrO72MUCC14+votCtCVCUs7mhms7BiLM5wFl67nx4fY3a2IgmWAW4JAzMjf2RBqPF5c7Lj7N1APFONnMjYEDPxb8utaFN3Gt8dAZD5R8MHWHk+nF1DUQAlNMzihLiGA4nBubJIlng8Fut7O58GkAz/WAyUGJMH5CqGSRIPVxkjnZRqsgOi+gMqHM72ZqoBqXxIKAShJ0UCuMzuTJhgiL4Yi0M48YnRmRaAnZ2NC4nXnE1CIkBBcd0VFio8D6PIq6ApLLde0wSS464pDLdUYMLnLIohWQBNNQtnObSF3LJ7LfdRouAOXMSAQaOqKgxLWo3eVrzaIBYQldUdIVw1OwrmuVsrxu2vgFoBYlQVZEQMmRrgAdQToXB4EgsoNXAHQOZVsKi9WKuaTdRdFE6lpUZbczlbTMxwZKi4t9O7gA1HISxSHfRxDirkSWGJ35T4pDSMuXdooaTOdLIrbU7yjaAmDU1viXVnYtZ7DLQFxpV7qPmHoFSoKFrzaNVHQs8TquXpEwJsiWkl2XyxVI5y7TsCPjSnrOV1AkeSq6InoCNVCcJhYcQUA6Hh5bKumShjSBpRSny5D6xiIzqH4u8/1q5guidmIrgOfCczzXcoaWN711vdnQm7mPbGfs/X05OIc+0RVimVgRHIRQ5UeNnuWMLce9dUDdaOY59tgdHjWe4ZzozSd5HD+VWX5IYV3DJlNH6chU0IWKqISQHsOZE6uz2LNG04lnTaaTYeWiIrZqVWf5ooudAVrpGy6TReVNRcqG6/Md3GvCjbCoo3Jx4/M4lchCL0KpFqlo6spQZ9VgCdWrKt7igq6p+uN/fYzPNDrfENxz7IcO7n3m2xqbLIxXXG5SjJ7idL1pV1uPeCMfmiDrGROahC35yUXPOHR/UcwFFnskU9hutziEnjSIOfSFcoaeMFQ0iMoJkEG5rVJJ1KigTFIfxaCDMoLWIeURRoKs4ZBR6pI02FcONly5HJxzMPf6I8xFnfu58C1JBbfeQZsc8vW+4NUhDb5Pk8zbxsRrMivZx2SxpMuE3BU666IuLsQoJYtfMSTGD8nnLGOe416YmTtojj7/8LgezCIEylo9RAdzD3u8Glc+HcwtD9Mo88qdHkyWqnZWvcFLjNdEZhLvYmq53sQ5mDhNNlzkk4BLyN5EtzaCKwl6gxkx0ZP85SlMnoTSRB+Kd56uViQx0Yv8/SUPgwgzE90UZHBpr95e2MXIb1yQDPHWfp2P/IH9T0SY6L19VSgVnFHpq7HC7DWEB6Ztoiu7MHSzoRsTPbOtQu2zDUDwOo1iHGITXeejr6COcBhWc3nJkwSLgCvrL/Oh5xseYkGB86rg8NUqc/BNqRln4XhaRgCyrhzJ2RzeMvT7asJ+Ji7YVxBLqch/ltNPQxzQysO/sICe00Svy4ldc/aRKPHh0Fyg+fpr1tLpsi82AbWcy4Ip1mxZfrWVXu2d2Ymfm6ofqzpKLbKFWmFViWcjp1tTu7pSldbpy/PGNET7pq2B8hoOOK28OBHeS00eadexXWc6HDCScuYPGL9znYuzmhuZ6VLNuIigMf6XBCgRGCo+68ATkRLjKwwetdzPqiBhlgl1n11IEq7Oaq2hzp93rRn5vpQRGjxIyjxLerZjTUbO0L2YjkfjRz8yX/e09n9LFpWSPUyBjbzhaDIeI/jHm4zcH1tcYMxS1h4+RzFsrxZ/2DSdk8rTPRRunwvt1iezzt0G4YCyHRx1xTcjG3CPocjmp0v2ZxzFv6gZMCJ+fz6/fju5fffk/Y3Wb4cnnRZX3coyTbhobtxN+Zlo5hBBAprkbe2x4SiPNE3YCFm3/m8yXzY4vRjXGqp+7B8buF7saw1jP8nXG9RePKg1xL14oDfg/SxCveHvxYPaBaMXD7QLTS/2Ty5QvXihXdh62o70C2Iv
LugX0n5ycLwA97QSywt3TydyccHvJ/vaB4W+DsTyA0Yv9rUPJj0dx9UHml7s6x+E+jkKyw9Q32P9VFZcFAqBeiz+A4MY5OQYIQAA

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<bulk:bulk xmlns:internal="http://www.orcid.org/ns/internal" xmlns:education="http://www.orcid.org/ns/education" xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:deprecated="http://www.orcid.org/ns/deprecated" xmlns:other-name="http://www.orcid.org/ns/other-name" xmlns:membership="http://www.orcid.org/ns/membership" xmlns:error="http://www.orcid.org/ns/error" xmlns:common="http://www.orcid.org/ns/common" xmlns:record="http://www.orcid.org/ns/record" xmlns:personal-details="http://www.orcid.org/ns/personal-details" xmlns:keyword="http://www.orcid.org/ns/keyword" xmlns:email="http://www.orcid.org/ns/email" xmlns:external-identifier="http://www.orcid.org/ns/external-identifier" xmlns:funding="http://www.orcid.org/ns/funding" xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:address="http://www.orcid.org/ns/address" xmlns:invited-position="http://www.orcid.org/ns/invited-position" xmlns:work="http://www.orcid.org/ns/work" xmlns:history="http://www.orcid.org/ns/history" xmlns:employment="http://www.orcid.org/ns/employment" xmlns:qualification="http://www.orcid.org/ns/qualification" xmlns:service="http://www.orcid.org/ns/service" xmlns:person="http://www.orcid.org/ns/person" xmlns:activities="http://www.orcid.org/ns/activities" xmlns:researcher-url="http://www.orcid.org/ns/researcher-url" xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:research-resource="http://www.orcid.org/ns/research-resource">
<work:work put-code="16639612" visibility="public">
<common:created-date>2015-05-23T18:56:52.486Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.454Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL</common:title>
</work:title>
<work:journal-title>5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {5th International Workshop on Web and Data Bases (WebDB02) in conjunction with ACM SIGMOD 2002}, Ee= {http://www.db.ucsd.edu/webdb2002/papers/43.pdf}, Pages= {13-18}, Title= {The Query Language TQL}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
<work:work put-code="16639628" visibility="public">
<common:created-date>2015-05-23T18:58:18.492Z</common:created-date>
<common:last-modified-date>2017-02-28T08:22:12.455Z</common:last-modified-date>
<common:source>
<common:source-orcid>
<common:uri>https://orcid.org/0000-0001-7291-3210</common:uri>
<common:path>0000-0001-7291-3210</common:path>
<common:host>orcid.org</common:host>
</common:source-orcid>
<common:source-name>Paolo Manghi</common:source-name>
</common:source>
<work:title>
<common:title>The Query Language TQL - Demo Presentation</common:title>
</work:title>
<work:journal-title>X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)</work:journal-title>
<work:citation>
<work:citation-type>bibtex</work:citation-type>
<work:citation-value>@inproceedings{Conforti2002Demo, Address= {Portoferraio, Italy}, Author= {Giovanni Conforti and Giorgio Ghelli and Antonio Albano and Dario Colazzo and Paolo Manghi and Carlo Sartiani}, Bibsource= {DBLP, http://dblp.uni-trier.de}, Booktitle= {X Convegno nazionale su Sistemi Evoluti per Basi di Dati (SEBD)}, Month= {June}, Pages= {427-431}, Title= {The Query Language TQL - Demo Presentation}, Year= {2002}}
</work:citation-value>
</work:citation>
<work:type>conference-paper</work:type>
<common:publication-date>
<common:year>2002</common:year>
</common:publication-date>
<common:external-ids/>
</work:work>
</bulk:bulk>

View File

@ -7,5 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.logger.org = ERROR
log4j.logger.eu.dnetlib = DEBUG
log4j.logger.eu.dnetlib.doiboost.orcid = INFO
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@ -25,7 +25,8 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "https://dx.doi.org/";
public static final String HTTP_DOI_PREIFX = "https://doi.org/";
public static final String HTTP_HANDLE_PREIFX = "http://hdl.handle.net/";
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
@ -172,10 +173,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
}
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) {
url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) {
url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim());
url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc
.selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) {
url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim());
}
if (!url.isEmpty()) {
instance.setUrl(new ArrayList<>());

View File

@ -935,6 +935,18 @@ class MappersTest {
System.out.println("***************");
}
/**
 * Maps the {@code riunet.xml} record, prints the mapped entities as JSON and checks that the first instance of
 * the first entity has a first url.
 */
@Test
void testRiunet() throws IOException, DocumentException {
	final String mdRecord = IOUtils
		.toString(
			Objects.requireNonNull(getClass().getResourceAsStream("riunet.xml")));

	final OdfToOafMapper mapper = new OdfToOafMapper(vocs, false, true);
	final List<Oaf> mapped = mapper.processMdRecord(mdRecord);

	// dump the mapping result between two separator lines
	final String mappedJson = new ObjectMapper().writeValueAsString(mapped);
	System.out.println("***************");
	System.out.println(mappedJson);
	System.out.println("***************");

	final Publication publication = (Publication) mapped.get(0);
	assertNotNull(publication.getInstance().get(0).getUrl().get(0));
}
@Test
void testNotWellFormed() throws IOException {
final String xml = IOUtils

View File

@ -0,0 +1,71 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="HTTP://www.openarchives.org/OAI/2.0/">
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:riunet.upv.es:10251/178464</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2022-05-10T09:12:14Z</datestamp>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">com_10251_3822</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">col_10251_169956</setSpec>
<dr:dateOfTransformation>2022-09-01T07:51:12.657Z</dr:dateOfTransformation>
<dri:objIdentifier>od______1560::8f7a139735f493882bb0f4abceb6e200</dri:objIdentifier>
<dri:recordIdentifier>od______1560::8f7a139735f493882bb0f4abceb6e200</dri:recordIdentifier>
<dri:dateOfCollection>2019-03-27T15:15:22.22Z</dri:dateOfCollection>
<oaf:datasourceprefix>riunet________</oaf:datasourceprefix>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="Handle">10251/178464</datacite:identifier>
<datacite:alternateIdentifiers/>
<datacite:relatedIdentifiers/>
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_8544">lecture</datacite:resourceType>
<datacite:rightsList>
<datacite:rights rightsURI=" http://creativecommons.org/licenses/by-nc/4.0/">http://creativecommons.org/licenses/by-nc/4.0/</datacite:rights>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:rightsList>
<datacite:titles>
<datacite:title>Discurso de inauguración del curso academico 1990-1991 de la Universitat Politècnica de València</datacite:title>
</datacite:titles>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">[ES] Discurso de Justo Nieto en el acto de inauguración del curso académico 1990-1991</datacite:description>
<datacite:description descriptionType="Abstract">[EN] Inaugural speech by Justo Nieto at the opening ceremony of the 1990-1991 academic year</datacite:description>
</datacite:descriptions>
<datacite:language>spa</datacite:language>
<datacite:formats>
<datacite:format>application/pdf</datacite:format>
<datacite:format>5055377</datacite:format>
</datacite:formats>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>Nieto Nieto, Justo</datacite:creatorName>
</datacite:creator>
</datacite:creators>
<datacite:contributors/>
<datacite:dates>
<datacite:date dateType="Issued">1991</datacite:date>
</datacite:dates>
<datacite:subjects>
<datacite:subject>Justo Nieto Nieto (Discursos)</datacite:subject>
<datacite:subject>Universitat Politècnica de València (UPV)</datacite:subject>
<datacite:subject>Presentación inaugural</datacite:subject>
<datacite:subject>Curso académico 1990-91</datacite:subject>
<datacite:subject>Discurso inaugural</datacite:subject>
<datacite:subject>Inaugural speech</datacite:subject>
<datacite:subject>Inaugural presentation</datacite:subject>
</datacite:subjects>
</datacite:resource>
<oaf:identifier identifierType="Handle">10251/178464</oaf:identifier>
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
<oaf:dateAccepted>1991-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>spa</oaf:language>
<oaf:hostedBy name="RiuNet" id="opendoar____::1560"/>
<oaf:collectedFrom name="RiuNet" id="opendoar____::1560"/>
</metadata>
</record>

View File

@ -81,6 +81,21 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
/**
 * Builds the XML record for the publication stored in {@code riunet.json} and passes it to the record
 * transformation check.
 */
@Test
public void testRiunet() throws IOException, TransformerException {
	final XmlRecordFactory factory = new XmlRecordFactory(
		contextMapper,
		false,
		XmlConverterJob.schemaLocation);

	final Publication publication = load("riunet.json", Publication.class);
	final JoinedEntity joined = new JoinedEntity<>(publication);

	final String xmlRecord = factory.build(joined);
	assertNotNull(xmlRecord);

	testRecordTransformation(xmlRecord);
}
@Test
public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException {
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml"));
@ -114,6 +129,8 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
@Test
void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -0,0 +1,470 @@
{
"collectedfrom": [
{
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
}
],
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
},
"lastupdatetimestamp": 1662543204165,
"id": "50|od______1560::8f7a139735f493882bb0f4abceb6e200",
"originalId": [
"50|od______1560::8f7a139735f493882bb0f4abceb6e200",
"oai:riunet.upv.es:10251/178464"
],
"pid": [
{
"value": "10251/178464",
"qualifier": {
"classid": "handle",
"classname": "Handle",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofcollection": "2019-03-27T15:15:22.22Z",
"dateoftransformation": "2022-09-01T07:51:12.657Z",
"extraInfo": [],
"oaiprovenance": null,
"processingchargeamount": null,
"processingchargecurrency": null,
"measures": null,
"author": [
{
"fullname": "Nieto Nieto, Justo",
"name": "Justo",
"surname": "Nieto Nieto",
"rank": 1,
"pid": [],
"affiliation": []
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"language": {
"classid": "spa",
"classname": "Spanish; Castilian",
"schemeid": "dnet:languages",
"schemename": "dnet:languages"
},
"country": [],
"subject": [
{
"value": "Justo Nieto Nieto (Discursos)",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Universitat Politècnica de València (UPV)",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Presentación inaugural",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Curso académico 1990-91",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Discurso inaugural",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Inaugural speech",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Inaugural presentation",
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"title": [
{
"value": "Discurso de inauguración del curso academico 1990-1991 de la Universitat Politècnica de València",
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"relevantdate": [
{
"value": "1991",
"qualifier": {
"classid": "Issued",
"classname": "Issued",
"schemeid": "dnet:dataCite_date",
"schemename": "dnet:dataCite_date"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"description": [
{
"value": "[ES] Discurso de Justo Nieto en el acto de inauguración del curso académico 1990-1991",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "[EN] Inaugural speech by Justo Nieto at the opening ceremony of the 1990-1991 academic year",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1991-01-01",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"publisher": null,
"embargoenddate": null,
"source": [],
"fulltext": [],
"format": [
{
"value": "application/pdf",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "5055377",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"contributor": [],
"resourcetype": {
"classid": "lecture",
"classname": "lecture",
"schemeid": "dnet:dataCite_resource",
"schemename": "dnet:dataCite_resource"
},
"coverage": [],
"bestaccessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"context": [],
"externalReference": [],
"instance": [
{
"license": null,
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes",
"openAccessRoute": null
},
"instancetype": {
"classid": "0038",
"classname": "Other literature type",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"hostedby": {
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
},
"url": null,
"distributionlocation": null,
"collectedfrom": {
"key": "10|opendoar____::3a20f62a0af1aa152670bab3c602feed",
"value": "RiuNet",
"dataInfo": null
},
"pid": [
{
"value": "10251/178464",
"qualifier": {
"classid": "handle",
"classname": "Handle",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"alternateIdentifier": [],
"dateofacceptance": {
"value": "1991-01-01",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"processingchargeamount": null,
"processingchargecurrency": null,
"refereed": {
"classid": "UNKNOWN",
"classname": "Unknown",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"measures": null
}
]
}