Merge pull request 'Update Gtr2 plugin' (#518) from beta-ukripublication into beta

Reviewed-on: #518
This commit is contained in:
Claudio Atzori 2024-12-20 10:11:34 +01:00
commit a6da42a2e8
2 changed files with 43 additions and 77 deletions

View File

@ -1,7 +1,6 @@
package eu.dnetlib.dhp.collection.plugin.gtr2;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
@ -11,12 +10,17 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.slf4j.Logger;
@ -28,8 +32,6 @@ import eu.dnetlib.dhp.common.collection.HttpConnector2;
public class Gtr2PublicationsIterator implements Iterator<String> {
public static final int PAGE_SIZE = 20;
private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
private final HttpConnector2 connector;
@ -42,7 +44,6 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
private int endPage;
private boolean incremental = false;
private LocalDate fromDate;
private final Map<String, String> cache = new HashMap<>();
private final Queue<String> queue = new LinkedList<>();
@ -88,7 +89,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
private void prepareNextElement() {
while ((this.currPage <= this.endPage) && this.queue.isEmpty()) {
log.debug("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
log.info("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
this.queue.addAll(fetchPage(this.currPage++));
}
this.nextElement = this.queue.poll();
@ -97,18 +98,17 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
private List<String> fetchPage(final int pageNumber) {
final List<String> res = new ArrayList<>();
try {
final Document doc = loadURL(cleanURL(this.baseUrl + "/outcomes/publications?p=" + pageNumber), 0);
if (this.endPage == Integer.MAX_VALUE) {
this.endPage = NumberUtils.toInt(doc.valueOf("/*/@*[local-name() = 'totalPages']"));
}
try {
final Document doc = loadURL(this.baseUrl + "/publication?page=" + pageNumber, 0);
for (final Object po : doc.selectNodes("//*[local-name() = 'publication']")) {
final Element mainEntity = (Element) ((Element) po).detach();
if (filterIncremental(mainEntity)) {
res.add(expandMainEntity(mainEntity));
final String publicationOverview = mainEntity.attributeValue("url");
res.add(loadURL(publicationOverview, -1).asXML());
} else {
log.debug("Skipped entity");
}
@ -122,34 +122,6 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
return res;
}
/**
 * Appends to {@code newRoot} one element for every {@code link} descendant of {@code master}
 * whose {@code rel} attribute equals {@code relType} and whose {@code href} attribute is not blank.
 *
 * Elements are memoised in {@code this.cache} as serialized XML under the key
 * {@code relType + "#" + href}: on a miss the href is downloaded and passed through
 * {@code mapper}; on a hit the stored XML is parsed again and its root element is appended.
 *
 * @param master  element whose {@code link} descendants are scanned
 * @param relType relation type a link must carry to be followed
 * @param newRoot element receiving the linked entities
 * @param mapper  converts a downloaded document into the element to append
 * @throws RuntimeException if a cached entry cannot be parsed back into XML
 */
private void addLinkedEntities(final Element master, final String relType, final Element newRoot,
    final Function<Document, Element> mapper) {
    for (final Object linkNode : master.selectNodes(".//*[local-name()='link']")) {
        final Element link = (Element) linkNode;
        final String rel = link.valueOf("@*[local-name()='rel']");
        final String href = link.valueOf("@*[local-name()='href']");

        // Only links of the requested relation type with a usable target are followed.
        if (!relType.equals(rel) || StringUtils.isBlank(href)) {
            continue;
        }

        final String cacheKey = relType + "#" + href;

        if (!this.cache.containsKey(cacheKey)) {
            // Cache miss: download, map, append, and remember the serialized form.
            final Document linkedDoc = loadURL(cleanURL(href), 0);
            final Element mapped = mapper.apply(linkedDoc);
            newRoot.add(mapped);
            this.cache.put(cacheKey, mapped.asXML());
            continue;
        }

        // Cache hit: rebuild a fresh element from the stored XML.
        try {
            log.debug(" * from cache (" + relType + "): " + href);
            final Document cachedDoc = DocumentHelper.parseText(this.cache.get(cacheKey));
            newRoot.add(cachedDoc.getRootElement());
        } catch (final DocumentException e) {
            log.error("Error retrieving cache element: " + cacheKey, e);
            throw new RuntimeException("Error retrieving cache element: " + cacheKey, e);
        }
    }
}
private boolean filterIncremental(final Element e) {
if (!this.incremental || isAfter(e.valueOf("@*[local-name() = 'created']"), this.fromDate)
|| isAfter(e.valueOf("@*[local-name() = 'updated']"), this.fromDate)) {
@ -158,40 +130,34 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
return false;
}
/**
 * Wraps {@code mainEntity} in a new {@code doc} root element, appends the entities reachable
 * through its {@code PROJECT} links (converted by {@code asProjectElement}), and serializes
 * the resulting document.
 *
 * @param mainEntity the element to wrap and expand
 * @return the XML serialization of the document rooted at the new {@code doc} element
 */
private String expandMainEntity(final Element mainEntity) {
    final Element wrapper = DocumentHelper.createElement("doc");
    wrapper.add(mainEntity);

    addLinkedEntities(mainEntity, "PROJECT", wrapper, linked -> asProjectElement(linked));

    final Document expanded = DocumentHelper.createDocument(wrapper);
    return expanded.asXML();
}
/**
 * Builds a {@code project} element with three children taken from {@code doc}:
 * {@code id} (the {@code id} attribute of the root element), {@code code} (the
 * {@code identifier} element whose {@code type} attribute is {@code RCUK}) and
 * {@code title} (the string value of the {@code title} XPath match).
 *
 * @param doc the document describing a project
 * @return a new, parentless {@code project} element
 */
private Element asProjectElement(final Document doc) {
    final String projectId = doc.valueOf("/*/@*[local-name()='id']");
    final String projectCode = doc
        .valueOf("//*[local-name()='identifier' and @*[local-name()='type'] = 'RCUK']");
    final String projectTitle = doc.valueOf("//*[local-name()='title']");

    // Children are appended in the order id, code, title.
    final Element project = DocumentHelper.createElement("project");
    final Element idNode = project.addElement("id");
    idNode.setText(projectId);
    final Element codeNode = project.addElement("code");
    codeNode.setText(projectCode);
    final Element titleNode = project.addElement("title");
    titleNode.setText(projectTitle);
    return project;
}
/**
 * Normalizes a URL in two steps: every occurrence of {@code gtr.gtr} is collapsed to
 * {@code gtr}, then a leading {@code http://} is replaced by {@code https://}.
 *
 * @param url the URL to normalize; must not be {@code null}
 * @return the normalized URL, or the input unchanged when neither rule applies
 */
private static String cleanURL(final String url) {
    final String insecurePrefix = "http://";

    final String collapsed = url.contains("gtr.gtr") ? url.replace("gtr.gtr", "gtr") : url;

    // Only the scheme at the very start is upgraded; later occurrences are left alone.
    return collapsed.startsWith(insecurePrefix)
        ? "https://" + collapsed.substring(insecurePrefix.length())
        : collapsed;
}
private Document loadURL(final String cleanUrl, final int attempt) {
try {
log.debug(" * Downloading Url: {}", cleanUrl);
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8);
return DocumentHelper.parseText(new String(bytes));
try (final CloseableHttpClient client = HttpClients.createDefault()) {
final HttpGet req = new HttpGet(cleanUrl);
req.setHeader(HttpHeaders.ACCEPT, "application/xml");
try (final CloseableHttpResponse response = client.execute(req)) {
if (endPage == Integer.MAX_VALUE)
for (final Header header : response.getAllHeaders()) {
log.debug("HEADER: " + header.getName() + " = " + header.getValue());
if ("Link-Pages".equals(header.getName())) {
if (Integer.parseInt(header.getValue()) < endPage)
endPage = Integer.parseInt(header.getValue());
}
}
final String content = IOUtils.toString(response.getEntity().getContent());
return DocumentHelper.parseText(content);
}
} catch (final Throwable e) {
if (attempt == -1)
try {
return DocumentHelper.parseText("<empty></empty>");
} catch (Throwable t) {
throw new RuntimeException();
}
log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
if (attempt >= MAX_ATTEMPTS) {
throw new RuntimeException("Error downloading url: " + cleanUrl, e);

View File

@ -13,7 +13,7 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
class Gtr2PublicationsIteratorTest {
private static final String baseURL = "https://gtr.ukri.org/gtr/api";
private static final String baseURL = "https://gtr.ukri.org/api";
private static final HttpClientParams clientParams = new HttpClientParams();
@ -34,7 +34,7 @@ class Gtr2PublicationsIteratorTest {
@Test
@Disabled
public void testPaging() throws Exception {
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "2", "2", clientParams);
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "2", "3", clientParams);
while (iterator.hasNext()) {
Thread.sleep(300);
@ -47,9 +47,9 @@ class Gtr2PublicationsIteratorTest {
@Test
@Disabled
public void testOnePage() throws Exception {
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "12", "12", clientParams);
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "379", "380", clientParams);
final int count = iterateAndCount(iterator);
assertEquals(20, count);
assertEquals(50, count);
}
@Test