Collector Plugin for Thanados and support for collections in collections #3

Merged
enrico.ottonello merged 14 commits from thanados into master 2022-05-06 11:06:39 +02:00
4 changed files with 39 additions and 9 deletions
Showing only changes of commit fa71f9a7e1 - Show all commits

View File

@ -36,6 +36,11 @@
<artifactId>dnet-msro-service</artifactId> <artifactId>dnet-msro-service</artifactId>
<version>[7.0.0-SAXONHE-SOLR772-SNAPSHOT, 8.0.0-SAXONHE)</version> <version>[7.0.0-SAXONHE-SOLR772-SNAPSHOT, 8.0.0-SAXONHE)</version>
</dependency> </dependency>
<!-- JSON parser used by ThanadosIterator (net.minidev.json.*) to read the identifier-list response. -->
<!-- NOTE(review): check whether 2.4.8 is affected by published json-smart advisories (e.g. CVE-2023-1370) and confirm before release. -->
<dependency>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
<version>2.4.8</version>
</dependency>
</dependencies> </dependencies>
<properties> <properties>

View File

@ -4,6 +4,10 @@ import eu.dnetlib.data.collector.plugins.httplist.HttpListIterator;
import eu.dnetlib.rmi.data.CollectorServiceException; import eu.dnetlib.rmi.data.CollectorServiceException;
import eu.dnetlib.rmi.data.InterfaceDescriptor; import eu.dnetlib.rmi.data.InterfaceDescriptor;
import eu.dnetlib.rmi.data.plugin.AbstractCollectorPlugin; import eu.dnetlib.rmi.data.plugin.AbstractCollectorPlugin;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import java.util.Date;
public class ThanadosCollectorPlugin extends AbstractCollectorPlugin { public class ThanadosCollectorPlugin extends AbstractCollectorPlugin {
@ -12,6 +16,6 @@ public class ThanadosCollectorPlugin extends AbstractCollectorPlugin {
final String baseUrl = interfaceDescriptor.getBaseUrl(); final String baseUrl = interfaceDescriptor.getBaseUrl();
final String listAddress = interfaceDescriptor.getParams().get("listUrl"); final String listAddress = interfaceDescriptor.getParams().get("listUrl");
return () -> new ThanadosIterator(baseUrl, listAddress); return () -> new ThanadosIterator(baseUrl, listAddress, fromDate);
} }
} }

View File

@ -5,6 +5,7 @@ import com.google.gson.reflect.TypeToken;
import eu.dnetlib.data.collector.ThreadSafeIterator; import eu.dnetlib.data.collector.ThreadSafeIterator;
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException; import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpStatus; import org.apache.http.HttpStatus;
@ -18,6 +19,11 @@ import java.io.IOException;
import java.lang.reflect.Type; import java.lang.reflect.Type;
import java.util.List; import java.util.List;
import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
public class ThanadosIterator extends ThreadSafeIterator { public class ThanadosIterator extends ThreadSafeIterator {
private static final Log log = LogFactory.getLog(ThanadosIterator.class); private static final Log log = LogFactory.getLog(ThanadosIterator.class);
@ -27,12 +33,15 @@ public class ThanadosIterator extends ThreadSafeIterator {
private List<String> identifiers; private List<String> identifiers;
private int counter = 0; private int counter = 0;
private String urlFormat = "%s/%s?format=xml"; private String urlFormat = "%s/%s?format=xml";
private String fromDate;
public ThanadosIterator(final String baseUrl, final String listAddress) {
public ThanadosIterator(final String baseUrl, final String listAddress, final String fromDate) {
try { try {
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
this.identifiers = downloadIdentifierList(listAddress); this.identifiers = downloadIdentifierList(listAddress, fromDate);
this.counter = 0; this.counter = 0;
this.fromDate = fromDate;
} catch (Exception e) { } catch (Exception e) {
throw new CollectorServiceRuntimeException("Error creating iterator", e); throw new CollectorServiceRuntimeException("Error creating iterator", e);
} }
@ -59,12 +68,17 @@ public class ThanadosIterator extends ThreadSafeIterator {
} }
} }
protected List<String> downloadIdentifierList(final String listUrl) { protected List<String> downloadIdentifierList(final String listUrl, final String fromDate) throws ParseException {
String urlToListItems = listUrl;
String list = download(listUrl); if(StringUtils.isNotBlank(fromDate))
urlToListItems = listUrl+"/"+fromDate;
log.info("Getting list of items from "+urlToListItems);
String response = download(urlToListItems);
JSONObject map = (JSONObject)(new JSONParser(JSONParser.MODE_PERMISSIVE).parse(response));
final String sites = map.getAsString("\"sites\"");
Gson converter = new Gson(); Gson converter = new Gson();
Type type = new TypeToken<List<String>>(){}.getType(); Type type = new TypeToken<List<String>>(){}.getType();
return converter.fromJson(list, type ); return converter.fromJson(sites, type );
} }

View File

@ -11,7 +11,7 @@ public class ThanadosIteratorTest {
@Test @Test
public void testDownloadList(){ public void testDownloadList(){
it = new ThanadosIterator("", sitelist); it = new ThanadosIterator("", sitelist, "");
it.getIdentifiers().stream().forEach(id -> System.out.println(id)); it.getIdentifiers().stream().forEach(id -> System.out.println(id));
System.out.println(it.getIdentifiers().size()); System.out.println(it.getIdentifiers().size());
} }
@ -19,7 +19,7 @@ public class ThanadosIteratorTest {
@Test @Test
public void testDownload(){ public void testDownload(){
int count = 0; int count = 0;
it = new ThanadosIterator("https://thanados.openatlas.eu/api/0.3/subunits", sitelist); it = new ThanadosIterator("https://thanados.openatlas.eu/api/0.3/subunits", sitelist, "");
while(it.hasNext()){ while(it.hasNext()){
it.next(); it.next();
count++; count++;
@ -29,4 +29,11 @@ public class ThanadosIteratorTest {
} }
/**
 * Builds an iterator with an empty base URL, the shared {@code sitelist} address and the
 * from-date {@code 2022-03-12}, then prints every returned identifier followed by the
 * total number of identifiers. The test makes no assertions; it only fails if
 * construction or listing throws.
 */
@Test
public void testIncremental() {
    it = new ThanadosIterator("", sitelist, "2022-03-12");
    // Dump each identifier on its own line, then the overall count.
    it.getIdentifiers().forEach(System.out::println);
    System.out.println(it.getIdentifiers().size());
}
} }