forked from D-Net/dnet-hadoop
79 lines
2.8 KiB
Java
79 lines
2.8 KiB
Java
package eu.dnetlib.dhp.collection.plugin.oai;
|
|
|
|
import com.google.common.base.Splitter;
|
|
import com.google.common.collect.Iterators;
|
|
import com.google.common.collect.Lists;
|
|
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
|
|
import java.util.ArrayList;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Spliterator;
|
|
import java.util.Spliterators;
|
|
import java.util.stream.Stream;
|
|
import java.util.stream.StreamSupport;
|
|
|
|
public class OaiCollectorPlugin implements CollectorPlugin {
|
|
|
|
private static final String FORMAT_PARAM = "format";
|
|
private static final String OAI_SET_PARAM = "set";
|
|
private static final Object OAI_FROM_DATE_PARAM = "fromDate";
|
|
private static final Object OAI_UNTIL_DATE_PARAM = "untilDate";
|
|
|
|
private OaiIteratorFactory oaiIteratorFactory;
|
|
|
|
@Override
|
|
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
|
|
final String baseUrl = api.getBaseUrl();
|
|
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
|
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
|
final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
|
|
final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM);
|
|
|
|
final List<String> sets = new ArrayList<>();
|
|
if (setParam != null) {
|
|
sets.addAll(
|
|
Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
|
|
}
|
|
if (sets.isEmpty()) {
|
|
// If no set is defined, ALL the sets must be harvested
|
|
sets.add("");
|
|
}
|
|
|
|
if (baseUrl == null || baseUrl.isEmpty()) {
|
|
throw new DnetCollectorException("Param 'baseurl' is null or empty");
|
|
}
|
|
|
|
if (mdFormat == null || mdFormat.isEmpty()) {
|
|
throw new DnetCollectorException("Param 'mdFormat' is null or empty");
|
|
}
|
|
|
|
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
|
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
|
}
|
|
|
|
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
|
throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
|
}
|
|
|
|
final Iterator<Iterator<String>> iters =
|
|
sets.stream()
|
|
.map(
|
|
set ->
|
|
getOaiIteratorFactory()
|
|
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
|
.iterator();
|
|
|
|
return StreamSupport.stream(
|
|
Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
|
|
}
|
|
|
|
public OaiIteratorFactory getOaiIteratorFactory() {
|
|
if (oaiIteratorFactory == null) {
|
|
oaiIteratorFactory = new OaiIteratorFactory();
|
|
}
|
|
return oaiIteratorFactory;
|
|
}
|
|
}
|