forked from D-Net/dnet-hadoop
102 lines
3.1 KiB
Java
102 lines
3.1 KiB
Java
|
|
package eu.dnetlib.dhp.collection.plugin.oai;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Spliterator;
|
|
import java.util.Spliterators;
|
|
import java.util.stream.Stream;
|
|
import java.util.stream.StreamSupport;
|
|
|
|
import org.jetbrains.annotations.NotNull;
|
|
|
|
import com.google.common.base.Splitter;
|
|
import com.google.common.collect.Iterators;
|
|
import com.google.common.collect.Lists;
|
|
|
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
|
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
|
|
|
public class OaiCollectorPlugin implements CollectorPlugin {
|
|
|
|
private static final String FORMAT_PARAM = "format";
|
|
private static final String OAI_SET_PARAM = "set";
|
|
private static final Object OAI_FROM_DATE_PARAM = "fromDate";
|
|
private static final Object OAI_UNTIL_DATE_PARAM = "untilDate";
|
|
|
|
private OaiIteratorFactory oaiIteratorFactory;
|
|
|
|
private final CollectorPluginErrorLogList errorLogList = new CollectorPluginErrorLogList();
|
|
|
|
@Override
|
|
public Stream<String> collect(final ApiDescriptor api) throws CollectorException {
|
|
try {
|
|
return doCollect(api);
|
|
} catch (CollectorException e) {
|
|
errorLogList.add(e.getMessage());
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
private Stream<String> doCollect(ApiDescriptor api) throws CollectorException {
|
|
final String baseUrl = api.getBaseUrl();
|
|
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
|
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
|
final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
|
|
final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM);
|
|
|
|
final List<String> sets = new ArrayList<>();
|
|
if (setParam != null) {
|
|
sets
|
|
.addAll(
|
|
Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
|
|
}
|
|
if (sets.isEmpty()) {
|
|
// If no set is defined, ALL the sets must be harvested
|
|
sets.add("");
|
|
}
|
|
|
|
if (baseUrl == null || baseUrl.isEmpty()) {
|
|
throw new CollectorException("Param 'baseurl' is null or empty");
|
|
}
|
|
|
|
if (mdFormat == null || mdFormat.isEmpty()) {
|
|
throw new CollectorException("Param 'mdFormat' is null or empty");
|
|
}
|
|
|
|
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
|
throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
|
|
}
|
|
|
|
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
|
throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate);
|
|
}
|
|
|
|
final Iterator<Iterator<String>> iters = sets
|
|
.stream()
|
|
.map(
|
|
set -> getOaiIteratorFactory()
|
|
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, errorLogList))
|
|
.iterator();
|
|
|
|
return StreamSupport
|
|
.stream(
|
|
Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
|
|
}
|
|
|
|
public OaiIteratorFactory getOaiIteratorFactory() {
|
|
if (oaiIteratorFactory == null) {
|
|
oaiIteratorFactory = new OaiIteratorFactory();
|
|
}
|
|
return oaiIteratorFactory;
|
|
}
|
|
|
|
@Override
|
|
public CollectorPluginErrorLogList getCollectionErrors() {
|
|
return errorLogList;
|
|
}
|
|
}
|