You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java

103 lines
3.3 KiB
Java

package eu.dnetlib.dhp.collection.plugin.oai;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
/**
 * {@link CollectorPlugin} that reads its settings from an {@link ApiDescriptor}, asks an
 * {@link OaiIteratorFactory} for one record iterator per requested set and exposes the
 * concatenation of those iterators as a single sequential {@link Stream} of strings.
 */
public class OaiCollectorPlugin implements CollectorPlugin {

    /** Regex accepted for a plain date value, e.g. {@code 2021-03-15}. */
    public static final String DATE_REGEX = "\\d{4}-\\d{2}-\\d{2}";

    /** Regex accepted for a UTC date-time value, e.g. {@code 2021-03-15T10:30:00Z}. */
    public static final String UTC_DATETIME_REGEX = "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z";

    // Keys looked up in the parameter map returned by ApiDescriptor#getParams().
    private static final String FORMAT_PARAM = "format";
    private static final String OAI_SET_PARAM = "set";
    private static final String OAI_FROM_DATE_PARAM = "fromDate";
    private static final String OAI_UNTIL_DATE_PARAM = "untilDate";

    /** Created on first use by {@link #getOaiIteratorFactory()}. */
    private OaiIteratorFactory oaiIteratorFactory;

    /** HTTP client settings handed to every iterator created by this plugin. */
    private HttpClientParams clientParams;

    /**
     * Creates a plugin that passes the given HTTP client settings to the iterators it builds.
     *
     * @param clientParams the HTTP client settings; stored as-is, no validation is performed here
     */
    public OaiCollectorPlugin(HttpClientParams clientParams) {
        this.clientParams = clientParams;
    }

    /**
     * Builds a stream over the records of all requested sets.
     *
     * <p>The parameters {@code format}, {@code set}, {@code fromDate} and {@code untilDate} are read
     * from {@code api.getParams()}. The {@code set} value is treated as a comma-separated list; when
     * it is missing or yields no entries, a single empty set name is used instead.
     *
     * @param api    descriptor providing the base URL and the parameter map
     * @param report report object forwarded unchanged to each created iterator
     * @return a sequential, ordered stream that concatenates the per-set iterators in the order the
     *         sets were listed
     * @throws CollectorException if the base URL or the {@code format} parameter is null or empty,
     *                            or if {@code fromDate} / {@code untilDate} is present but matches
     *                            neither {@link #DATE_REGEX} nor {@link #UTC_DATETIME_REGEX}
     */
    @Override
    public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
        throws CollectorException {
        final String baseUrl = api.getBaseUrl();
        final String mdFormat = api.getParams().get(FORMAT_PARAM);
        final String setParam = api.getParams().get(OAI_SET_PARAM);
        final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
        final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM);

        // Fail fast on invalid input before doing any other work.
        if (baseUrl == null || baseUrl.isEmpty()) {
            throw new CollectorException("Param 'baseurl' is null or empty");
        }
        // NOTE(review): the message says 'mdFormat' although the parameter key is "format";
        // the text is kept unchanged because callers may depend on it.
        if (mdFormat == null || mdFormat.isEmpty()) {
            throw new CollectorException("Param 'mdFormat' is null or empty");
        }
        validateDate(fromDate);
        validateDate(untilDate);

        final List<String> sets = parseSets(setParam);

        // One iterator per set; the factory is invoked from within the stream pipeline.
        final Iterator<Iterator<String>> iters = sets
            .stream()
            .map(
                set -> getOaiIteratorFactory()
                    .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report))
            .iterator();

        return StreamSupport
            .stream(
                Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
    }

    /**
     * Splits the comma-separated {@code set} parameter into trimmed, non-empty set names.
     *
     * @param setParam the raw parameter value, may be {@code null}
     * @return a mutable list of set names; contains the single entry {@code ""} when no set name
     *         was supplied
     */
    private static List<String> parseSets(final String setParam) {
        final List<String> sets = new ArrayList<>();
        if (setParam != null) {
            sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
        }
        if (sets.isEmpty()) {
            // If no set is defined, ALL the sets must be harvested
            sets.add("");
        }
        return sets;
    }

    /**
     * Checks that an optional date parameter has one of the two accepted shapes.
     *
     * @param date the value to check; {@code null} means "not provided" and is accepted
     * @throws CollectorException if the value matches neither {@link #DATE_REGEX} nor
     *                            {@link #UTC_DATETIME_REGEX}
     */
    private static void validateDate(final String date) throws CollectorException {
        if (date != null && !date.matches(DATE_REGEX) && !date.matches(UTC_DATETIME_REGEX)) {
            throw new CollectorException("Invalid date (YYYY-MM-DD or YYYY-MM-DDT00:00:00Z): " + date);
        }
    }

    /**
     * Returns the iterator factory, creating a default {@link OaiIteratorFactory} on first call.
     * The lazy initialisation is not synchronized.
     *
     * @return the factory used to create the per-set iterators
     */
    public OaiIteratorFactory getOaiIteratorFactory() {
        if (oaiIteratorFactory == null) {
            oaiIteratorFactory = new OaiIteratorFactory();
        }
        return oaiIteratorFactory;
    }

    /**
     * @return the HTTP client settings currently held by this plugin
     */
    public HttpClientParams getClientParams() {
        return clientParams;
    }

    /**
     * Replaces the HTTP client settings passed to iterators created afterwards.
     *
     * @param clientParams the new HTTP client settings
     */
    public void setClientParams(HttpClientParams clientParams) {
        this.clientParams = clientParams;
    }
}