forked from antonis.lempesis/dnet-hadoop
Add FileCollectorPlugin and respective test
This commit is contained in:
parent
bc1bf55507
commit
d0b84d3297
|
@ -7,6 +7,7 @@ import java.io.IOException;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
@ -115,6 +116,8 @@ public class CollectorWorker extends ReportingJob {
|
||||||
return new OaiCollectorPlugin(clientParams);
|
return new OaiCollectorPlugin(clientParams);
|
||||||
case rest_json2xml:
|
case rest_json2xml:
|
||||||
return new RestCollectorPlugin(clientParams);
|
return new RestCollectorPlugin(clientParams);
|
||||||
|
case file:
|
||||||
|
return new FileCollectorPlugin();
|
||||||
case fileGZip:
|
case fileGZip:
|
||||||
return new FileGZipCollectorPlugin();
|
return new FileGZipCollectorPlugin();
|
||||||
case other:
|
case other:
|
||||||
|
|
|
@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
public interface CollectorPlugin {
|
public interface CollectorPlugin {
|
||||||
|
|
||||||
enum NAME {
|
enum NAME {
|
||||||
oai, other, rest_json2xml, fileGZip;
|
oai, other, rest_json2xml, file, fileGZip;
|
||||||
|
|
||||||
public enum OTHER_NAME {
|
public enum OTHER_NAME {
|
||||||
mdstore_mongodb_dump, mdstore_mongodb
|
mdstore_mongodb_dump, mdstore_mongodb
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.file;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
|
||||||
|
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorException {
|
||||||
|
|
||||||
|
log.info("baseUrl: {}", baseUrl);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return new BufferedInputStream(new FileInputStream(baseUrl));
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new CollectorException("Error reading file " + baseUrl, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -22,8 +22,7 @@ public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
|
||||||
GZIPInputStream stream = new GZIPInputStream(new FileInputStream(baseUrl));
|
GZIPInputStream stream = new GZIPInputStream(new FileInputStream(baseUrl));
|
||||||
return new BufferedInputStream(stream);
|
return new BufferedInputStream(stream);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
e.printStackTrace();
|
throw new CollectorException("Error reading file " + baseUrl, e);
|
||||||
throw new CollectorException(e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.file;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
public class FileCollectorPluginTest {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
|
||||||
|
|
||||||
|
private final ApiDescriptor api = new ApiDescriptor();
|
||||||
|
private FileCollectorPlugin plugin;
|
||||||
|
|
||||||
|
private static final String SPLIT_ON_ELEMENT = "repository";
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() {
|
||||||
|
|
||||||
|
final String gzipFile = this
|
||||||
|
.getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml")
|
||||||
|
.getFile();
|
||||||
|
|
||||||
|
api.setBaseUrl(gzipFile);
|
||||||
|
|
||||||
|
HashMap<String, String> params = new HashMap<>();
|
||||||
|
params.put("splitOnElement", SPLIT_ON_ELEMENT);
|
||||||
|
|
||||||
|
api.setParams(params);
|
||||||
|
|
||||||
|
plugin = new FileCollectorPlugin();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void test() throws CollectorException {
|
||||||
|
|
||||||
|
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
|
||||||
|
|
||||||
|
stream.limit(10).forEach(s -> {
|
||||||
|
Assertions.assertTrue(s.length() > 0);
|
||||||
|
log.info(s);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -26,7 +26,7 @@ public class FileGZipCollectorPluginTest {
|
||||||
|
|
||||||
final String gzipFile = this
|
final String gzipFile = this
|
||||||
.getClass()
|
.getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/collection/plugin/file/gzip/opendoar.xml.gz")
|
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz")
|
||||||
.getFile();
|
.getFile();
|
||||||
|
|
||||||
api.setBaseUrl(gzipFile);
|
api.setBaseUrl(gzipFile);
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue