Add FileCollectorPlugin and respective test

This commit is contained in:
Serafeim Chatzopoulos 2022-04-07 15:06:38 +03:00
parent bc1bf55507
commit d0b84d3297
8 changed files with 1163 additions and 4 deletions

View File

@ -7,6 +7,7 @@ import java.io.IOException;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@ -115,6 +116,8 @@ public class CollectorWorker extends ReportingJob {
return new OaiCollectorPlugin(clientParams); return new OaiCollectorPlugin(clientParams);
case rest_json2xml: case rest_json2xml:
return new RestCollectorPlugin(clientParams); return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin();
case fileGZip: case fileGZip:
return new FileGZipCollectorPlugin(); return new FileGZipCollectorPlugin();
case other: case other:

View File

@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin { public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other, rest_json2xml, fileGZip; oai, other, rest_json2xml, file, fileGZip;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
@Override
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorException {
log.info("baseUrl: {}", baseUrl);
try {
return new BufferedInputStream(new FileInputStream(baseUrl));
} catch (Exception e) {
throw new CollectorException("Error reading file " + baseUrl, e);
}
}
}

View File

@ -22,8 +22,7 @@ public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
GZIPInputStream stream = new GZIPInputStream(new FileInputStream(baseUrl)); GZIPInputStream stream = new GZIPInputStream(new FileInputStream(baseUrl));
return new BufferedInputStream(stream); return new BufferedInputStream(stream);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); throw new CollectorException("Error reading file " + baseUrl, e);
throw new CollectorException(e);
} }
} }
} }

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.stream.Stream;
/**
 * Exercises {@link FileCollectorPlugin} against the plain XML fixture
 * {@code /eu/dnetlib/dhp/collection/plugin/file/opendoar.xml} taken from the test classpath,
 * with the {@code splitOnElement} parameter set to {@code repository}.
 */
public class FileCollectorPluginTest {

	// Bound to this class so that log output is attributed to the right test.
	private static final Logger log = LoggerFactory.getLogger(FileCollectorPluginTest.class);

	private final ApiDescriptor api = new ApiDescriptor();

	private FileCollectorPlugin plugin;

	private static final String SPLIT_ON_ELEMENT = "repository";

	/**
	 * Points the API descriptor at the XML fixture, sets the split element and
	 * creates a fresh plugin instance before every test.
	 */
	@BeforeEach
	public void setUp() {

		// The fixture is an uncompressed XML file, resolved from the test classpath.
		final String xmlFile = this
			.getClass()
			.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml")
			.getFile();

		api.setBaseUrl(xmlFile);

		HashMap<String, String> params = new HashMap<>();
		params.put("splitOnElement", SPLIT_ON_ELEMENT);

		api.setParams(params);

		plugin = new FileCollectorPlugin();
	}

	/**
	 * Collects from the fixture and checks that each of the first ten records is non-empty.
	 *
	 * @throws CollectorException if the plugin fails to collect
	 */
	@Test
	void test() throws CollectorException {

		final Stream<String> stream = plugin.collect(api, new AggregatorReport());

		// NOTE(review): if the stream is empty no assertion runs and the test still passes;
		// consider also asserting that at least one record was produced.
		stream.limit(10).forEach(s -> {
			Assertions.assertTrue(s.length() > 0);
			log.info(s);
		});
	}
}

View File

@ -26,7 +26,7 @@ public class FileGZipCollectorPluginTest {
final String gzipFile = this final String gzipFile = this
.getClass() .getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/gzip/opendoar.xml.gz") .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz")
.getFile(); .getFile();
api.setBaseUrl(gzipFile); api.setBaseUrl(gzipFile);