2022-06-21 23:07:06 +02:00
|
|
|
|
2022-04-07 14:06:38 +02:00
|
|
|
package eu.dnetlib.dhp.collection.plugin.file;
|
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
import java.io.BufferedInputStream;
|
|
|
|
|
2022-04-28 15:31:11 +02:00
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
import org.apache.hadoop.fs.Path;
|
2022-04-07 14:06:38 +02:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
2022-04-07 14:06:38 +02:00
|
|
|
|
|
|
|
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
|
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
|
2022-04-07 14:06:38 +02:00
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
public FileCollectorPlugin(FileSystem fileSystem) {
|
|
|
|
super(fileSystem);
|
|
|
|
}
|
2022-04-28 15:31:11 +02:00
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
@Override
|
|
|
|
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
|
2022-04-07 14:06:38 +02:00
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
log.info("filePath: {}", filePath);
|
2022-04-07 14:06:38 +02:00
|
|
|
|
2022-06-21 23:07:06 +02:00
|
|
|
try {
|
|
|
|
FileSystem fs = super.getFileSystem();
|
|
|
|
return new BufferedInputStream(fs.open(filePath));
|
|
|
|
} catch (Exception e) {
|
|
|
|
throw new CollectorException("Error reading file " + filePath, e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|