
156 lines
4.3 KiB
Raw Normal View History

2018-01-16 14:21:13 +01:00
package eu.dnetlib.dhp.common.java.io;
import java.io.IOException;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;
/**
 * An abstraction over data store format which allows
 * iterating over records stored in the data store.
 * It handles the standard case of a data store that is a directory containing
 * many Avro files (but it can also read records from a single file).
 *
 * @author mhorst
 * @author Mateusz Kobos
 */
class AvroDataStoreReader<T> implements CloseableIterator<T> {
private DataFileReader<T> currentReader;
private RemoteIterator<LocatedFileStatus> fileIterator;
private final FileSystemPath path;
private final Schema readerSchema;
* Ignore file starting with underscore. Such files are also ignored by
* default by map-reduce jobs.
private final Pattern whitelistPattern = Pattern.compile("^(?!_).*");
/**
 * Creates a reader without an explicit reader schema.
 * Here the schema used for reading the data store is set to be the same
 * as the one that was used to write it.
 *
 * @param path path to the data store to be read
 * @throws IOException propagated from the two-argument constructor
 */
public AvroDataStoreReader(final FileSystemPath path)
        throws IOException {
    // null reader schema == "use the writer schema"
    this(path, null);
}
/**
 * Creates a reader, lists the entries under {@code path} and opens the
 * first file to read from.
 *
 * @param path path to the data store to be read
 * @param readerSchema the schema onto which the read data store will
 *     be projected; may be {@code null}
 * @throws IOException if listing the data store or opening a file fails
 */
public AvroDataStoreReader(final FileSystemPath path, Schema readerSchema)
        throws IOException {
    this.path = path;
    this.readerSchema = readerSchema;
    // second argument 'false': do not descend into sub-directories
    fileIterator = path.getFileSystem().listFiles(path.getPath(), false);
    // position on the first usable file so hasNext() can answer immediately
    currentReader = getNextNonemptyReader();
}
private DataFileReader<T> getNextNonemptyReader() throws IOException {
while (fileIterator != null && fileIterator.hasNext()) {
LocatedFileStatus currentFileStatus = fileIterator.next();
if (isValidFile(currentFileStatus)) {
FileSystemPath currPath = new FileSystemPath(
path.getFileSystem(), currentFileStatus.getPath());
DataFileReader<T> reader =
getSingleFileReader(currPath, readerSchema);
/** Check if the file contains at least one record */
return reader;
} else {
/** fallback */
return null;
* Get a reader for the specified Avro file. A utility function.
* @param path path to the existing file
* @param readerSchema optional reader schema. If you want to use the
* default option of using writer schema as the reader schema, pass the
* {@code null} value.
* @throws IOException
private static <T> DataFileReader<T> getSingleFileReader(
FileSystemPath path, Schema readerSchema) throws IOException{
SpecificDatumReader<T> datumReader = new SpecificDatumReader<T>();
if(readerSchema != null){
long len = path.getFileSystem().getFileStatus(path.getPath()).getLen();
FSDataInputStream inputStream = path.getFileSystem().open(path.getPath());
return new DataFileReader<T>(
new AvroFSInput(inputStream, len), datumReader);
} catch (IOException ex){
throw new IOException("Problem with file \""+
path.getPath().toString()+"\": "+ex.getMessage(), ex);
* Checks whether file is valid
* @param fileStatus
* @return true when valid, false otherwise
private boolean isValidFile(LocatedFileStatus fileStatus) {
if (fileStatus.isFile()) {
return whitelistPattern.matcher(
/** fallback */
return false;
public boolean hasNext() {
return currentReader != null;
public T next(){
if(currentReader == null){
throw new NoSuchElementException();
T obj = currentReader.next();
currentReader = getNextNonemptyReader();
} catch(IOException ex){
throw new RuntimeException(ex);
return obj;
public void remove() {
throw new UnsupportedOperationException();
public void close() throws IOException {
if(currentReader != null){
currentReader = null;
fileIterator = null;