dnet-hadoop/dhp-common/src/main/java/eu/dnetlib/dhp/common/java/io/DataStore.java

173 lines
5.3 KiB
Java

package eu.dnetlib.dhp.common.java.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
/**
* Utility for accessing to Avro-based data stores stored in file system
* @author Mateusz Kobos
*
*/
public final class DataStore {
private final static String singleDataStoreFileName = "content.avro";
private static final int FILE_NO_PADDING_LENGTH = 7;
private DataStore(){}
/**
* Create a new data store directory with single file and return writer that allows
* adding new records
* @param path path to a directory to be created
* @param schema schema of the records to be stored in the file
* @return
* @throws IOException
*/
public static <T> DataFileWriter<T> create(
FileSystemPath path, Schema schema) throws IOException{
return create(path, schema, singleDataStoreFileName);
}
/**
* Create a new data store directory and return writer that allows
* adding new records
* @param path path to a directory to be created
* @param schema schema of the records to be stored in the file
* @param dataStoreFileName datastore file name
* @return
* @throws IOException
*/
public static <T> DataFileWriter<T> create(
FileSystemPath path, Schema schema, String dataStoreFileName) throws IOException{
path.getFileSystem().mkdirs(path.getPath());
FileSystemPath outFile = new FileSystemPath(
path, dataStoreFileName);
return DataStore.createSingleFile(outFile, schema);
}
/**
* Get reader for reading records from given data store
*
* Here the schema used for reading the data store is set to be the same
* as the one that was used to write it.
*
* @see getReader(FileSystemPath path, Schema readerSchema) for details.
*
*/
public static <T> CloseableIterator<T> getReader(FileSystemPath path)
throws IOException{
return getReader(path, null);
}
/**
* Get reader for reading records from given data store
* @param path path to a directory corresponding to data store
* @param readerSchema the schema onto which the read data store will
* be projected
*/
public static <T> CloseableIterator<T> getReader(
FileSystemPath path, Schema readerSchema) throws IOException{
return new AvroDataStoreReader<T>(path, readerSchema);
}
/**
* Read data store entries and insert them into a list. A utility function.
*
* Here the schema used for reading the data store is set to be the same
* as the one that was used to write it.
*/
public static <T> List<T> read(FileSystemPath path)
throws IOException{
return read(path, null);
}
/**
* Read data store entries and insert them into a list. A utility function.
*
* @param readerSchema the schema onto which the read data store will
* be projected
*/
public static <T> List<T> read(FileSystemPath path, Schema readerSchema)
throws IOException{
CloseableIterator<T> iterator = getReader(path, readerSchema);
List<T> elems = new ArrayList<T>();
while(iterator.hasNext()){
elems.add(iterator.next());
}
return elems;
}
/**
* Create a data store from a list of entries. A utility function.
* The schema is implicitly
* taken from the first element from the {@code elements} list.
* @param elements list of elements to write. At least one element has
* to be present, because it is used to retrieve schema of the
* structures passed in the list.
*/
public static <T extends GenericContainer> void create(
List<T> elements, FileSystemPath path) throws IOException{
if(elements.isEmpty()){
throw new IllegalArgumentException(
"The list of elements has to be non-empty");
}
Schema schema = elements.get(0).getSchema();
create(elements, path, schema);
}
/**
* Create a data store from a list of entries with schema given explicitly.
* A utility function.
*/
public static <T extends GenericContainer> void create(
List<T> elements, FileSystemPath path, Schema schema)
throws IOException{
DataFileWriter<T> writer = create(path, schema);
try{
for(T i: elements){
writer.append(i);
}
} finally {
if(writer != null){
writer.close();
}
}
}
/**
* Create a single Avro file. This method shouldn't be normally used to
* create data stores since it creates only a single Avro file,
* while a data store consists of a directory containing one or more files.
*/
public static <T> DataFileWriter<T> createSingleFile(
FileSystemPath path, Schema schema) throws IOException{
DatumWriter<T> datumWriter = new SpecificDatumWriter<T>();
DataFileWriter<T> writer = new DataFileWriter<T>(datumWriter);
writer.create(schema, path.getFileSystem().create(path.getPath()));
return writer;
}
/**
* Generates filename for given file number.
* @param fileNo file sequence number
*/
public static String generateFileName(int fileNo) {
StringBuffer strBuff = new StringBuffer(String.valueOf(fileNo));
while(strBuff.length()<FILE_NO_PADDING_LENGTH) {
strBuff.insert(0, '0');
}
strBuff.append(".avro");
return strBuff.toString();
}
}