dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/archive/targz/TarGzIterable.java

49 lines
1.5 KiB
Java

package eu.dnetlib.data.collector.plugins.archive.targz;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Iterable over the records contained in a tar.gz archive stored on the local file system.
 * <p>
 * The archive location is taken from the base URL of the {@link InterfaceDescriptor} given to the constructor. Every call to
 * {@link #iterator()} opens a new {@link TarGzIterator} on the archive.
 *
 * @author Andrea
 */
public class TarGzIterable implements Iterable<String> {

	/** Unicode byte order mark that may prefix a record. */
	private static final String BOM = "\uFEFF";

	/** The tar.gz archive resolved from the interface base URL. */
	private final File tarGzFile;

	/**
	 * Resolves the archive referenced by the base URL of the given interface and verifies that it exists.
	 *
	 * @param interfaceDescriptor
	 *            the interface whose base URL points to the tar.gz archive
	 * @throws CollectorServiceException
	 *             if the base URL is malformed or the referenced file does not exist
	 */
	public TarGzIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException {
		final URL tarGzUrl;
		try {
			tarGzUrl = new URL(interfaceDescriptor.getBaseUrl());
		} catch (MalformedURLException e) {
			throw new CollectorServiceException("TarGz collector failed! ", e);
		}
		this.tarGzFile = resolveArchive(tarGzUrl);
		if (!tarGzFile.exists()) { throw new CollectorServiceException(String.format("The base URL %s, does not exist", tarGzFile.getPath())); }
	}

	/**
	 * Maps the URL to a local file. The raw URL path is preferred so that every location accepted so far keeps resolving to the same
	 * file; only when that file is missing the percent-decoded path is tried (e.g. {@code %20} read as a blank), because
	 * {@link URL#getPath()} does not decode escaped characters.
	 *
	 * @param archiveUrl
	 *            the URL of the archive
	 * @return the existing file for the raw or decoded path, or the file for the raw path when neither exists
	 */
	private static File resolveArchive(final URL archiveUrl) {
		final File rawFile = new File(archiveUrl.getPath());
		if (rawFile.exists()) { return rawFile; }
		try {
			// URI.getPath() returns the decoded path, or null for opaque URIs.
			final String decodedPath = archiveUrl.toURI().getPath();
			if (decodedPath != null) {
				final File decodedFile = new File(decodedPath);
				if (decodedFile.exists()) { return decodedFile; }
			}
		} catch (java.net.URISyntaxException ignored) {
			// The URL is not a strictly valid URI (e.g. it contains unescaped blanks): only the raw path is meaningful.
		}
		return rawFile;
	}

	/**
	 * Creates a new iterator over the records of the archive. A leading byte order mark is removed from each record before it is
	 * passed through {@link XmlCleaner#cleanAllEntities(String)}.
	 *
	 * @return an iterator over the cleaned records
	 */
	@Override
	public Iterator<String> iterator() {
		final TarGzIterator tgzIterator = new TarGzIterator(tarGzFile.getAbsolutePath());
		return Iterators.transform(tgzIterator, new Function<String, String>() {

			@Override
			public String apply(final String inputRecord) {
				final String withoutBom = inputRecord.startsWith(BOM) ? inputRecord.substring(BOM.length()) : inputRecord;
				return XmlCleaner.cleanAllEntities(withoutBom);
			}
		});
	}
}