package eu.dnetlib.data.collector.plugins.archive.targz; import java.io.File; import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import com.google.common.base.Function; import com.google.common.collect.Iterators; import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; import eu.dnetlib.data.collector.rmi.CollectorServiceException; import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; /** * The Class TarGzIterable. * * @author Andrea */ public class TarGzIterable implements Iterable { /** The path to tar.gz archive. */ private File tarGzFile; public TarGzIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException { try { final String tarGzPath = interfaceDescriptor.getBaseUrl(); URL tarGzUrl = new URL(tarGzPath); this.tarGzFile = new File(tarGzUrl.getPath()); if (!tarGzFile.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", tarGzFile.getPath())); } } catch (MalformedURLException e) { throw new CollectorServiceException("TarGz collector failed! ", e); } } @Override public Iterator iterator() { final TarGzIterator tgzIterator = new TarGzIterator(tarGzFile.getAbsolutePath()); return Iterators.transform(tgzIterator, new Function() { @Override public String apply(final String inputRecord) { return XmlCleaner.cleanAllEntities(inputRecord.startsWith("\uFEFF") ? inputRecord.substring(1) : inputRecord); } }); } }