dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIterator.java

172 lines
5.5 KiB
Java
Raw Normal View History

2024-02-14 11:39:37 +01:00
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
2024-02-15 08:21:52 +01:00
import java.io.StringWriter;
import java.util.Iterator;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
2024-02-15 08:21:52 +01:00
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
2024-02-09 12:36:20 +01:00
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2024-02-09 12:36:20 +01:00
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
2024-02-15 08:21:52 +01:00
public class BaseCollectorIterator implements Iterator<String> {
2024-02-15 08:21:52 +01:00
private String nextElement;
2024-02-28 10:51:13 +01:00
private final BlockingQueue<String> queue = new LinkedBlockingQueue<>(100);
2024-02-09 14:33:04 +01:00
private static final Logger log = LoggerFactory.getLogger(BaseCollectorIterator.class);
2024-02-09 12:36:20 +01:00
2024-02-15 08:21:52 +01:00
private static final String END_ELEM = "__END__";
2024-02-09 12:36:20 +01:00
public BaseCollectorIterator(final FileSystem fs, final Path filePath, final AggregatorReport report) {
new Thread(() -> importHadoopFile(fs, filePath, report)).start();
2024-02-09 14:33:04 +01:00
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
2024-02-09 12:36:20 +01:00
}
2024-02-09 12:36:20 +01:00
protected BaseCollectorIterator(final String resourcePath, final AggregatorReport report) {
new Thread(() -> importTestFile(resourcePath, report)).start();
2024-02-09 14:33:04 +01:00
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
}
@Override
2024-02-09 12:36:20 +01:00
public synchronized boolean hasNext() {
2024-02-15 08:21:52 +01:00
return (this.nextElement != null) & !END_ELEM.equals(this.nextElement);
}
@Override
2024-02-15 08:21:52 +01:00
public synchronized String next() {
try {
2024-02-15 08:21:52 +01:00
return END_ELEM.equals(this.nextElement) ? null : this.nextElement;
2024-02-09 14:33:04 +01:00
} finally {
try {
this.nextElement = this.queue.take();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
}
}
2024-02-09 14:33:04 +01:00
}
2024-02-09 12:36:20 +01:00
private void importHadoopFile(final FileSystem fs, final Path filePath, final AggregatorReport report) {
log.info("I start to read the TAR stream");
2024-02-09 12:36:20 +01:00
try (InputStream origInputStream = fs.open(filePath);
2024-02-14 11:39:37 +01:00
final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
2024-02-12 15:35:36 +01:00
importTarStream(tarInputStream, report);
2024-02-09 12:36:20 +01:00
} catch (final Throwable e) {
throw new RuntimeException("Error processing BASE records", e);
}
}
2024-02-09 12:36:20 +01:00
private void importTestFile(final String resourcePath, final AggregatorReport report) {
try (final InputStream origInputStream = BaseCollectorIterator.class.getResourceAsStream(resourcePath);
2024-02-14 11:39:37 +01:00
final TarArchiveInputStream tarInputStream = new TarArchiveInputStream(origInputStream)) {
2024-02-12 15:35:36 +01:00
importTarStream(tarInputStream, report);
2024-02-09 12:36:20 +01:00
} catch (final Throwable e) {
throw new RuntimeException("Error processing BASE records", e);
}
}
2024-02-12 15:35:36 +01:00
private void importTarStream(final TarArchiveInputStream tarInputStream, final AggregatorReport report) {
2024-02-09 12:36:20 +01:00
long count = 0;
2024-02-15 08:21:52 +01:00
final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
2024-02-12 15:35:36 +01:00
try {
TarArchiveEntry entry;
while ((entry = (TarArchiveEntry) tarInputStream.getNextEntry()) != null) {
final String name = entry.getName();
if (!entry.isDirectory() && name.contains("ListRecords") && name.endsWith(".bz2")) {
2024-02-12 15:35:36 +01:00
log.info("Processing file (BZIP): " + name);
2024-02-12 15:35:36 +01:00
final byte[] bzipData = new byte[(int) entry.getSize()];
IOUtils.readFully(tarInputStream, bzipData);
2024-02-12 15:35:36 +01:00
try (InputStream bzipIs = new ByteArrayInputStream(bzipData);
2024-02-14 11:39:37 +01:00
final BufferedInputStream bzipBis = new BufferedInputStream(bzipIs);
final CompressorInputStream bzipInput = new CompressorStreamFactory()
.createCompressorInputStream(bzipBis)) {
2024-02-15 08:21:52 +01:00
final XMLEventReader reader = xmlInputFactory.createXMLEventReader(bzipInput);
2024-02-15 08:21:52 +01:00
XMLEventWriter eventWriter = null;
StringWriter xmlWriter = null;
while (reader.hasNext()) {
final XMLEvent nextEvent = reader.nextEvent();
if (nextEvent.isStartElement()) {
final StartElement startElement = nextEvent.asStartElement();
if ("record".equals(startElement.getName().getLocalPart())) {
xmlWriter = new StringWriter();
eventWriter = xmlOutputFactory.createXMLEventWriter(xmlWriter);
}
}
2024-02-09 12:36:20 +01:00
2024-02-15 08:21:52 +01:00
if (eventWriter != null) {
eventWriter.add(nextEvent);
2024-02-12 15:35:36 +01:00
}
2024-02-15 08:21:52 +01:00
if (nextEvent.isEndElement()) {
final EndElement endElement = nextEvent.asEndElement();
if ("record".equals(endElement.getName().getLocalPart())) {
eventWriter.flush();
eventWriter.close();
this.queue.put(xmlWriter.toString());
eventWriter = null;
xmlWriter = null;
count++;
}
}
}
}
}
}
2024-02-12 12:19:57 +01:00
2024-02-15 08:21:52 +01:00
this.queue.put(END_ELEM); // TO INDICATE THE END OF THE QUEUE
2024-02-12 15:35:36 +01:00
} catch (final Throwable e) {
log.error("Error processing BASE records", e);
report.put(e.getClass().getName(), e.getMessage());
throw new RuntimeException("Error processing BASE records", e);
} finally {
log.info("Total records (written in queue): " + count);
}
2024-02-09 12:36:20 +01:00
}
}