Used a StAX parser

This commit is contained in:
Michele Artini 2024-02-15 08:21:52 +01:00
parent b42e2b4d61
commit 8d85c1e97e
4 changed files with 84 additions and 63 deletions

View File

@ -26,6 +26,8 @@ import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute; import org.dom4j.Attribute;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.Node; import org.dom4j.Node;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -46,7 +48,8 @@ public class BaseAnalyzerJob {
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final String jsonConfiguration = IOUtils final String jsonConfiguration = IOUtils
.toString(BaseAnalyzerJob.class .toString(
BaseAnalyzerJob.class
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json")); .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -99,7 +102,7 @@ public class BaseAnalyzerJob {
final Map<String, AtomicLong> types, final Map<String, AtomicLong> types,
final Map<String, AtomicLong> collections, final Map<String, AtomicLong> collections,
final Map<String, AtomicLong> totals, final Map<String, AtomicLong> totals,
final AggregatorReport report) throws JsonProcessingException, IOException { final AggregatorReport report) throws JsonProcessingException, IOException, DocumentException {
final AtomicLong recordsCounter = new AtomicLong(0); final AtomicLong recordsCounter = new AtomicLong(0);
@ -108,7 +111,7 @@ public class BaseAnalyzerJob {
final BaseCollectorIterator iteraror = new BaseCollectorIterator(fs, new Path(inputPath), report); final BaseCollectorIterator iteraror = new BaseCollectorIterator(fs, new Path(inputPath), report);
while (iteraror.hasNext()) { while (iteraror.hasNext()) {
final Document record = iteraror.next(); final Document record = DocumentHelper.parseText(iteraror.next());
final long i = recordsCounter.incrementAndGet(); final long i = recordsCounter.incrementAndGet();
if ((i % 10000) == 0) { if ((i % 10000) == 0) {
@ -162,9 +165,12 @@ public class BaseAnalyzerJob {
private static void saveReport(final FileSystem fs, final String outputPath, final Map<String, AtomicLong> fields) private static void saveReport(final FileSystem fs, final String outputPath, final Map<String, AtomicLong> fields)
throws JsonProcessingException, IOException { throws JsonProcessingException, IOException {
try (final SequenceFile.Writer writer = SequenceFile try (final SequenceFile.Writer writer = SequenceFile
.createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer .createWriter(
.keyClass(IntWritable.class), SequenceFile.Writer fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
.valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { .keyClass(IntWritable.class),
SequenceFile.Writer
.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final Text key = new Text(); final Text key = new Text();
final Text value = new Text(); final Text value = new Text();

View File

@ -4,11 +4,19 @@ package eu.dnetlib.dhp.collection.plugin.base;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.StringWriter;
import java.util.Iterator; import java.util.Iterator;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.CompressorInputStream; import org.apache.commons.compress.compressors.CompressorInputStream;
@ -16,22 +24,21 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
public class BaseCollectorIterator implements Iterator<Document> { public class BaseCollectorIterator implements Iterator<String> {
private Object nextElement; private String nextElement;
private final BlockingQueue<Object> queue = new LinkedBlockingQueue<>(20); private final BlockingQueue<String> queue = new LinkedBlockingQueue<>(20);
private static final Logger log = LoggerFactory.getLogger(BaseCollectorIterator.class); private static final Logger log = LoggerFactory.getLogger(BaseCollectorIterator.class);
private static final String END_ELEM = "__END__";
public BaseCollectorIterator(final FileSystem fs, final Path filePath, final AggregatorReport report) { public BaseCollectorIterator(final FileSystem fs, final Path filePath, final AggregatorReport report) {
new Thread(() -> importHadoopFile(fs, filePath, report)).start(); new Thread(() -> importHadoopFile(fs, filePath, report)).start();
try { try {
@ -52,13 +59,13 @@ public class BaseCollectorIterator implements Iterator<Document> {
@Override @Override
public synchronized boolean hasNext() { public synchronized boolean hasNext() {
return (this.nextElement != null) && (this.nextElement instanceof Document); return (this.nextElement != null) & !END_ELEM.equals(this.nextElement);
} }
@Override @Override
public synchronized Document next() { public synchronized String next() {
try { try {
return this.nextElement instanceof Document ? (Document) this.nextElement : null; return END_ELEM.equals(this.nextElement) ? null : this.nextElement;
} finally { } finally {
try { try {
this.nextElement = this.queue.take(); this.nextElement = this.queue.take();
@ -92,6 +99,9 @@ public class BaseCollectorIterator implements Iterator<Document> {
private void importTarStream(final TarArchiveInputStream tarInputStream, final AggregatorReport report) { private void importTarStream(final TarArchiveInputStream tarInputStream, final AggregatorReport report) {
long count = 0; long count = 0;
final XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
try { try {
TarArchiveEntry entry; TarArchiveEntry entry;
while ((entry = (TarArchiveEntry) tarInputStream.getNextEntry()) != null) { while ((entry = (TarArchiveEntry) tarInputStream.getNextEntry()) != null) {
@ -109,24 +119,46 @@ public class BaseCollectorIterator implements Iterator<Document> {
final CompressorInputStream bzipInput = new CompressorStreamFactory() final CompressorInputStream bzipInput = new CompressorStreamFactory()
.createCompressorInputStream(bzipBis)) { .createCompressorInputStream(bzipBis)) {
final String xml = IOUtils.toString(new InputStreamReader(bzipInput)); final XMLEventReader reader = xmlInputFactory.createXMLEventReader(bzipInput);
final Document doc = DocumentHelper.parseText(xml); XMLEventWriter eventWriter = null;
StringWriter xmlWriter = null;
for (final Object o : doc while (reader.hasNext()) {
.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { final XMLEvent nextEvent = reader.nextEvent();
if (o instanceof Element) {
final Element newRoot = (Element) ((Element) o).detach(); if (nextEvent.isStartElement()) {
final Document newDoc = DocumentHelper.createDocument(newRoot); final StartElement startElement = nextEvent.asStartElement();
this.queue.put(newDoc); if ("record".equals(startElement.getName().getLocalPart())) {
xmlWriter = new StringWriter();
eventWriter = xmlOutputFactory.createXMLEventWriter(xmlWriter);
}
}
if (eventWriter != null) {
eventWriter.add(nextEvent);
}
if (nextEvent.isEndElement()) {
final EndElement endElement = nextEvent.asEndElement();
if ("record".equals(endElement.getName().getLocalPart())) {
eventWriter.flush();
eventWriter.close();
this.queue.put(xmlWriter.toString());
eventWriter = null;
xmlWriter = null;
count++; count++;
} }
} }
}
} }
} }
} }
this.queue.put("__END__"); // I ADD A NOT ELEMENT OBJECT TO INDICATE THE END OF THE QUEUE this.queue.put(END_ELEM); // TO INDICATE THE END OF THE QUEUE
} catch (final Throwable e) { } catch (final Throwable e) {
log.error("Error processing BASE records", e); log.error("Error processing BASE records", e);
report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());

View File

@ -1,8 +1,6 @@
package eu.dnetlib.dhp.collection.plugin.base; package eu.dnetlib.dhp.collection.plugin.base;
import java.io.IOException;
import java.io.StringWriter;
import java.util.Iterator; import java.util.Iterator;
import java.util.Optional; import java.util.Optional;
import java.util.Spliterator; import java.util.Spliterator;
@ -12,9 +10,6 @@ import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.dom4j.Document;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -55,15 +50,14 @@ public class BaseCollectorPlugin implements CollectorPlugin {
throw new CollectorException(e); throw new CollectorException(e);
} }
final Iterator<Document> iterator = new BaseCollectorIterator(this.fs, filePath, report); final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
final Spliterator<Document> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED); final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
return StreamSupport return StreamSupport
.stream(spliterator, false) .stream(spliterator, false)
.filter(doc -> filterXml(doc, report)) .filter(doc -> filterXml(doc, report));
.map(doc -> xmlToString(doc, report));
} }
private boolean filterXml(final Document doc, final AggregatorReport report) { private boolean filterXml(final String xml, final AggregatorReport report) {
// TODO Auto-generated method stub // TODO Auto-generated method stub
// HERE THE FILTERS ACCORDING TO THE DOCUMENTATION // HERE THE FILTERS ACCORDING TO THE DOCUMENTATION
@ -71,15 +65,4 @@ public class BaseCollectorPlugin implements CollectorPlugin {
return true; return true;
} }
private String xmlToString(final Document doc, final AggregatorReport report) {
try (final StringWriter sw = new StringWriter()) {
final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint());
writer.write(doc);
return writer.toString();
} catch (final IOException e) {
report.put(e.getClass().getName(), e.getMessage());
throw new RuntimeException("Error indenting XML record", e);
}
}
} }

View File

@ -13,16 +13,15 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Attribute; import org.dom4j.Attribute;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.Node; import org.dom4j.Node;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
@Disabled
public class BaseCollectorIteratorTest { public class BaseCollectorIteratorTest {
@Test @Test
@ -37,7 +36,8 @@ public class BaseCollectorIteratorTest {
final Set<String> types = new HashSet<>(); final Set<String> types = new HashSet<>();
while (iterator.hasNext()) { while (iterator.hasNext()) {
final Document record = iterator.next();
final Document record = DocumentHelper.parseText(iterator.next());
count++; count++;