7096-fileGZip-collector-plugin #211

Merged
claudio.atzori merged 9 commits from 7096-fileGZip-collector-plugin into beta 2022-06-16 15:34:45 +02:00
14 changed files with 1526 additions and 5 deletions

View File

@ -7,6 +7,8 @@ import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob {
return new OaiCollectorPlugin(clientParams);
case rest_json2xml:
return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin(fileSystem);
case fileGZip:
return new FileGZipCollectorPlugin(fileSystem);
case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type"))

View File

@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin {
enum NAME {
oai, other, rest_json2xml;
oai, other, rest_json2xml, file, fileGZip;
public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
public static final String SPLIT_ON_ELEMENT = "splitOnElement";
private final FileSystem fileSystem;
public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
this.fileSystem = fileSystem;
}
@Override
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
// get path to file
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow( () -> new CollectorException("missing baseUrl"));
log.info("baseUrl: {}", filePath);
// check that path to file exists
try {
if (!fileSystem.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
} catch (IOException e) {
throw new CollectorException(e);
}
// get split element
final String splitOnElement = Optional
.ofNullable(api.getParams().get(SPLIT_ON_ELEMENT))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
log.info("splitOnElement: {}", splitOnElement);
final BufferedInputStream bis = getBufferedInputStream(filePath);
Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis);
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
false
);
}
abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;
public FileSystem getFileSystem() {
return fileSystem;
}
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
public FileCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {

We are assuming that the files to be processed by these plugins are already stored on HDFS, right? I'm not sure that accessing them in this way it would work. I think the access should be made through the org.apache.hadoop.fs.FileSystem object, similarily to how it is done on this other class: https://code-repo.d4science.org/D-Net/dnet-hadoop/src/branch/master/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java#L84

We are assuming that the files to be processed by these plugins are already stored on HDFS, right? I'm not sure that accessing them in this way it would work. I think the access should be made through the `org.apache.hadoop.fs.FileSystem` object, similarily to how it is done on this other class: https://code-repo.d4science.org/D-Net/dnet-hadoop/src/branch/master/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java#L84

Thank you for pointing this, I have updated the code to use org.apache.hadoop.fs.FileSystem for that purpose.

Thank you for pointing this, I have updated the code to use `org.apache.hadoop.fs.FileSystem` for that purpose.
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
return new BufferedInputStream(fs.open(filePath));
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.util.zip.GZIPInputStream;
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);
public FileGZipCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
schatz marked this conversation as resolved
Review

We are assuming that the files to be processed by these plugins are already stored on HDFS, right? I'm not sure that accessing them in this way it would work. I think the access should be made through the org.apache.hadoop.fs.FileSystem object, similarily to how it is done on this other class: https://code-repo.d4science.org/D-Net/dnet-hadoop/src/branch/master/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java#L84

We are assuming that the files to be processed by these plugins are already stored on HDFS, right? I'm not sure that accessing them in this way it would work. I think the access should be made through the org.apache.hadoop.fs.FileSystem object, similarily to how it is done on this other class: https://code-repo.d4science.org/D-Net/dnet-hadoop/src/branch/master/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java#L84
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
GZIPInputStream stream = new GZIPInputStream(fs.open(filePath));
return new BufferedInputStream(stream);
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}

View File

@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.XmlCleaner;
import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpConnector2;

View File

@ -30,7 +30,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.JsonUtils;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

View File

@ -0,0 +1,170 @@
package eu.dnetlib.dhp.collection.plugin.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Iterator;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class XMLIterator implements Iterator<String> {
private static final Log log = LogFactory.getLog(XMLIterator.class);
private ThreadLocal<XMLInputFactory> inputFactory = new ThreadLocal<XMLInputFactory>() {
@Override
protected XMLInputFactory initialValue() {
return XMLInputFactory.newInstance();
}
};
private ThreadLocal<XMLOutputFactory> outputFactory = new ThreadLocal<XMLOutputFactory>() {
@Override
protected XMLOutputFactory initialValue() {
return XMLOutputFactory.newInstance();
}
};
private ThreadLocal<XMLEventFactory> eventFactory = new ThreadLocal<XMLEventFactory>() {
@Override
protected XMLEventFactory initialValue() {
return XMLEventFactory.newInstance();
}
};
public static final String UTF_8 = "UTF-8";
final XMLEventReader parser;
private XMLEvent current = null;
private String element;
private InputStream inputStream;
public XMLIterator(final String element, final InputStream inputStream) {
super();
this.element = element;
this.inputStream = inputStream;
this.parser = getParser();
try {
this.current = findElement(parser);
} catch (XMLStreamException e) {
log.warn("cannot init parser position. No element found: " + element);
current = null;
}
}
@Override
public boolean hasNext() {
return current != null;
}
@Override
public String next() {
String result = null;
try {
result = copy(parser);
current = findElement(parser);
return result;
} catch (XMLStreamException e) {
throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@SuppressWarnings("finally")
private String copy(final XMLEventReader parser) throws XMLStreamException {
final StringWriter result = new StringWriter();
try {
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
final StartElement start = current.asStartElement();
final StartElement newRecord = eventFactory.get().createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
// new root record
writer.add(newRecord);
// copy the rest as it is
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking.
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
writer.add(event);
break;
}
writer.add(event);
}
writer.close();
} finally {
return result.toString();
}
}
/**
* Looks for the next occurrence of the splitter element.
*
* @param parser
* @return
* @throws XMLStreamException
*/
private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {
/*
* if (current != null && element.equals(current.asStartElement().getName().getLocalPart())) { return current; }
*/
XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart();
if (element.equals(name)) { return peek; }
}
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart();
if (element.equals(name)) { return event; }
}
}
return null;
}
private XMLEventReader getParser() {
try {
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
} catch (XMLStreamException e) {
throw new RuntimeException(e);
}
}
private Reader sanitize(final InputStream in) {
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
return new InputStreamReader(in, charsetDecoder);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection;
package eu.dnetlib.dhp.collection.plugin.utils;
import java.util.HashMap;
import java.util.HashSet;

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import net.bytebuddy.asm.Advice;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.stream.Stream;
public class FileCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
private final ApiDescriptor api = new ApiDescriptor();
private FileCollectorPlugin plugin;
private static final String SPLIT_ON_ELEMENT = "repository";
@BeforeEach
public void setUp() throws IOException {
final String gzipFile = this
.getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml")
.getFile();
api.setBaseUrl(gzipFile);
HashMap<String, String> params = new HashMap<>();
params.put("splitOnElement", SPLIT_ON_ELEMENT);
api.setParams(params);
FileSystem fs = FileSystem.get(new Configuration());
plugin = new FileCollectorPlugin(fs);
}
@Test
void test() throws CollectorException {
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
stream.limit(10).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
log.info(s);
});
}
}

View File

@ -0,0 +1,65 @@
package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
public class FileGZipCollectorPluginTest {
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
private final ApiDescriptor api = new ApiDescriptor();
private FileGZipCollectorPlugin plugin;
private static final String SPLIT_ON_ELEMENT = "repository";
@BeforeEach
public void setUp() throws IOException {
final String gzipFile = Objects.requireNonNull(this
.getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz"))
.getFile();
api.setBaseUrl(gzipFile);
HashMap<String, String> params = new HashMap<>();
params.put("splitOnElement", SPLIT_ON_ELEMENT);
api.setParams(params);
FileSystem fs = FileSystem.get(new Configuration());
plugin = new FileGZipCollectorPlugin(fs);
}
@Test
void test() throws CollectorException {
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
stream.limit(10).forEach(s -> {
Assertions.assertTrue(s.length() > 0);
log.info(s);
});
}
}