Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
7 changed files with 285 additions and 267 deletions
Showing only changes of commit 929b145130 - Show all commits

View File

@ -7,8 +7,6 @@ import java.io.IOException;
import java.util.Optional; import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.IntWritable;
@ -21,6 +19,8 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback; import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.collection.plugin.file; package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
@ -9,17 +10,17 @@ import java.util.Spliterators;
import java.util.stream.Stream; import java.util.stream.Stream;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator; import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin { public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class); private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
@ -39,7 +40,7 @@ public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
final Path filePath = Optional final Path filePath = Optional
.ofNullable(api.getBaseUrl()) .ofNullable(api.getBaseUrl())
.map(Path::new) .map(Path::new)
.orElseThrow( () -> new CollectorException("missing baseUrl")); .orElseThrow(() -> new CollectorException("missing baseUrl"));
log.info("baseUrl: {}", filePath); log.info("baseUrl: {}", filePath);
@ -55,7 +56,9 @@ public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
// get split element // get split element
final String splitOnElement = Optional final String splitOnElement = Optional
.ofNullable(api.getParams().get(SPLIT_ON_ELEMENT)) .ofNullable(api.getParams().get(SPLIT_ON_ELEMENT))
.orElseThrow(() -> new CollectorException(String.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT))); .orElseThrow(
() -> new CollectorException(String
.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
log.info("splitOnElement: {}", splitOnElement); log.info("splitOnElement: {}", splitOnElement);
@ -63,10 +66,10 @@ public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis); Iterator<String> xmlIterator = new XMLIterator(splitOnElement, bis);
return StreamSupport.stream( return StreamSupport
.stream(
Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED), Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
false false);
);
} }
abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException; abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;

View File

@ -1,12 +1,14 @@
package eu.dnetlib.dhp.collection.plugin.file; package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.common.collection.CollectorException; import java.io.BufferedInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream; import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin { public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {

View File

@ -1,13 +1,15 @@
package eu.dnetlib.dhp.collection.plugin.file; package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.common.collection.CollectorException; import java.io.BufferedInputStream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream; import eu.dnetlib.dhp.common.collection.CollectorException;
import java.util.zip.GZIPInputStream;
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin { public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.collection.plugin.utils; package eu.dnetlib.dhp.collection.plugin.utils;
import java.io.InputStream; import java.io.InputStream;
@ -100,7 +101,9 @@ public class XMLIterator implements Iterator<String> {
try { try {
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result); final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
final StartElement start = current.asStartElement(); final StartElement start = current.asStartElement();
final StartElement newRecord = eventFactory.get().createStartElement(start.getName(), start.getAttributes(), start.getNamespaces()); final StartElement newRecord = eventFactory
.get()
.createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
// new root record // new root record
writer.add(newRecord); writer.add(newRecord);
@ -139,14 +142,18 @@ public class XMLIterator implements Iterator<String> {
XMLEvent peek = parser.peek(); XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) { if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart(); String name = peek.asStartElement().getName().getLocalPart();
if (element.equals(name)) { return peek; } if (element.equals(name)) {
return peek;
}
} }
while (parser.hasNext()) { while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent(); final XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) { if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart(); String name = event.asStartElement().getName().getLocalPart();
if (element.equals(name)) { return event; } if (element.equals(name)) {
return event;
}
} }
} }
return null; return null;

View File

@ -1,9 +1,10 @@
package eu.dnetlib.dhp.collection.plugin.file; package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.collection.ApiDescriptor; import java.io.IOException;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import java.util.HashMap;
import eu.dnetlib.dhp.common.collection.CollectorException; import java.util.stream.Stream;
import net.bytebuddy.asm.Advice;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.LocalFileSystem;
@ -13,9 +14,10 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import eu.dnetlib.dhp.collection.ApiDescriptor;
import java.util.HashMap; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import java.util.stream.Stream; import eu.dnetlib.dhp.common.collection.CollectorException;
import net.bytebuddy.asm.Advice;
public class FileCollectorPluginTest { public class FileCollectorPluginTest {
@ -57,4 +59,3 @@ public class FileCollectorPluginTest {
}); });
} }
} }

View File

@ -1,8 +1,13 @@
package eu.dnetlib.dhp.collection.plugin.file; package eu.dnetlib.dhp.collection.plugin.file;
import eu.dnetlib.dhp.collection.ApiDescriptor; import java.io.File;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import java.io.IOException;
import eu.dnetlib.dhp.common.collection.CollectorException; import java.nio.file.Files;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.LocalFileSystem;
@ -13,13 +18,9 @@ import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.File; import eu.dnetlib.dhp.collection.ApiDescriptor;
import java.io.IOException; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import java.nio.file.Files; import eu.dnetlib.dhp.common.collection.CollectorException;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
@ -36,7 +37,9 @@ public class FileGZipCollectorPluginTest {
@BeforeEach @BeforeEach
public void setUp() throws IOException { public void setUp() throws IOException {
final String gzipFile = Objects.requireNonNull(this final String gzipFile = Objects
.requireNonNull(
this
.getClass() .getClass()
.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz")) .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz"))
.getFile(); .getFile();