forked from antonis.lempesis/dnet-hadoop
better logging, WIP: collectorWorker error reporting
This commit is contained in:
parent
53884d12c2
commit
0e8a4f9f1a
|
@ -0,0 +1,21 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.application;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.Properties;
|
||||||
|
|
||||||
|
/**
 * Static helpers shared by the command-line applications of this project.
 */
public class ApplicationUtils {

    /** Name of the system property holding the path of the action output properties file. */
    private static final String OOZIE_OUTPUT_PROPERTIES = "oozie.action.output.properties";

    /**
     * Writes a single {@code paramName=value} entry, in {@link Properties} format, to the file
     * whose path is read from the {@code oozie.action.output.properties} system property.
     *
     * <p>The target file is opened in truncate mode, so every call replaces whatever the file
     * contained before; entries written by earlier calls are not retained.
     * NOTE(review): presumably the property is set by the Oozie launcher for actions that
     * capture their output - confirm against the workflow definitions.
     *
     * @param paramName the property key to write
     * @param value the property value to write
     * @throws IllegalStateException if the {@code oozie.action.output.properties} system
     *         property is not set or is empty
     * @throws Exception if the file cannot be opened or written
     */
    public static void populateOOZIEEnv(final String paramName, String value) throws Exception {
        final String outputPath = System.getProperty(OOZIE_OUTPUT_PROPERTIES);
        if (outputPath == null || outputPath.isEmpty()) {
            // Fail with an explicit message instead of the bare NullPointerException
            // that new File(null) would raise.
            throw new IllegalStateException(
                "system property '" + OOZIE_OUTPUT_PROPERTIES + "' is not set");
        }
        File file = new File(outputPath);
        Properties props = new Properties();

        props.setProperty(paramName, value);
        // try-with-resources guarantees the stream is closed even when store() fails.
        try (OutputStream os = new FileOutputStream(file)) {
            props.store(os, "");
        }
    }

}
|
|
@ -1,6 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.aggregation.mdstore;
|
package eu.dnetlib.dhp.aggregation.mdstore;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*;
|
||||||
|
import static eu.dnetlib.dhp.application.ApplicationUtils.*;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
@ -16,11 +19,8 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.collection.worker.CollectorWorker;
|
|
||||||
import eu.dnetlib.dhp.common.rest.DNetRestClient;
|
import eu.dnetlib.dhp.common.rest.DNetRestClient;
|
||||||
|
|
||||||
public class MDStoreActionNode {
|
public class MDStoreActionNode {
|
||||||
|
@ -28,11 +28,8 @@ public class MDStoreActionNode {
|
||||||
|
|
||||||
enum MDAction {
|
enum MDAction {
|
||||||
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
|
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final ObjectMapper mapper = new ObjectMapper();
|
|
||||||
|
|
||||||
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
|
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
|
||||||
|
|
||||||
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
|
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
|
||||||
|
@ -48,13 +45,13 @@ public class MDStoreActionNode {
|
||||||
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
CollectorWorker.class
|
MDStoreActionNode.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
|
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
|
||||||
argumentParser.parseArgument(args);
|
argumentParser.parseArgument(args);
|
||||||
|
|
||||||
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
|
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
|
||||||
log.info("Curren action is {}", action);
|
log.info("Current action is {}", action);
|
||||||
|
|
||||||
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
|
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
|
||||||
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
|
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
|
||||||
|
@ -67,7 +64,7 @@ public class MDStoreActionNode {
|
||||||
}
|
}
|
||||||
final MDStoreVersion currentVersion = DNetRestClient
|
final MDStoreVersion currentVersion = DNetRestClient
|
||||||
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||||
populateOOZIEEnv(MDSTOREVERSIONPARAM, mapper.writeValueAsString(currentVersion));
|
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case COMMIT: {
|
case COMMIT: {
|
||||||
|
@ -77,7 +74,7 @@ public class MDStoreActionNode {
|
||||||
throw new IllegalArgumentException("missing or empty argument namenode");
|
throw new IllegalArgumentException("missing or empty argument namenode");
|
||||||
}
|
}
|
||||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||||
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
|
@ -110,7 +107,7 @@ public class MDStoreActionNode {
|
||||||
}
|
}
|
||||||
case ROLLBACK: {
|
case ROLLBACK: {
|
||||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||||
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
|
@ -127,12 +124,12 @@ public class MDStoreActionNode {
|
||||||
}
|
}
|
||||||
final MDStoreVersion currentVersion = DNetRestClient
|
final MDStoreVersion currentVersion = DNetRestClient
|
||||||
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||||
populateOOZIEEnv(MDSTOREREADLOCKPARAM, mapper.writeValueAsString(currentVersion));
|
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case READ_UNLOCK: {
|
case READ_UNLOCK: {
|
||||||
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
|
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
|
||||||
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||||
|
|
||||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
|
@ -148,13 +145,4 @@ public class MDStoreActionNode {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void populateOOZIEEnv(final String paramName, String value) throws Exception {
|
|
||||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
|
||||||
Properties props = new Properties();
|
|
||||||
|
|
||||||
props.setProperty(paramName, value);
|
|
||||||
OutputStream os = new FileOutputStream(file);
|
|
||||||
props.store(os, "");
|
|
||||||
os.close();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,9 +4,12 @@ package eu.dnetlib.dhp.collection.plugin;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
||||||
|
|
||||||
public interface CollectorPlugin {
|
public interface CollectorPlugin {
|
||||||
|
|
||||||
Stream<String> collect(ApiDescriptor api) throws CollectorException;
|
Stream<String> collect(ApiDescriptor api) throws CollectorException;
|
||||||
|
|
||||||
|
CollectorPluginErrorLogList getCollectionErrors();
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,12 +9,15 @@ import java.util.Spliterators;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterators;
|
import com.google.common.collect.Iterators;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
||||||
|
|
||||||
public class OaiCollectorPlugin implements CollectorPlugin {
|
public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
|
@ -26,8 +29,19 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
|
|
||||||
private OaiIteratorFactory oaiIteratorFactory;
|
private OaiIteratorFactory oaiIteratorFactory;
|
||||||
|
|
||||||
|
private final CollectorPluginErrorLogList errorLogList = new CollectorPluginErrorLogList();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Stream<String> collect(final ApiDescriptor api) throws CollectorException {
|
public Stream<String> collect(final ApiDescriptor api) throws CollectorException {
|
||||||
|
try {
|
||||||
|
return doCollect(api);
|
||||||
|
} catch (CollectorException e) {
|
||||||
|
errorLogList.add(e.getMessage());
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Stream<String> doCollect(ApiDescriptor api) throws CollectorException {
|
||||||
final String baseUrl = api.getBaseUrl();
|
final String baseUrl = api.getBaseUrl();
|
||||||
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
||||||
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
||||||
|
@ -65,7 +79,7 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
.stream()
|
.stream()
|
||||||
.map(
|
.map(
|
||||||
set -> getOaiIteratorFactory()
|
set -> getOaiIteratorFactory()
|
||||||
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
.newIterator(baseUrl, mdFormat, set, fromDate, untilDate, errorLogList))
|
||||||
.iterator();
|
.iterator();
|
||||||
|
|
||||||
return StreamSupport
|
return StreamSupport
|
||||||
|
@ -79,4 +93,9 @@ public class OaiCollectorPlugin implements CollectorPlugin {
|
||||||
}
|
}
|
||||||
return oaiIteratorFactory;
|
return oaiIteratorFactory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public CollectorPluginErrorLogList getCollectionErrors() {
|
||||||
|
return errorLogList;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,15 +15,17 @@ import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
import org.dom4j.io.SAXReader;
|
import org.dom4j.io.SAXReader;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
|
import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
|
||||||
|
|
||||||
public class OaiIterator implements Iterator<String> {
|
public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on
|
private static final Logger log = LoggerFactory.getLogger(OaiIterator.class);
|
||||||
// 11/24/08 5:02 PM
|
|
||||||
|
|
||||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||||
private final SAXReader reader = new SAXReader();
|
private final SAXReader reader = new SAXReader();
|
||||||
|
@ -36,6 +38,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
private String token;
|
private String token;
|
||||||
private boolean started;
|
private boolean started;
|
||||||
private final HttpConnector httpConnector;
|
private final HttpConnector httpConnector;
|
||||||
|
private CollectorPluginErrorLogList errorLogList;
|
||||||
|
|
||||||
public OaiIterator(
|
public OaiIterator(
|
||||||
final String baseUrl,
|
final String baseUrl,
|
||||||
|
@ -43,7 +46,8 @@ public class OaiIterator implements Iterator<String> {
|
||||||
final String set,
|
final String set,
|
||||||
final String fromDate,
|
final String fromDate,
|
||||||
final String untilDate,
|
final String untilDate,
|
||||||
final HttpConnector httpConnector) {
|
final HttpConnector httpConnector,
|
||||||
|
final CollectorPluginErrorLogList errorLogList) {
|
||||||
this.baseUrl = baseUrl;
|
this.baseUrl = baseUrl;
|
||||||
this.mdFormat = mdFormat;
|
this.mdFormat = mdFormat;
|
||||||
this.set = set;
|
this.set = set;
|
||||||
|
@ -51,6 +55,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
this.untilDate = untilDate;
|
this.untilDate = untilDate;
|
||||||
this.started = false;
|
this.started = false;
|
||||||
this.httpConnector = httpConnector;
|
this.httpConnector = httpConnector;
|
||||||
|
this.errorLogList = errorLogList;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyStarted() {
|
private void verifyStarted() {
|
||||||
|
@ -139,7 +144,7 @@ public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
private String downloadPage(final String url) throws CollectorException {
|
private String downloadPage(final String url) throws CollectorException {
|
||||||
|
|
||||||
final String xml = httpConnector.getInputSource(url);
|
final String xml = httpConnector.getInputSource(url, errorLogList);
|
||||||
Document doc;
|
Document doc;
|
||||||
try {
|
try {
|
||||||
doc = reader.read(new StringReader(xml));
|
doc = reader.read(new StringReader(xml));
|
||||||
|
@ -174,4 +179,8 @@ public class OaiIterator implements Iterator<String> {
|
||||||
|
|
||||||
return doc.valueOf("//*[local-name()='resumptionToken']");
|
return doc.valueOf("//*[local-name()='resumptionToken']");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public CollectorPluginErrorLogList getErrorLogList() {
|
||||||
|
return errorLogList;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.oai;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
|
||||||
|
|
||||||
public class OaiIteratorFactory {
|
public class OaiIteratorFactory {
|
||||||
|
@ -14,8 +15,9 @@ public class OaiIteratorFactory {
|
||||||
final String mdFormat,
|
final String mdFormat,
|
||||||
final String set,
|
final String set,
|
||||||
final String fromDate,
|
final String fromDate,
|
||||||
final String untilDate) {
|
final String untilDate,
|
||||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector());
|
final CollectorPluginErrorLogList errorLogList) {
|
||||||
|
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(), errorLogList);
|
||||||
}
|
}
|
||||||
|
|
||||||
private HttpConnector getHttpConnector() {
|
private HttpConnector getHttpConnector() {
|
||||||
|
|
|
@ -15,6 +15,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
||||||
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
||||||
|
|
||||||
|
@ -22,69 +23,65 @@ public class CollectorWorker {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
|
private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);
|
||||||
|
|
||||||
private final CollectorPluginFactory collectorPluginFactory;
|
|
||||||
|
|
||||||
private final ApiDescriptor api;
|
private final ApiDescriptor api;
|
||||||
|
|
||||||
private final String hdfsuri;
|
private final String hdfsuri;
|
||||||
|
|
||||||
private final String hdfsPath;
|
private final String hdfsPath;
|
||||||
|
|
||||||
|
private CollectorPlugin plugin;
|
||||||
|
|
||||||
public CollectorWorker(
|
public CollectorWorker(
|
||||||
final CollectorPluginFactory collectorPluginFactory,
|
|
||||||
final ApiDescriptor api,
|
final ApiDescriptor api,
|
||||||
final String hdfsuri,
|
final String hdfsuri,
|
||||||
final String hdfsPath) {
|
final String hdfsPath) throws CollectorException {
|
||||||
this.collectorPluginFactory = collectorPluginFactory;
|
|
||||||
this.api = api;
|
this.api = api;
|
||||||
this.hdfsuri = hdfsuri;
|
this.hdfsuri = hdfsuri;
|
||||||
this.hdfsPath = hdfsPath;
|
this.hdfsPath = hdfsPath;
|
||||||
|
this.plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void collect() throws CollectorException {
|
public CollectorPluginErrorLogList collect() throws IOException, CollectorException {
|
||||||
try {
|
|
||||||
final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());
|
|
||||||
|
|
||||||
// ====== Init HDFS File System Object
|
// ====== Init HDFS File System Object
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
// Set FileSystem URI
|
// Set FileSystem URI
|
||||||
conf.set("fs.defaultFS", hdfsuri);
|
conf.set("fs.defaultFS", hdfsuri);
|
||||||
// Because of Maven
|
// Because of Maven
|
||||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||||
|
|
||||||
System.setProperty("hadoop.home.dir", "/");
|
System.setProperty("hadoop.home.dir", "/");
|
||||||
// Get the filesystem - HDFS
|
// Get the filesystem - HDFS
|
||||||
FileSystem.get(URI.create(hdfsuri), conf);
|
|
||||||
Path hdfswritepath = new Path(hdfsPath);
|
|
||||||
|
|
||||||
log.info("Created path " + hdfswritepath.toString());
|
FileSystem.get(URI.create(hdfsuri), conf);
|
||||||
|
Path hdfswritepath = new Path(hdfsPath);
|
||||||
|
|
||||||
final AtomicInteger counter = new AtomicInteger(0);
|
log.info("Created path " + hdfswritepath.toString());
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
|
||||||
.createWriter(
|
final AtomicInteger counter = new AtomicInteger(0);
|
||||||
conf,
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
SequenceFile.Writer.file(hdfswritepath),
|
.createWriter(
|
||||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
conf,
|
||||||
SequenceFile.Writer.valueClass(Text.class))) {
|
SequenceFile.Writer.file(hdfswritepath),
|
||||||
final IntWritable key = new IntWritable(counter.get());
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
final Text value = new Text();
|
SequenceFile.Writer.valueClass(Text.class))) {
|
||||||
plugin
|
final IntWritable key = new IntWritable(counter.get());
|
||||||
.collect(api)
|
final Text value = new Text();
|
||||||
.forEach(
|
plugin
|
||||||
content -> {
|
.collect(api)
|
||||||
key.set(counter.getAndIncrement());
|
.forEach(
|
||||||
value.set(content);
|
content -> {
|
||||||
try {
|
key.set(counter.getAndIncrement());
|
||||||
writer.append(key, value);
|
value.set(content);
|
||||||
} catch (IOException e) {
|
try {
|
||||||
throw new RuntimeException(e);
|
writer.append(key, value);
|
||||||
}
|
} catch (IOException e) {
|
||||||
});
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
} catch (Throwable e) {
|
});
|
||||||
throw new CollectorException("Error on collecting ", e);
|
} finally {
|
||||||
|
return plugin.getCollectionErrors();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
package eu.dnetlib.dhp.collection.worker;
|
package eu.dnetlib.dhp.collection.worker;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*;
|
import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*;
|
||||||
|
import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*;
|
||||||
|
import static eu.dnetlib.dhp.application.ApplicationUtils.*;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -10,7 +12,9 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||||
|
import eu.dnetlib.dhp.aggregation.common.AggregationUtility;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
|
||||||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
||||||
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
||||||
|
|
||||||
|
@ -25,8 +29,6 @@ public class CollectorWorkerApplication {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
|
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
|
||||||
|
|
||||||
private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param args
|
* @param args
|
||||||
*/
|
*/
|
||||||
|
@ -49,14 +51,16 @@ public class CollectorWorkerApplication {
|
||||||
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
|
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
|
||||||
log.info("mdStoreVersion is {}", mdStoreVersion);
|
log.info("mdStoreVersion is {}", mdStoreVersion);
|
||||||
|
|
||||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
||||||
|
final String hdfsPath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
||||||
|
log.info("hdfs path is {}", hdfsPath);
|
||||||
|
|
||||||
final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class);
|
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
|
||||||
|
|
||||||
final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class);
|
final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath);
|
||||||
final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri,
|
CollectorPluginErrorLogList errors = worker.collect();
|
||||||
currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME);
|
|
||||||
worker.collect();
|
populateOOZIEEnv("collectorErrors", errors.toString());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ import eu.dnetlib.dhp.collection.worker.CollectorException;
|
||||||
|
|
||||||
public class CollectorPluginFactory {
|
public class CollectorPluginFactory {
|
||||||
|
|
||||||
public CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException {
|
public static CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException {
|
||||||
if (protocol == null)
|
if (protocol == null)
|
||||||
throw new CollectorException("protocol cannot be null");
|
throw new CollectorException("protocol cannot be null");
|
||||||
switch (protocol.toLowerCase().trim()) {
|
switch (protocol.toLowerCase().trim()) {
|
||||||
|
|
|
@ -16,14 +16,14 @@ import javax.net.ssl.X509TrustManager;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.math.NumberUtils;
|
import org.apache.commons.lang.math.NumberUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.slf4j.Logger;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
import eu.dnetlib.dhp.collection.worker.CollectorException;
|
||||||
|
|
||||||
public class HttpConnector {
|
public class HttpConnector {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
private static final Logger log = LoggerFactory.getLogger(HttpConnector.class);
|
||||||
|
|
||||||
private int maxNumberOfRetry = 6;
|
private int maxNumberOfRetry = 6;
|
||||||
private int defaultDelay = 120; // seconds
|
private int defaultDelay = 120; // seconds
|
||||||
|
@ -45,7 +45,20 @@ public class HttpConnector {
|
||||||
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
||||||
*/
|
*/
|
||||||
public String getInputSource(final String requestUrl) throws CollectorException {
|
public String getInputSource(final String requestUrl) throws CollectorException {
|
||||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
return attemptDownloadAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given the URL returns the content via HTTP GET
|
||||||
|
*
|
||||||
|
* @param requestUrl the URL
|
||||||
|
* @param errorLogList the list of errors
|
||||||
|
* @return the content of the downloaded resource
|
||||||
|
* @throws CollectorException when retrying more than maxNumberOfRetry times
|
||||||
|
*/
|
||||||
|
public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList)
|
||||||
|
throws CollectorException {
|
||||||
|
return attemptDownloadAsString(requestUrl, 1, errorLogList);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -59,18 +72,20 @@ public class HttpConnector {
|
||||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||||
}
|
}
|
||||||
|
|
||||||
private String attemptDownlaodAsString(
|
private String attemptDownloadAsString(
|
||||||
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||||
throws CollectorException {
|
throws CollectorException {
|
||||||
|
|
||||||
|
log.info("requesting URL [{}]", requestUrl);
|
||||||
try {
|
try {
|
||||||
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||||
try {
|
try {
|
||||||
return IOUtils.toString(s);
|
return IOUtils.toString(s);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
log.error("error while retrieving from http-connection occurred: {}", requestUrl, e);
|
||||||
Thread.sleep(defaultDelay * 1000);
|
Thread.sleep(defaultDelay * 1000);
|
||||||
errorList.add(e.getMessage());
|
errorList.add(e.getMessage());
|
||||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
return attemptDownloadAsString(requestUrl, retryNumber + 1, errorList);
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeQuietly(s);
|
IOUtils.closeQuietly(s);
|
||||||
}
|
}
|
||||||
|
@ -87,7 +102,7 @@ public class HttpConnector {
|
||||||
throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList);
|
throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
log.debug("requesting URL [{}], try {}", requestUrl, retryNumber);
|
||||||
try {
|
try {
|
||||||
InputStream input = null;
|
InputStream input = null;
|
||||||
|
|
||||||
|
@ -103,7 +118,7 @@ public class HttpConnector {
|
||||||
|
|
||||||
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
||||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
log.warn("waiting and repeating request after {} sec.", retryAfter);
|
||||||
Thread.sleep(retryAfter * 1000);
|
Thread.sleep(retryAfter * 1000);
|
||||||
errorList.add("503 Service Unavailable");
|
errorList.add("503 Service Unavailable");
|
||||||
urlConn.disconnect();
|
urlConn.disconnect();
|
||||||
|
@ -111,7 +126,7 @@ public class HttpConnector {
|
||||||
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
|
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM
|
||||||
|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
|
|| urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
|
||||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||||
log.debug("The requested url has been moved to " + newUrl);
|
log.debug("The requested url has been moved to {}", newUrl);
|
||||||
errorList
|
errorList
|
||||||
.add(
|
.add(
|
||||||
String
|
String
|
||||||
|
@ -121,15 +136,11 @@ public class HttpConnector {
|
||||||
urlConn.disconnect();
|
urlConn.disconnect();
|
||||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
||||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
||||||
log
|
final String msg = String
|
||||||
.error(
|
.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage());
|
||||||
String
|
log.error(msg);
|
||||||
.format(
|
|
||||||
"HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
Thread.sleep(defaultDelay * 1000);
|
Thread.sleep(defaultDelay * 1000);
|
||||||
errorList
|
errorList.add(msg);
|
||||||
.add(
|
|
||||||
String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
|
||||||
urlConn.disconnect();
|
urlConn.disconnect();
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||||
} else {
|
} else {
|
||||||
|
@ -138,7 +149,7 @@ public class HttpConnector {
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
log.error("error while retrieving from http-connection occurred: {}", requestUrl, e);
|
||||||
Thread.sleep(defaultDelay * 1000);
|
Thread.sleep(defaultDelay * 1000);
|
||||||
errorList.add(e.getMessage());
|
errorList.add(e.getMessage());
|
||||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||||
|
@ -149,12 +160,12 @@ public class HttpConnector {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
log.debug("StatusCode: {}", urlConn.getResponseMessage());
|
||||||
|
|
||||||
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||||
if (e.getKey() != null) {
|
if (e.getKey() != null) {
|
||||||
for (final String v : e.getValue()) {
|
for (final String v : e.getValue()) {
|
||||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
log.debug(" key: {} value: {}", e.getKey(), v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -183,37 +194,6 @@ public class HttpConnector {
|
||||||
"The requested url has been MOVED, but 'location' param is MISSING");
|
"The requested url has been MOVED, but 'location' param is MISSING");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
|
||||||
*/
|
|
||||||
public void initTrustManager() {
|
|
||||||
final X509TrustManager tm = new X509TrustManager() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public X509Certificate[] getAcceptedIssuers() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
try {
|
|
||||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
|
||||||
ctx.init(null, new TrustManager[] {
|
|
||||||
tm
|
|
||||||
}, null);
|
|
||||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
|
||||||
} catch (final GeneralSecurityException e) {
|
|
||||||
log.fatal(e);
|
|
||||||
throw new IllegalStateException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getMaxNumberOfRetry() {
|
public int getMaxNumberOfRetry() {
|
||||||
return maxNumberOfRetry;
|
return maxNumberOfRetry;
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,7 +40,7 @@ public class DnetCollectorWorkerApplicationTests {
|
||||||
public void testFeeding(@TempDir Path testDir) throws Exception {
|
public void testFeeding(@TempDir Path testDir) throws Exception {
|
||||||
|
|
||||||
System.out.println(testDir.toString());
|
System.out.println(testDir.toString());
|
||||||
CollectorWorker worker = new CollectorWorker(new CollectorPluginFactory(), getApi(),
|
CollectorWorker worker = new CollectorWorker(getApi(),
|
||||||
"file://" + testDir.toString() + "/file.seq", testDir.toString() + "/file.seq");
|
"file://" + testDir.toString() + "/file.seq", testDir.toString() + "/file.seq");
|
||||||
worker.collect();
|
worker.collect();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue