2021-01-28 09:51:17 +01:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.collection.worker;
|
|
|
|
|
2021-02-02 12:28:21 +01:00
|
|
|
import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*;
import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*;
import static eu.dnetlib.dhp.application.ApplicationUtils.*;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.common.AggregationUtility;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
|
2021-01-28 09:51:17 +01:00
|
|
|
|
|
|
|
/**
|
2021-02-03 17:58:22 +01:00
|
|
|
* CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes
|
|
|
|
* into HDFS. This application will be executed on the hadoop cluster, where invoked in the context of the metadata collection
|
|
|
|
* oozie workflow, it will receive all the input parameters necessary to instantiate the specific collection plugin and the
|
|
|
|
* relative specific configurations
|
2021-01-28 09:51:17 +01:00
|
|
|
*
|
2021-02-03 17:58:22 +01:00
|
|
|
* @author Sandro La Bruzzo, Claudio Atzori
|
2021-01-28 09:51:17 +01:00
|
|
|
*/
|
|
|
|
public class CollectorWorkerApplication {
|
|
|
|
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
|
|
|
|
|
2021-02-03 17:58:22 +01:00
|
|
|
public static final String COLLECTOR_WORKER_ERRORS = "collectorWorker-errors";
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
/**
|
|
|
|
* @param args
|
|
|
|
*/
|
2021-02-04 14:06:02 +01:00
|
|
|
public static void main(final String[] args) throws ParseException, IOException, CollectorException {
|
2021-01-28 09:51:17 +01:00
|
|
|
|
|
|
|
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
|
|
|
IOUtils
|
|
|
|
.toString(
|
|
|
|
CollectorWorker.class
|
|
|
|
.getResourceAsStream(
|
|
|
|
"/eu/dnetlib/dhp/collection/collector_parameter.json")));
|
|
|
|
argumentParser.parseArgument(args);
|
|
|
|
|
|
|
|
final String hdfsuri = argumentParser.get("namenode");
|
|
|
|
log.info("hdfsURI is {}", hdfsuri);
|
2021-01-29 16:42:41 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
final String apiDescriptor = argumentParser.get("apidescriptor");
|
2021-01-29 16:42:41 +01:00
|
|
|
log.info("apiDescriptor is {}", apiDescriptor);
|
|
|
|
|
|
|
|
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
|
|
|
|
log.info("mdStoreVersion is {}", mdStoreVersion);
|
2021-01-28 09:51:17 +01:00
|
|
|
|
2021-02-03 12:33:41 +01:00
|
|
|
final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class);
|
|
|
|
final String hdfsPath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
|
|
|
|
log.info("hdfs path is {}", hdfsPath);
|
|
|
|
|
|
|
|
final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class);
|
2021-01-28 09:51:17 +01:00
|
|
|
|
2021-02-03 12:33:41 +01:00
|
|
|
final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath);
|
|
|
|
CollectorPluginErrorLogList errors = worker.collect();
|
2021-01-28 09:51:17 +01:00
|
|
|
|
2021-02-03 17:58:22 +01:00
|
|
|
populateOOZIEEnv(COLLECTOR_WORKER_ERRORS, errors.toString());
|
2021-01-29 16:42:41 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
}
|
2021-01-29 16:42:41 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
}
|