Merge branch 'hadoop_aggregator' of code-repo.d4science.org:D-Net/dnet-hadoop into hadoop_aggregator

This commit is contained in:
Michele Artini 2021-02-04 09:46:13 +01:00
commit 3ea8c328ac
3 changed files with 11 additions and 9 deletions

View File

@ -149,14 +149,14 @@ public class OaiIterator implements Iterator<String> {
try {
doc = reader.read(new StringReader(xml));
} catch (final DocumentException e) {
log.warn("Error parsing xml, I try to clean it: " + xml, e);
log.warn("Error parsing xml, I try to clean it. {}", e.getMessage());
final String cleaned = XmlCleaner.cleanAllEntities(xml);
try {
doc = reader.read(new StringReader(cleaned));
} catch (final DocumentException e1) {
final String resumptionToken = extractResumptionToken(xml);
if (resumptionToken == null) {
throw new CollectorException("Error parsing cleaned document:" + cleaned, e1);
throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1);
}
return resumptionToken;
}

View File

@ -19,16 +19,19 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
/**
* DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
* will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
* plugin to use and where store the data into HDFS path
* CollectorWorkerApplication is the main class responsible for starting the metadata collection process and storing the
* outcomes into HDFS. This application is executed on the Hadoop cluster: when invoked in the context of the metadata
* collection Oozie workflow, it receives all the input parameters necessary to instantiate the specific collection plugin
* and its related configuration.
*
* @author Sandro La Bruzzo
* @author Sandro La Bruzzo, Claudio Atzori
*/
public class CollectorWorkerApplication {
private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);
public static final String COLLECTOR_WORKER_ERRORS = "collectorWorker-errors";
/**
* @param args
*/
@ -60,7 +63,7 @@ public class CollectorWorkerApplication {
final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath);
CollectorPluginErrorLogList errors = worker.collect();
populateOOZIEEnv("collectorErrors", errors.toString());
populateOOZIEEnv(COLLECTOR_WORKER_ERRORS, errors.toString());
}

View File

@ -87,6 +87,7 @@
<arg>--apidescriptor</arg><arg>${apiDescription}</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<capture-output/>
</java>
<ok to="GenerateNativeStoreSparkJob"/>
<error to="FailCollection"/>
@ -133,7 +134,6 @@
<arg>--action</arg><arg>READ_UNLOCK</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
<capture-output/>
</java>
<ok to="CommitVersion"/>
<error to="Kill"/>
@ -165,7 +165,6 @@
<arg>--action</arg><arg>READ_UNLOCK</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
<capture-output/>
</java>
<ok to="RollBack"/>
<error to="Kill"/>