Removed the old, barely used messaging system from the collection and transformation workflows
code refactor
branch: pull/91/head
parent 184e7b3856
commit 98b9498b57
Modified: AggregationCounter.java (resulting version)
@@ -1,45 +1,45 @@

package eu.dnetlib.dhp.aggregation.common;

import java.io.Serializable;

import org.apache.spark.util.LongAccumulator;

public class AggregationCounter implements Serializable {

	private LongAccumulator totalItems;
	private LongAccumulator errorItems;
	private LongAccumulator processedItems;

	public AggregationCounter() {
	}

	public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) {
		this.totalItems = totalItems;
		this.errorItems = errorItems;
		this.processedItems = processedItems;
	}

	public LongAccumulator getTotalItems() {
		return totalItems;
	}

	public void setTotalItems(LongAccumulator totalItems) {
		this.totalItems = totalItems;
	}

	public LongAccumulator getErrorItems() {
		return errorItems;
	}

	public void setErrorItems(LongAccumulator errorItems) {
		this.errorItems = errorItems;
	}

	public LongAccumulator getProcessedItems() {
		return processedItems;
	}

	public void setProcessedItems(LongAccumulator processedItems) {
		this.processedItems = processedItems;
	}
}
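AggregationCounter is a thin Serializable wrapper around three Spark LongAccumulators. A minimal usage sketch (not part of this commit), assuming an existing SparkSession; the accumulator names are placeholders:

import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;

public class AggregationCounterExample {

	// Registers three named accumulators on the given SparkSession and wraps them.
	public static AggregationCounter create(SparkSession spark) {
		LongAccumulator total = spark.sparkContext().longAccumulator("TotalItems");
		LongAccumulator errors = spark.sparkContext().longAccumulator("ErrorItems");
		LongAccumulator processed = spark.sparkContext().longAccumulator("ProcessedItems");
		return new AggregationCounter(total, errors, processed);
	}
}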
New file: CollectorWorker.java
@@ -0,0 +1,93 @@

package eu.dnetlib.dhp.collection.worker;

import java.io.IOException;
import java.net.URI;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;

public class CollectorWorker {

	private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class);

	private final CollectorPluginFactory collectorPluginFactory;

	private final ApiDescriptor api;

	private final String hdfsuri;

	private final String hdfsPath;

	public CollectorWorker(
		final CollectorPluginFactory collectorPluginFactory,
		final ApiDescriptor api,
		final String hdfsuri,
		final String hdfsPath) {
		this.collectorPluginFactory = collectorPluginFactory;
		this.api = api;
		this.hdfsuri = hdfsuri;
		this.hdfsPath = hdfsPath;
	}

	public void collect() throws CollectorException {
		try {
			final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());

			// ====== Init HDFS File System Object
			Configuration conf = new Configuration();
			// Set FileSystem URI
			conf.set("fs.defaultFS", hdfsuri);
			// Because of Maven
			conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
			conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

			System.setProperty("hadoop.home.dir", "/");
			// Get the filesystem - HDFS
			FileSystem.get(URI.create(hdfsuri), conf);
			Path hdfswritepath = new Path(hdfsPath);

			log.info("Created path " + hdfswritepath.toString());

			final AtomicInteger counter = new AtomicInteger(0);
			try (SequenceFile.Writer writer = SequenceFile
				.createWriter(
					conf,
					SequenceFile.Writer.file(hdfswritepath),
					SequenceFile.Writer.keyClass(IntWritable.class),
					SequenceFile.Writer.valueClass(Text.class))) {
				final IntWritable key = new IntWritable(counter.get());
				final Text value = new Text();
				plugin
					.collect(api)
					.forEach(
						content -> {
							key.set(counter.getAndIncrement());
							value.set(content);
							try {
								writer.append(key, value);
							} catch (IOException e) {
								throw new RuntimeException(e);
							}
						});
			}
		} catch (Throwable e) {
			throw new CollectorException("Error on collecting ", e);
		}
	}
}
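The worker stores each collected record as an (IntWritable, Text) pair in a Hadoop sequence file. A small read-back sketch (not part of this commit), shown only to illustrate the resulting file layout; the class name is made up:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class CollectedRecordsReader {

	// Counts the records that CollectorWorker.collect() wrote to the given sequence file.
	public static long count(String hdfsuri, String hdfsPath) throws IOException {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfsuri);
		long n = 0;
		try (SequenceFile.Reader reader = new SequenceFile.Reader(
			conf, SequenceFile.Reader.file(new Path(hdfsPath)))) {
			final IntWritable key = new IntWritable();
			final Text value = new Text();
			while (reader.next(key, value)) {
				n++;
			}
		}
		return n;
	}
}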
New file: CollectorWorkerApplication.java
@@ -0,0 +1,55 @@

package eu.dnetlib.dhp.collection.worker;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;

/**
 * CollectorWorkerApplication is the main class responsible for starting the D-Net collection into HDFS. This module
 * is executed on the Hadoop cluster and takes as input the parameters that tell it which collector plugin to use and
 * the HDFS path where the collected data should be stored.
 *
 * @author Sandro La Bruzzo
 */
public class CollectorWorkerApplication {

	private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class);

	private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();

	/**
	 * @param args
	 */
	public static void main(final String[] args) throws Exception {

		final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					CollectorWorker.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/collection/collector_parameter.json")));
		argumentParser.parseArgument(args);

		final String hdfsuri = argumentParser.get("namenode");

		log.info("hdfsURI is {}", hdfsuri);
		final String hdfsPath = argumentParser.get("hdfsPath");
		log.info("hdfsPath is {}", hdfsPath);
		final String apiDescriptor = argumentParser.get("apidescriptor");
		log.info("apiDescriptor is {}", apiDescriptor);

		final ObjectMapper jsonMapper = new ObjectMapper();

		final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class);

		final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, hdfsPath);
		worker.collect();
	}
}
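A hypothetical launch sketch (not part of this commit): the argument long names follow collector_parameter.json below, while the namenode URI, target path and API descriptor JSON are invented example values.

import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication;

public class CollectorWorkerLauncherExample {

	public static void main(String[] args) throws Exception {
		// Placeholder values; a real run would receive these from the Oozie workflow.
		CollectorWorkerApplication.main(new String[] {
			"--namenode", "hdfs://nameservice1",
			"--hdfsPath", "/tmp/collection/native_records.seq",
			"--apidescriptor", "{\"id\":\"api_oai\",\"baseUrl\":\"https://example.org/oai\",\"protocol\":\"oai\"}",
			"--workflowId", "wf-001"
		});
	}
}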
Deleted file: DnetCollectorWorker.java
@@ -1,139 +0,0 @@

package eu.dnetlib.dhp.collection.worker;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import eu.dnetlib.message.MessageType;

public class DnetCollectorWorker {

	private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);

	private final CollectorPluginFactory collectorPluginFactory;

	private final ArgumentApplicationParser argumentParser;

	private final MessageManager manager;

	public DnetCollectorWorker(
		final CollectorPluginFactory collectorPluginFactory,
		final ArgumentApplicationParser argumentParser,
		final MessageManager manager)
		throws DnetCollectorException {
		this.collectorPluginFactory = collectorPluginFactory;
		this.argumentParser = argumentParser;
		this.manager = manager;
	}

	public void collect() throws DnetCollectorException {
		try {
			final ObjectMapper jsonMapper = new ObjectMapper();
			final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class);

			final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol());

			final String hdfsuri = argumentParser.get("namenode");

			// ====== Init HDFS File System Object
			Configuration conf = new Configuration();
			// Set FileSystem URI
			conf.set("fs.defaultFS", hdfsuri);
			// Because of Maven
			conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
			conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

			System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS"));
			System.setProperty("hadoop.home.dir", "/");
			// Get the filesystem - HDFS
			FileSystem.get(URI.create(hdfsuri), conf);
			Path hdfswritepath = new Path(argumentParser.get("hdfsPath"));

			log.info("Created path " + hdfswritepath.toString());

			final Map<String, String> ongoingMap = new HashMap<>();
			final Map<String, String> reportMap = new HashMap<>();
			final AtomicInteger counter = new AtomicInteger(0);
			try (SequenceFile.Writer writer = SequenceFile
				.createWriter(
					conf,
					SequenceFile.Writer.file(hdfswritepath),
					SequenceFile.Writer.keyClass(IntWritable.class),
					SequenceFile.Writer.valueClass(Text.class))) {
				final IntWritable key = new IntWritable(counter.get());
				final Text value = new Text();
				plugin
					.collect(api)
					.forEach(
						content -> {
							key.set(counter.getAndIncrement());
							value.set(content);
							if (counter.get() % 10 == 0) {
								try {
									ongoingMap.put("ongoing", "" + counter.get());
									log
										.debug(
											"Sending message: "
												+ manager
													.sendMessage(
														new Message(
															argumentParser.get("workflowId"),
															"Collection",
															MessageType.ONGOING,
															ongoingMap),
														argumentParser.get("rabbitOngoingQueue"),
														true,
														false));
								} catch (Exception e) {
									log.error("Error on sending message ", e);
								}
							}
							try {
								writer.append(key, value);
							} catch (IOException e) {
								throw new RuntimeException(e);
							}
						});
			}
			ongoingMap.put("ongoing", "" + counter.get());
			manager
				.sendMessage(
					new Message(
						argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap),
					argumentParser.get("rabbitOngoingQueue"),
					true,
					false);
			reportMap.put("collected", "" + counter.get());
			manager
				.sendMessage(
					new Message(
						argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap),
					argumentParser.get("rabbitOngoingQueue"),
					true,
					false);
			manager.close();
		} catch (Throwable e) {
			throw new DnetCollectorException("Error on collecting ", e);
		}
	}
}
Deleted file: DnetCollectorWorkerApplication.java
@@ -1,49 +0,0 @@

package eu.dnetlib.dhp.collection.worker;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.MessageManager;

/**
 * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
 * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector
 * plugin to use and where store the data into HDFS path
 *
 * @author Sandro La Bruzzo
 */
public class DnetCollectorWorkerApplication {

	private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);

	private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();

	private static ArgumentApplicationParser argumentParser;

	/** @param args */
	public static void main(final String[] args) throws Exception {

		argumentParser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					DnetCollectorWorker.class
						.getResourceAsStream(
							"/eu/dnetlib/collector/worker/collector_parameter.json")));
		argumentParser.parseArgument(args);
		log.info("hdfsPath =" + argumentParser.get("hdfsPath"));
		log.info("json = " + argumentParser.get("apidescriptor"));
		final MessageManager manager = new MessageManager(
			argumentParser.get("rabbitHost"),
			argumentParser.get("rabbitUser"),
			argumentParser.get("rabbitPassword"),
			false,
			false,
			null);
		final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager);
		worker.collect();
	}
}
Modified: DnetTransformationException.java (resulting version)
@@ -1,28 +1,29 @@

package eu.dnetlib.dhp.transformation;

public class DnetTransformationException extends Exception {

	public DnetTransformationException() {
		super();
	}

	public DnetTransformationException(
		final String message,
		final Throwable cause,
		final boolean enableSuppression,
		final boolean writableStackTrace) {
		super(message, cause, enableSuppression, writableStackTrace);
	}

	public DnetTransformationException(final String message, final Throwable cause) {
		super(message, cause);
	}

	public DnetTransformationException(final String message) {
		super(message);
	}

	public DnetTransformationException(final Throwable cause) {
		super(cause);
	}
}
Modified: TransformationFactory.java (resulting version)
@@ -1,62 +1,69 @@

package eu.dnetlib.dhp.transformation;

import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class TransformationFactory {

	private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
	public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//TITLE = \"%s\" return $x//CODE/text()";

	public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
		final Map<String, String> jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService)
		throws DnetTransformationException {

		try {
			final String transformationPlugin = jobArgument.get("transformationPlugin");

			log.info("Transformation plugin required " + transformationPlugin);
			switch (transformationPlugin) {
				case "XSLT_TRANSFORM": {
					final String transformationRuleName = jobArgument.get("transformationRuleTitle");
					if (StringUtils.isBlank(transformationRuleName))
						throw new DnetTransformationException("Missing Parameter transformationRule");
					final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);

					final String transformationRule = queryTransformationRuleFromIS(
						transformationRuleName, isLookupService);

					final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation"));
					return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation,
						vocabularies);

				}
				default:
					throw new DnetTransformationException(
						"transformation plugin does not exists for " + transformationPlugin);

			}

		} catch (Throwable e) {
			throw new DnetTransformationException(e);
		}
	}

	private static String queryTransformationRuleFromIS(final String transformationRuleName,
		final ISLookUpService isLookUpService) throws Exception {
		final String query = String.format(TRULE_XQUERY, transformationRuleName);
		log.info("asking query to IS: " + query);
		List<String> result = isLookUpService.quickSearchProfile(query);

		if (result == null || result.isEmpty())
			throw new DnetTransformationException(
				"Unable to find transformation rule with name: " + transformationRuleName);
		return result.get(0);
	}

}
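A usage sketch for the factory (not part of this commit): it builds the XSLT transformation function from job arguments; the rule title and date values are placeholders, and the counters and ISLookUpService instances are assumed to come from elsewhere.

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.function.MapFunction;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.DnetTransformationException;
import eu.dnetlib.dhp.transformation.TransformationFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class TransformationFactoryExample {

	public static MapFunction<MetadataRecord, MetadataRecord> build(
		AggregationCounter counters, ISLookUpService isLookupService) throws DnetTransformationException {
		Map<String, String> args = new HashMap<>();
		args.put("transformationPlugin", "XSLT_TRANSFORM");
		args.put("transformationRuleTitle", "My XSLT rule");  // the rule TITLE as registered in the IS
		args.put("dateOfTransformation", "1617183600000");    // epoch millis, parsed as a Long by the factory
		return TransformationFactory.getTransformationPlugin(args, counters, isLookupService);
	}
}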
Modified: XSLTTransformationFunction.java (resulting version)
@@ -1,66 +1,68 @@

package eu.dnetlib.dhp.transformation.xslt;

import java.io.ByteArrayInputStream;
import java.io.StringWriter;

import javax.xml.transform.stream.StreamSource;

import org.apache.spark.api.java.function.MapFunction;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import net.sf.saxon.s9api.*;

public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {

	private final AggregationCounter aggregationCounter;

	private final String transformationRule;

	private final Cleaner cleanFunction;

	private final long dateOfTransformation;

	public XSLTTransformationFunction(
		final AggregationCounter aggregationCounter,
		final String transformationRule,
		long dateOfTransformation,
		final VocabularyGroup vocabularies)
		throws Exception {
		this.aggregationCounter = aggregationCounter;
		this.transformationRule = transformationRule;
		this.dateOfTransformation = dateOfTransformation;
		cleanFunction = new Cleaner(vocabularies);
	}

	@Override
	public MetadataRecord call(MetadataRecord value) {
		aggregationCounter.getTotalItems().add(1);
		try {
			Processor processor = new Processor(false);
			processor.registerExtensionFunction(cleanFunction);
			final XsltCompiler comp = processor.newXsltCompiler();
			XsltExecutable xslt = comp
				.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
			XdmNode source = processor
				.newDocumentBuilder()
				.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
			XsltTransformer trans = xslt.load();
			trans.setInitialContextNode(source);
			final StringWriter output = new StringWriter();
			Serializer out = processor.newSerializer(output);
			out.setOutputProperty(Serializer.Property.METHOD, "xml");
			out.setOutputProperty(Serializer.Property.INDENT, "yes");
			trans.setDestination(out);
			trans.transform();
			final String xml = output.toString();
			value.setBody(xml);
			value.setDateOfTransformation(dateOfTransformation);
			aggregationCounter.getProcessedItems().add(1);
			return value;
		} catch (Throwable e) {
			aggregationCounter.getErrorItems().add(1);
			return null;
		}
	}
}
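A minimal sketch of applying the function to a Spark Dataset of MetadataRecord (not part of this commit): the bean encoder and the null filter are assumptions based on call() returning null for records that fail transformation.

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;

import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;

public class XsltTransformExample {

	public static Dataset<MetadataRecord> transform(
		Dataset<MetadataRecord> input,
		AggregationCounter counter,
		String xslt,
		long dateOfTransformation,
		VocabularyGroup vocabularies) throws Exception {
		Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
		return input
			.map(new XSLTTransformationFunction(counter, xslt, dateOfTransformation, vocabularies), encoder)
			.filter((FilterFunction<MetadataRecord>) r -> r != null); // drop records that failed to transform
	}
}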
New file: collector_parameter.json (eu/dnetlib/dhp/collection)
@@ -0,0 +1,6 @@

[
	{"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
	{"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
	{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
	{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": false}
]
New file: Oozie job configuration defaults
@@ -0,0 +1,18 @@

<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>
Deleted file: collector_parameter.json (eu/dnetlib/collector/worker)
@@ -1,12 +0,0 @@

[
	{"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
	{"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
	{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
	{"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true},
	{"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true},
	{"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true},
	{"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true},
	{"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true},
	{"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true},
	{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true}
]
New file: Oozie job configuration defaults
@@ -0,0 +1,18 @@

<configuration>
	<property>
		<name>jobTracker</name>
		<value>yarnRM</value>
	</property>
	<property>
		<name>nameNode</name>
		<value>hdfs://nameservice1</value>
	</property>
	<property>
		<name>oozie.use.system.libpath</name>
		<value>true</value>
	</property>
	<property>
		<name>oozie.action.sharelib.for.spark</name>
		<value>spark2</value>
	</property>
</configuration>