2019-04-03 16:05:16 +02:00
|
|
|
package eu.dnetlib.dhp.transformation;
|
2019-04-03 16:03:36 +02:00
|
|
|
|
|
|
|
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.transformation.functions.Cleaner;
import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import net.sf.saxon.s9api.*;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.util.LongAccumulator;

import javax.xml.transform.stream.StreamSource;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Map;
|
2019-04-11 15:39:29 +02:00
|
|
|
|
2019-04-03 16:03:36 +02:00
|
|
|
public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {
|
|
|
|
|
|
|
|
|
2019-04-03 16:05:16 +02:00
|
|
|
private final LongAccumulator totalItems;
|
2019-04-11 15:39:29 +02:00
|
|
|
private final LongAccumulator errorItems;
|
|
|
|
private final LongAccumulator transformedItems;
|
2019-10-24 11:36:59 +02:00
|
|
|
private final String transformationRule;
|
|
|
|
private final Cleaner cleanFunction;
|
2019-04-11 15:39:29 +02:00
|
|
|
|
2019-10-10 11:33:51 +02:00
|
|
|
|
2019-04-11 15:39:29 +02:00
|
|
|
private final long dateOfTransformation;
|
|
|
|
|
2019-04-03 16:05:16 +02:00
|
|
|
|
2019-10-24 11:36:59 +02:00
|
|
|
public TransformFunction(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator transformedItems, final String transformationRule, long dateOfTransformation, final Map<String, Vocabulary> vocabularies) throws Exception {
|
2019-04-03 16:05:16 +02:00
|
|
|
this.totalItems= totalItems;
|
2019-04-11 15:39:29 +02:00
|
|
|
this.errorItems = errorItems;
|
|
|
|
this.transformedItems = transformedItems;
|
2019-10-24 11:36:59 +02:00
|
|
|
this.transformationRule = transformationRule;
|
2019-04-11 15:39:29 +02:00
|
|
|
this.dateOfTransformation = dateOfTransformation;
|
2019-10-24 11:36:59 +02:00
|
|
|
cleanFunction = new Cleaner(vocabularies);
|
2019-04-03 16:05:16 +02:00
|
|
|
}
|
|
|
|
|
2019-04-03 16:03:36 +02:00
|
|
|
@Override
|
2019-04-11 15:39:29 +02:00
|
|
|
public MetadataRecord call(MetadataRecord value) {
|
2019-04-03 16:05:16 +02:00
|
|
|
totalItems.add(1);
|
2019-04-11 15:39:29 +02:00
|
|
|
try {
|
2019-10-10 11:33:51 +02:00
|
|
|
Processor processor = new Processor(false);
|
|
|
|
processor.registerExtensionFunction(cleanFunction);
|
|
|
|
final XsltCompiler comp = processor.newXsltCompiler();
|
2019-10-24 11:36:59 +02:00
|
|
|
XsltExecutable xslt = comp.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes())));
|
2019-10-10 11:33:51 +02:00
|
|
|
XdmNode source = processor.newDocumentBuilder().build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes())));
|
|
|
|
XsltTransformer trans = xslt.load();
|
|
|
|
trans.setInitialContextNode(source);
|
2019-04-11 15:39:29 +02:00
|
|
|
final StringWriter output = new StringWriter();
|
2019-10-10 11:33:51 +02:00
|
|
|
Serializer out = processor.newSerializer(output);
|
|
|
|
out.setOutputProperty(Serializer.Property.METHOD,"xml");
|
|
|
|
out.setOutputProperty(Serializer.Property.INDENT, "yes");
|
|
|
|
trans.setDestination(out);
|
|
|
|
trans.transform();
|
2019-04-11 15:39:29 +02:00
|
|
|
final String xml = output.toString();
|
|
|
|
value.setBody(xml);
|
2019-10-24 11:36:59 +02:00
|
|
|
value.setDateOfTransformation(dateOfTransformation);
|
2019-04-11 15:39:29 +02:00
|
|
|
transformedItems.add(1);
|
|
|
|
return value;
|
|
|
|
}catch (Throwable e) {
|
|
|
|
errorItems.add(1);
|
|
|
|
return null;
|
|
|
|
}
|
2019-04-03 16:03:36 +02:00
|
|
|
}
|
2019-10-10 11:33:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
2019-04-11 15:39:29 +02:00
|
|
|
}
|