2021-01-27 15:43:08 +01:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.transformation.xslt;
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
import java.io.StringWriter;
|
2021-03-03 10:17:16 +01:00
|
|
|
import java.nio.charset.StandardCharsets;
|
2021-01-28 09:51:17 +01:00
|
|
|
|
|
|
|
import javax.xml.transform.stream.StreamSource;
|
|
|
|
|
2021-03-03 10:17:16 +01:00
|
|
|
import org.apache.commons.io.IOUtils;
|
2021-01-28 09:51:17 +01:00
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
|
|
|
2021-01-27 15:43:08 +01:00
|
|
|
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
|
|
|
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
2021-02-26 10:58:48 +01:00
|
|
|
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
|
2021-01-27 15:43:08 +01:00
|
|
|
import net.sf.saxon.s9api.*;
|
|
|
|
|
|
|
|
public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {
|
|
|
|
|
2021-02-24 15:07:59 +01:00
|
|
|
public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform";
|
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
private final AggregationCounter aggregationCounter;
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
private final String transformationRule;
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
private final Cleaner cleanFunction;
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
private final long dateOfTransformation;
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
public XSLTTransformationFunction(
|
|
|
|
final AggregationCounter aggregationCounter,
|
|
|
|
final String transformationRule,
|
|
|
|
long dateOfTransformation,
|
|
|
|
final VocabularyGroup vocabularies)
|
|
|
|
throws Exception {
|
|
|
|
this.aggregationCounter = aggregationCounter;
|
|
|
|
this.transformationRule = transformationRule;
|
|
|
|
this.dateOfTransformation = dateOfTransformation;
|
|
|
|
cleanFunction = new Cleaner(vocabularies);
|
|
|
|
}
|
2021-01-27 15:43:08 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
@Override
|
|
|
|
public MetadataRecord call(MetadataRecord value) {
|
|
|
|
aggregationCounter.getTotalItems().add(1);
|
|
|
|
try {
|
|
|
|
Processor processor = new Processor(false);
|
|
|
|
processor.registerExtensionFunction(cleanFunction);
|
2021-02-12 16:34:52 +01:00
|
|
|
processor.registerExtensionFunction(new DateCleaner());
|
2021-04-09 14:35:30 +02:00
|
|
|
processor.registerExtensionFunction(new PersonCleaner());
|
2021-03-03 10:17:16 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
final XsltCompiler comp = processor.newXsltCompiler();
|
|
|
|
XsltExecutable xslt = comp
|
2021-03-03 10:17:16 +01:00
|
|
|
.compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)));
|
2021-01-28 09:51:17 +01:00
|
|
|
XdmNode source = processor
|
|
|
|
.newDocumentBuilder()
|
2021-03-03 10:17:16 +01:00
|
|
|
.build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
|
2021-01-28 09:51:17 +01:00
|
|
|
XsltTransformer trans = xslt.load();
|
|
|
|
trans.setInitialContextNode(source);
|
|
|
|
final StringWriter output = new StringWriter();
|
|
|
|
Serializer out = processor.newSerializer(output);
|
|
|
|
out.setOutputProperty(Serializer.Property.METHOD, "xml");
|
|
|
|
out.setOutputProperty(Serializer.Property.INDENT, "yes");
|
2021-03-03 10:17:16 +01:00
|
|
|
|
2021-01-28 09:51:17 +01:00
|
|
|
trans.setDestination(out);
|
|
|
|
trans.transform();
|
|
|
|
final String xml = output.toString();
|
|
|
|
value.setBody(xml);
|
|
|
|
value.setDateOfTransformation(dateOfTransformation);
|
|
|
|
aggregationCounter.getProcessedItems().add(1);
|
|
|
|
return value;
|
|
|
|
} catch (Throwable e) {
|
|
|
|
aggregationCounter.getErrorItems().add(1);
|
2021-02-12 17:27:08 +01:00
|
|
|
throw new RuntimeException(e);
|
2021-01-28 09:51:17 +01:00
|
|
|
}
|
|
|
|
}
|
2021-01-27 15:43:08 +01:00
|
|
|
}
|