dnet-hadoop/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction....

128 lines
3.9 KiB
Java

package eu.dnetlib.dhp.transformation.xslt;
import static eu.dnetlib.dhp.common.Constants.*;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import javax.xml.transform.stream.StreamSource;
import org.apache.avro.test.specialtypes.value;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.spark.api.java.function.MapFunction;
import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord;
import net.sf.saxon.s9api.*;
/**
 * Spark {@link MapFunction} that rewrites the body of a {@link MetadataRecord} by applying an XSLT
 * stylesheet through the Saxon s9api.
 *
 * <p>For every record a new Saxon {@link Processor} is created, the cleaning extension functions
 * ({@code Cleaner}, {@code DateCleaner}, {@code PersonCleaner}) are registered on it, and the
 * stylesheet held in {@code transformationRule} is compiled and executed against the record body.
 * The counters exposed by the {@link AggregationCounter} are incremented to track total, processed
 * and failed records.
 */
public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord>, Serializable {

    /** Base URI constant; it is not referenced inside this class. */
    public static final String QNAME_BASE_URI = "http://eu/dnetlib/transform";

    /** Name of the stylesheet parameter that receives the datasource identifier of the record. */
    private static final String DATASOURCE_ID_PARAM = "varDataSourceId";

    /** Name of the stylesheet parameter that receives the datasource name of the record. */
    private static final String DATASOURCE_NAME_PARAM = "varOfficialName";

    /** Message of the exception raised when the stylesheet cannot be compiled. */
    private static final String COMPILE_ERROR_MESSAGE = "error compiling the XSLT";

    private final AggregationCounter aggregationCounter;
    private final AggregatorReport report;
    private final String transformationRule;
    private final long dateOfTransformation;
    private final VocabularyGroup vocabularies;

    /**
     * Creates a transformation function bound to one stylesheet.
     *
     * @param aggregationCounter counters updated on every call (total, processed, error items)
     * @param report sink where Saxon failures are recorded, keyed by exception class name
     * @param transformationRule the XSLT stylesheet, as text
     * @param dateOfTransformation value stored on every successfully transformed record
     * @param vocabularies vocabularies handed to the {@code Cleaner} extension function
     */
    public XSLTTransformationFunction(
        final AggregationCounter aggregationCounter,
        final AggregatorReport report,
        final String transformationRule,
        long dateOfTransformation,
        final VocabularyGroup vocabularies) {
        this.aggregationCounter = aggregationCounter;
        this.report = report;
        this.transformationRule = transformationRule;
        this.vocabularies = vocabularies;
        this.dateOfTransformation = dateOfTransformation;
    }

    /**
     * Transforms a single record in place.
     *
     * <p>The stylesheet receives the datasource id and datasource name of the record provenance as
     * the parameters {@code varDataSourceId} and {@code varOfficialName}. On success the record body
     * is replaced with the indented XML output and its transformation date is set.
     *
     * @param value the record to transform; it is mutated and returned on success
     * @return the same record instance with the transformed body, or {@code null} when parsing or
     *         transforming the body fails (the failure is recorded in the report and the error
     *         counter is incremented)
     * @throws IllegalArgumentException if the stylesheet itself cannot be compiled
     */
    @Override
    public MetadataRecord call(MetadataRecord value) {
        aggregationCounter.getTotalItems().add(1);

        final Processor xsltProcessor = newProcessor();

        // The transformation result is serialized as indented XML into an in-memory buffer.
        final StringWriter output = new StringWriter();
        final Serializer out = xsltProcessor.newSerializer(output);
        out.setOutputProperty(Serializer.Property.METHOD, "xml");
        out.setOutputProperty(Serializer.Property.INDENT, "yes");

        final XsltTransformer transformer = compileTransformer(xsltProcessor);

        transformer
            .setParameter(
                new QName(DATASOURCE_ID_PARAM), new XdmAtomicValue(value.getProvenance().getDatasourceId()));
        transformer
            .setParameter(
                new QName(DATASOURCE_NAME_PARAM), new XdmAtomicValue(value.getProvenance().getDatasourceName()));

        try {
            final XdmNode source = xsltProcessor
                .newDocumentBuilder()
                .build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8)));
            transformer.setInitialContextNode(source);
            transformer.setDestination(out);
            transformer.transform();
        } catch (SaxonApiException e) {
            // A broken record must not fail the whole job: record the problem and drop the record.
            report.put(e.getClass().getName(), e.getMessage());
            aggregationCounter.getErrorItems().add(1);
            return null;
        }

        value.setBody(output.toString());
        value.setDateOfTransformation(dateOfTransformation);
        aggregationCounter.getProcessedItems().add(1);
        return value;
    }

    /** Builds a Saxon processor with the cleaning extension functions registered. */
    private Processor newProcessor() {
        final Processor xsltProcessor = new Processor(false);
        xsltProcessor.registerExtensionFunction(new Cleaner(vocabularies));
        xsltProcessor.registerExtensionFunction(new DateCleaner());
        xsltProcessor.registerExtensionFunction(new PersonCleaner());
        return xsltProcessor;
    }

    /**
     * Compiles the configured stylesheet and loads a transformer for it.
     *
     * <p>On a compilation failure the error is written to the report, the report is closed, and an
     * {@link IllegalArgumentException} wrapping the Saxon error is thrown. If closing the report
     * fails as well, that {@link IOException} is attached as a suppressed exception so it is not
     * lost.
     */
    private XsltTransformer compileTransformer(final Processor xsltProcessor) {
        try {
            return xsltProcessor
                .newXsltCompiler()
                .compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8)))
                .load();
        } catch (SaxonApiException e) {
            report.put(e.getClass().getName(), e.getMessage());
            final IllegalArgumentException failure = new IllegalArgumentException(COMPILE_ERROR_MESSAGE, e);
            try {
                report.close();
            } catch (IOException closeFailure) {
                // Keep the compilation error as the primary cause, but do not hide the close failure.
                failure.addSuppressed(closeFailure);
            }
            throw failure;
        }
    }

    /** @return the counter updated by {@link #call(MetadataRecord)} */
    public AggregationCounter getAggregationCounter() {
        return aggregationCounter;
    }

    /** @return the XSLT stylesheet text applied to every record */
    public String getTransformationRule() {
        return transformationRule;
    }

    /** @return the transformation date stored on successfully transformed records */
    public long getDateOfTransformation() {
        return dateOfTransformation;
    }

    /** @return the vocabularies passed to the {@code Cleaner} extension function */
    public VocabularyGroup getVocabularies() {
        return vocabularies;
    }
}