2020-03-27 10:42:17 +01:00
|
|
|
package eu.dnetlib.dhp.oa.provision.utils;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
import com.google.common.collect.Lists;
|
2020-02-13 16:53:27 +01:00
|
|
|
import java.io.StringReader;
|
|
|
|
import java.io.StringWriter;
|
|
|
|
import java.util.Arrays;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.List;
|
|
|
|
import javax.xml.stream.*;
|
|
|
|
import javax.xml.stream.events.Namespace;
|
|
|
|
import javax.xml.stream.events.StartElement;
|
|
|
|
import javax.xml.stream.events.XMLEvent;
|
|
|
|
import org.apache.solr.common.SolrInputDocument;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Optimized version of the document parser, drop in replacement of InputDocumentFactory.
|
|
|
|
*
|
2020-04-18 12:42:58 +02:00
|
|
|
* <p>Faster because:
|
|
|
|
*
|
2020-02-13 16:53:27 +01:00
|
|
|
* <ul>
|
2020-04-18 12:42:58 +02:00
|
|
|
* <li>Doesn't create a DOM for the full document
|
|
|
|
* <li>Doesn't execute xpaths agains the DOM
|
|
|
|
* <li>Quickly serialize the 'result' element directly in a string.
|
|
|
|
* <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel
|
2020-02-13 16:53:27 +01:00
|
|
|
* </ul>
|
|
|
|
*
|
2020-04-18 12:42:58 +02:00
|
|
|
* <p>This class is fully reentrant and can be invoked in parallel.
|
2020-02-13 16:53:27 +01:00
|
|
|
*
|
|
|
|
* @author claudio
|
|
|
|
*/
|
|
|
|
public class StreamingInputDocumentFactory {
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String INDEX_FIELD_PREFIX = "__";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String RESULT = "result";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final List<String> dateFormats =
|
|
|
|
Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String DEFAULTDNETRESULT = "dnetResult";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String TARGETFIELDS = "targetFields";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final String ROOT_ELEMENT = "indexRecord";
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private static final int MAX_FIELD_LENGTH = 25000;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private ThreadLocal<XMLInputFactory> inputFactory =
|
|
|
|
ThreadLocal.withInitial(() -> XMLInputFactory.newInstance());
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private ThreadLocal<XMLOutputFactory> outputFactory =
|
|
|
|
ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance());
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private ThreadLocal<XMLEventFactory> eventFactory =
|
|
|
|
ThreadLocal.withInitial(() -> XMLEventFactory.newInstance());
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private String version;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private String dsId;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
private String resultName = DEFAULTDNETRESULT;
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
public StreamingInputDocumentFactory(final String version, final String dsId) {
|
|
|
|
this(version, dsId, DEFAULTDNETRESULT);
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
public StreamingInputDocumentFactory(
|
|
|
|
final String version, final String dsId, final String resultName) {
|
|
|
|
this.version = version;
|
|
|
|
this.dsId = dsId;
|
|
|
|
this.resultName = resultName;
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
public SolrInputDocument parseDocument(final String inputDocument) {
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final StringWriter results = new StringWriter();
|
|
|
|
final List<Namespace> nsList = Lists.newLinkedList();
|
|
|
|
try {
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
XMLEventReader parser =
|
|
|
|
inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
while (parser.hasNext()) {
|
|
|
|
final XMLEvent event = parser.nextEvent();
|
|
|
|
if ((event != null) && event.isStartElement()) {
|
|
|
|
final String localName = event.asStartElement().getName().getLocalPart();
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (ROOT_ELEMENT.equals(localName)) {
|
|
|
|
nsList.addAll(getNamespaces(event));
|
|
|
|
} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
|
|
|
|
final XMLEvent text = parser.nextEvent();
|
|
|
|
String recordId = getText(text);
|
|
|
|
indexDocument.addField(INDEX_RECORD_ID, recordId);
|
|
|
|
} else if (TARGETFIELDS.equals(localName)) {
|
|
|
|
parseTargetFields(indexDocument, parser);
|
|
|
|
} else if (resultName.equals(localName)) {
|
|
|
|
copyResult(indexDocument, results, parser, nsList, resultName);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (version != null) {
|
|
|
|
indexDocument.addField(DS_VERSION, version);
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (dsId != null) {
|
|
|
|
indexDocument.addField(DS_ID, dsId);
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
|
|
|
|
indexDocument.clear();
|
|
|
|
System.err.println("missing indexrecord id:\n" + inputDocument);
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
return indexDocument;
|
|
|
|
} catch (XMLStreamException e) {
|
|
|
|
return new SolrInputDocument();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private List<Namespace> getNamespaces(final XMLEvent event) {
|
|
|
|
final List<Namespace> res = Lists.newLinkedList();
|
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
|
|
|
|
while (nsIter.hasNext()) {
|
|
|
|
Namespace ns = nsIter.next();
|
|
|
|
res.add(ns);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse the targetFields block and add fields to the solr document.
|
|
|
|
*
|
|
|
|
* @param indexDocument
|
|
|
|
* @param parser
|
|
|
|
* @throws XMLStreamException
|
|
|
|
*/
|
|
|
|
protected void parseTargetFields(
|
|
|
|
final SolrInputDocument indexDocument, final XMLEventReader parser)
|
|
|
|
throws XMLStreamException {
|
|
|
|
|
|
|
|
boolean hasFields = false;
|
|
|
|
|
|
|
|
while (parser.hasNext()) {
|
|
|
|
final XMLEvent targetEvent = parser.nextEvent();
|
|
|
|
if (targetEvent.isEndElement()
|
|
|
|
&& targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (targetEvent.isStartElement()) {
|
|
|
|
final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
|
|
|
|
final XMLEvent text = parser.nextEvent();
|
|
|
|
|
|
|
|
String data = getText(text);
|
|
|
|
|
|
|
|
addField(indexDocument, fieldName, data);
|
|
|
|
hasFields = true;
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
}
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (!hasFields) {
|
|
|
|
indexDocument.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copy the /indexRecord/result element and children, preserving namespace declarations etc.
|
|
|
|
*
|
|
|
|
* @param indexDocument
|
|
|
|
* @param results
|
|
|
|
* @param parser
|
|
|
|
* @param nsList
|
|
|
|
* @throws XMLStreamException
|
|
|
|
*/
|
|
|
|
protected void copyResult(
|
|
|
|
final SolrInputDocument indexDocument,
|
|
|
|
final StringWriter results,
|
|
|
|
final XMLEventReader parser,
|
|
|
|
final List<Namespace> nsList,
|
|
|
|
final String dnetResult)
|
|
|
|
throws XMLStreamException {
|
|
|
|
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
|
|
|
|
|
|
|
|
for (Namespace ns : nsList) {
|
|
|
|
eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
|
2020-02-13 16:53:27 +01:00
|
|
|
}
|
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
StartElement newRecord =
|
|
|
|
eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
|
|
|
|
|
|
|
|
// new root record
|
|
|
|
writer.add(newRecord);
|
2020-02-13 16:53:27 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
// copy the rest as it is
|
|
|
|
while (parser.hasNext()) {
|
|
|
|
final XMLEvent resultEvent = parser.nextEvent();
|
|
|
|
|
|
|
|
// TODO: replace with depth tracking instead of close tag tracking.
|
|
|
|
if (resultEvent.isEndElement()
|
|
|
|
&& resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
|
|
|
|
writer.add(eventFactory.get().createEndElement("", null, RESULT));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
writer.add(resultEvent);
|
|
|
|
}
|
|
|
|
writer.close();
|
|
|
|
indexDocument.addField(INDEX_RESULT, results.toString());
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Helper used to add a field to a solr doc. It avoids to add empy fields
|
|
|
|
*
|
|
|
|
* @param indexDocument
|
|
|
|
* @param field
|
|
|
|
* @param value
|
|
|
|
*/
|
|
|
|
private final void addField(
|
|
|
|
final SolrInputDocument indexDocument, final String field, final String value) {
|
|
|
|
String cleaned = value.trim();
|
|
|
|
if (!cleaned.isEmpty()) {
|
|
|
|
// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
|
|
|
|
indexDocument.addField(field.toLowerCase(), cleaned);
|
2020-02-13 16:53:27 +01:00
|
|
|
}
|
2020-04-27 14:45:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Helper used to get the string from a text element.
|
|
|
|
*
|
|
|
|
* @param text
|
|
|
|
* @return the
|
|
|
|
*/
|
|
|
|
protected final String getText(final XMLEvent text) {
|
|
|
|
if (text.isEndElement()) // log.warn("skipping because isEndOfElement " +
|
|
|
|
// text.asEndElement().getName().getLocalPart());
|
|
|
|
return "";
|
|
|
|
|
|
|
|
final String data = text.asCharacters().getData();
|
|
|
|
if (data != null && data.length() > MAX_FIELD_LENGTH) {
|
|
|
|
return data.substring(0, MAX_FIELD_LENGTH);
|
|
|
|
}
|
|
|
|
|
|
|
|
return data;
|
|
|
|
}
|
2020-02-13 16:53:27 +01:00
|
|
|
}
|