package eu.dnetlib.data.mdstore.modular.mongodb.utils;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.mdstore.modular.MDFormatDescription;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Extracts the values of a set of XPath-defined fields from an XML record using VTD-XML.
 *
 * Created by sandro on 11/29/16.
 */
public class IndexFieldRecordParser {

    private static final Log log = LogFactory.getLog(IndexFieldRecordParser.class);

    /**
     * Evaluates an XPath expression and collects the normalized text of every matching node.
     *
     * @param ap    the AutoPilot used to evaluate the expression
     * @param vn    the navigator the text values are read from
     * @param xpath the XPath expression to evaluate
     * @return the normalized text of each matching node that has a text value, in evaluation
     *         order; empty when nothing matches
     * @throws Exception if the expression cannot be selected or evaluated
     */
    private static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws Exception {
        final List<String> results = new ArrayList<>();
        ap.selectXPath(xpath);
        while (ap.evalXPath() != -1) {
            final int t = vn.getText();
            // getText() yields -1 when the current node has no text; such nodes are skipped.
            if (t > -1) {
                results.add(vn.toNormalizedString(t));
            }
        }
        return results;
    }

    /**
     * Parses an XML record and extracts, for each given field description, the text values
     * selected by its XPath.
     *
     * @param record    the XML record to parse
     * @param mdformats the field descriptions (name and XPath) to extract
     * @return a map from each description's name to the list of values found for it, or
     *         {@code null} when {@code mdformats} is {@code null} or empty
     * @throws IndexFieldRecordParserException if the record cannot be parsed or an XPath
     *                                         cannot be evaluated
     */
    public Map<String, List<String>> parseRecord(final String record, final List<MDFormatDescription> mdformats)
            throws IndexFieldRecordParserException {
        if (mdformats == null || mdformats.isEmpty()) {
            return null;
        }

        final Map<String, List<String>> result = new HashMap<>();
        try {
            final VTDGen vg = new VTDGen();
            // Encode explicitly as UTF-8 so parsing does not depend on the JVM's platform
            // default charset. NOTE(review): assumes records are not declared with a
            // different encoding in their XML prolog - confirm against stored data.
            vg.setDoc(record.getBytes(StandardCharsets.UTF_8));
            vg.parse(true);
            final VTDNav vn = vg.getNav();
            final AutoPilot ap = new AutoPilot(vn);

            for (final MDFormatDescription description : mdformats) {
                final List<String> xpathResult = getTextValue(ap, vn, description.getXpath());
                result.put(description.getName(), xpathResult);
            }
            return result;
        } catch (Throwable e) {
            // Any failure, including a null record, is reported as a parser exception.
            throw new IndexFieldRecordParserException("Cannot index record", e);
        }
    }
}