/*
 * dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/projects/gtr2/Gtr2Helper.java
 *
 * 182 lines, 5.1 KiB, Java
 */

package eu.dnetlib.data.collector.plugins.projects.gtr2;
import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.util.concurrent.*;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.lang3.*;
/**
 * Builds one enriched XML document for a gtr2 project record.
 *
 * <p>The project fragment is copied as-is into a {@code <doc>} wrapper, then the resources
 * referenced by its {@code link} elements (funds and their funders, lead / participant
 * organisations, principal investigators) are appended to the same document by worker tasks.</p>
 *
 * <p>Every worker task assembles its complete output locally and appends it to the shared buffer
 * with a single call, so fragments produced by different tasks never interleave. The relative
 * order of the appended fragments is not deterministic.</p>
 */
public class Gtr2Helper {

    private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM

    /** Charset used to encode downloaded documents before parsing and to decode dumped fragments. */
    private static final String FRAGMENT_CHARSET = "UTF-8";

    /** Size of the per-project worker pool. */
    private static final int WORKER_THREADS = 5;

    /** Maximum time to wait for the worker tasks of one project. */
    private static final long WORKERS_TIMEOUT_MINUTES = 10;

    /**
     * Produces the enriched XML for the project the navigator is currently positioned on.
     *
     * @param vn         navigator positioned on the project element; its cursor is moved while the
     *                   link elements are scanned
     * @param namespaces text inserted verbatim inside the opening {@code <doc ...>} tag
     * @return the {@code <doc>} document as a string; it only contains the fragments of the tasks
     *         that completed before the timeout
     * @throws Exception if the project fragment cannot be dumped, a link has no usable
     *                   {@code href}, or the calling thread is interrupted while waiting
     */
    public String processProject(final VTDNav vn, final String namespaces) throws Exception {
        // Local (not a field) so that a task outliving this call can never write into the
        // document of a later call. StringBuffer makes every single append atomic.
        final StringBuffer out = new StringBuffer();
        out.append("<doc " + namespaces + ">");
        out.append(dumpFragment(vn));

        final ExecutorService es = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            final AutoPilot ap = new AutoPilot(vn);

            // NOTE(review): this first expression is absolute ("//") while the following ones are
            // relative (".//") - confirm this is intended when vn holds more than one project.
            ap.selectXPath("//link[@rel='FUND']");
            while (ap.evalXPath() != -1) {
                es.execute(new ProcessFunder(hrefOf(vn), out));
            }

            ap.resetXPath();
            ap.selectXPath(".//link[@rel='LEAD_ORG']");
            while (ap.evalXPath() != -1) {
                es.execute(new Org(hrefOf(vn), new String[] { "<ld-org>", "</ld-org>" }, out));
            }

            ap.resetXPath();
            ap.selectXPath(".//link[@rel='PP_ORG']");
            while (ap.evalXPath() != -1) {
                es.execute(new Org(hrefOf(vn), new String[] { "<pp-org>", "</pp-org>" }, out));
            }

            ap.resetXPath();
            ap.selectXPath(".//link[@rel='PI_PER']");
            while (ap.evalXPath() != -1) {
                es.execute(new PiPer(hrefOf(vn), out));
            }

            es.shutdown();
            log.debug("Waiting threads");
            if (!es.awaitTermination(WORKERS_TIMEOUT_MINUTES, TimeUnit.MINUTES)) {
                log.warn("Timed out waiting for the project tasks, the document may be incomplete");
            }
        } finally {
            // Also reached when scanning the links fails or the wait is interrupted/timed out:
            // never leave the pool threads behind.
            if (!es.isTerminated()) {
                es.shutdownNow();
            }
        }
        log.debug("Finished writing project");
        // Close the document on a snapshot: a straggler task appending later cannot end up
        // after the closing tag.
        return out.toString() + "</doc>";
    }

    /** Returns the value of the {@code href} attribute of the element the navigator is on. */
    private String hrefOf(final VTDNav nav) throws Exception {
        return nav.toNormalizedString(nav.getAttrVal("href"));
    }

    /**
     * Downloads and parses the document at the given URL.
     *
     * @param httpUrl address of the XML resource
     * @return a navigator on the parsed document, or {@code null} if download or parsing failed
     */
    private VTDNav setNavigator(final String httpUrl) {
        try {
            // The connector is local: this method is invoked concurrently by the worker tasks.
            final HttpConnector connector = new HttpConnector();
            final byte[] bytes = connector.getInputSource(httpUrl).getBytes(FRAGMENT_CHARSET);
            final VTDGen generator = new VTDGen();
            generator.setDoc(bytes);
            generator.parse(false);
            return generator.getNav();
        } catch (Exception e) {
            log.warn("Unable to load " + httpUrl, e);
            return null;
        }
    }

    /**
     * Evaluates the XPath once; on a match the navigator cursor is left on the matched node.
     *
     * @return the index returned by the first evaluation, or -1 when nothing matches
     */
    private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {
        final AutoPilot pilot = new AutoPilot(fragmentVTDNav);
        pilot.selectXPath(xPath);
        return pilot.evalXPath();
    }

    /**
     * Serialises the fragment the navigator is currently on.
     *
     * <p>Documents loaded by {@link #setNavigator(String)} are always UTF-8 encoded; the navigator
     * handed to {@link #processProject(VTDNav, String)} is assumed to be UTF-8 as well - TODO
     * confirm against the caller.</p>
     */
    private String dumpFragment(final VTDNav nav) throws Exception {
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        nav.dumpFragment(buffer);
        return buffer.toString(FRAGMENT_CHARSET);
    }

    /**
     * Builds {@code openTag + value + closeTag} for the first node matching the XPath.
     *
     * @param attrName when not {@code null} the value is read from this attribute of the matched
     *                 element, otherwise from its text
     * @return the XML-escaped, tagged value, or an empty string when the node or value is missing
     */
    private String taggedValue(final VTDNav nav, final String xPath, final String openTag, final String closeTag,
            final String attrName) throws Exception {
        if (evalXpath(nav, xPath) == -1) {
            return "";
        }
        final int valueIndex = (attrName != null) ? nav.getAttrVal(attrName) : nav.getText();
        if (valueIndex == -1) {
            return "";
        }
        return openTag + StringEscapeUtils.escapeXml11(nav.toNormalizedString(valueIndex)) + closeTag;
    }

    /** Appends the whole document of a principal investigator. */
    private class PiPer implements Runnable {

        private final VTDNav vn;
        private final StringBuffer out;

        // NOTE(review): the download happens here, on the submitting thread, exactly as before;
        // only the serialisation runs in the pool.
        public PiPer(final String httpURL, final StringBuffer out) {
            this.vn = setNavigator(httpURL);
            this.out = out;
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // the failure has already been logged by setNavigator
            }
            try {
                out.append(dumpFragment(vn));
            } catch (Exception e) {
                log.warn("Eccezione in PiPer " + e.getMessage(), e);
            }
        }
    }

    /** Appends name, country and id of an organisation, wrapped in the given pair of tags. */
    private class Org implements Runnable {

        private final String[] tags;
        private final VTDNav vn;
        private final StringBuffer out;

        public Org(final String httpURL, final String[] tags, final StringBuffer out) {
            this.vn = setNavigator(httpURL);
            this.tags = tags;
            this.out = out;
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // the failure has already been logged by setNavigator
            }
            final StringBuilder content = new StringBuilder();
            try {
                content.append(taggedValue(vn, "//name", "<name>", "</name>", null));
                vn.toElement(VTDNav.ROOT); // the lookup moved the cursor: go back to the root
                content.append(taggedValue(vn, "//country", "<country>", "</country>", null));
                vn.toElement(VTDNav.ROOT);
                content.append(taggedValue(vn, ".", "<id>", "</id>", "id"));
            } catch (Exception e) {
                log.warn("Eccezione in Org " + e.getMessage(), e);
            }
            // The wrapper is always balanced and written with one append, whatever was found.
            if (content.length() > 0) {
                out.append(tags[0] + content + tags[1]);
            }
        }
    }

    /** Appends the whole fund document followed by the name of each funder it links to. */
    private class ProcessFunder implements Runnable {

        private final VTDNav vn;
        private final StringBuffer out;

        public ProcessFunder(final String httpURL, final StringBuffer out) {
            this.vn = setNavigator(httpURL);
            this.out = out;
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // the failure has already been logged by setNavigator
            }
            final StringBuilder content = new StringBuilder();
            try {
                content.append(dumpFragment(vn));
                final AutoPilot ap = new AutoPilot(vn);
                ap.selectXPath(".//link[@rel='FUNDER']");
                while (ap.evalXPath() != -1) {
                    final VTDNav funderNav = setNavigator(hrefOf(vn));
                    if (funderNav == null) {
                        continue; // funder not available: keep going with the other ones
                    }
                    content.append(taggedValue(funderNav, "//name", "<funder> <name>", "</name></funder>", null));
                }
            } catch (Exception e) {
                log.warn("Eccezione in Funder" + e.getMessage(), e);
            }
            // Whatever was collected before a failure is still emitted, in one piece.
            if (content.length() > 0) {
                out.append(content.toString());
            }
        }
    }
}