182 lines
5.1 KiB
Java
182 lines
5.1 KiB
Java
package eu.dnetlib.data.collector.plugins.projects.gtr2;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.StringWriter;
|
|
import java.util.concurrent.*;
|
|
|
|
import com.ximpleware.AutoPilot;
|
|
import com.ximpleware.VTDGen;
|
|
import com.ximpleware.VTDNav;
|
|
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.commons.lang3.*;
|
|
|
|
public class Gtr2Helper {
|
|
|
|
private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM
|
|
|
|
private VTDNav mainVTDNav;
|
|
private AutoPilot mainAutoPilot;
|
|
private StringWriter writer;
|
|
private HttpConnector connector;
|
|
//private BlockingQueue<String> fragment = new ArrayBlockingQueue<String>(20);
|
|
|
|
public String processProject(final VTDNav vn, final String namespaces) throws Exception {
|
|
//log.debug("Processing project at "+projectURL);
|
|
writer = new StringWriter();
|
|
mainVTDNav = vn;
|
|
mainAutoPilot = new AutoPilot(mainVTDNav);
|
|
writer.write("<doc " + namespaces + ">");
|
|
writeFragment(mainVTDNav);
|
|
|
|
mainAutoPilot.selectXPath("//link[@rel='FUND']");
|
|
ExecutorService es = Executors.newFixedThreadPool(5);
|
|
|
|
while (mainAutoPilot.evalXPath() != -1) {
|
|
Thread t = new Thread(new ProcessFunder(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
|
|
es.execute(t);
|
|
}
|
|
|
|
mainAutoPilot.resetXPath();
|
|
mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']");
|
|
while (mainAutoPilot.evalXPath() != -1) {
|
|
Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
|
|
new String[] { "<ld-org>", "</ld-org>" }));
|
|
es.execute(t);
|
|
}
|
|
mainAutoPilot.resetXPath();
|
|
mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']");
|
|
while (mainAutoPilot.evalXPath() != -1) {
|
|
Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
|
|
new String[] { "<pp-org>","</pp-org>" }));
|
|
es.execute(t);
|
|
}
|
|
mainAutoPilot.resetXPath();
|
|
|
|
mainAutoPilot.selectXPath(".//link[@rel='PI_PER']");
|
|
while (mainAutoPilot.evalXPath() != -1) {
|
|
Thread t = new Thread(new PiPer(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
|
|
es.execute(t);
|
|
}
|
|
es.shutdown();
|
|
log.debug("Waiting threads");
|
|
es.awaitTermination(10, TimeUnit.MINUTES);
|
|
|
|
log.debug("Finished writing project");
|
|
writer.write("</doc>");
|
|
writer.close();
|
|
|
|
return writer.toString();
|
|
}
|
|
|
|
private VTDNav setNavigator(final String httpUrl) {
|
|
VTDGen vg_tmp = new VTDGen();
|
|
connector = new HttpConnector();
|
|
try {
|
|
byte[] bytes = connector.getInputSource(httpUrl).getBytes("UTF-8");
|
|
vg_tmp.setDoc(bytes);
|
|
vg_tmp.parse(false);
|
|
//vg_tmp.parseHttpUrl(httpUrl, false);
|
|
return vg_tmp.getNav();
|
|
}catch (Throwable e){
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {
|
|
|
|
AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav);
|
|
ap_tmp.selectXPath(xPath);
|
|
return ap_tmp.evalXPath();
|
|
}
|
|
|
|
private void writeFragment(final VTDNav nav) throws Exception {
|
|
ByteArrayOutputStream b = new ByteArrayOutputStream();
|
|
nav.dumpFragment(b);
|
|
String ret = b.toString();
|
|
b.reset();
|
|
writer.write(ret);
|
|
}
|
|
|
|
private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag, final String attrName) throws Exception {
|
|
|
|
int nav_res = evalXpath(vn, xPath);
|
|
if (nav_res != -1) {
|
|
String tmp = xmlOpenTag;
|
|
if (attrName != null) tmp += (vn.toNormalizedString(vn.getAttrVal(attrName)));
|
|
else
|
|
tmp += (StringEscapeUtils.escapeXml11(vn.toNormalizedString(vn.getText())));
|
|
tmp += (xmlCloseTag);
|
|
writer.write(tmp);
|
|
}
|
|
}
|
|
|
|
private class PiPer implements Runnable {
|
|
|
|
private VTDNav vn;
|
|
|
|
public PiPer(String httpURL) {
|
|
vn = setNavigator(httpURL);
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
try {
|
|
writeFragment(vn);
|
|
} catch (Throwable e) {log.debug("Eccezione in PiPer " + e.getMessage());}
|
|
|
|
}
|
|
}
|
|
|
|
private class Org implements Runnable {
|
|
|
|
private String[] tags;
|
|
private VTDNav vn;
|
|
|
|
public Org(final String httpURL, final String[] tags) {
|
|
vn = setNavigator(httpURL);
|
|
this.tags = tags;
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
try {
|
|
writeNewTagAndInfo(vn, "//name", tags[0]+"<name>", "</name>", null);
|
|
vn.toElement(VTDNav.ROOT);
|
|
writeNewTagAndInfo(vn, "//country", "<country>", "</country>", null);
|
|
vn.toElement(VTDNav.ROOT);
|
|
writeNewTagAndInfo(vn, ".", "<id>", "</id>"+tags[1], "id");
|
|
} catch (Throwable e) {
|
|
log.debug("Eccezione in Org " + e.getMessage());
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
private class ProcessFunder implements Runnable {
|
|
|
|
private VTDNav vn;
|
|
|
|
public ProcessFunder(final String httpURL) {
|
|
vn = setNavigator(httpURL);
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
|
|
try {
|
|
AutoPilot ap = new AutoPilot(vn);
|
|
writeFragment(vn);
|
|
ap.selectXPath(".//link[@rel='FUNDER']");
|
|
VTDNav tmp_vn;
|
|
while (ap.evalXPath() != -1) {
|
|
tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href")));
|
|
writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null);
|
|
}
|
|
} catch (Throwable e) {log.debug("Eccezione in Funder" + e.getMessage());}
|
|
}
|
|
|
|
}
|
|
}
|