package eu.dnetlib.data.collector.plugins.httpfilename;

import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Created by miriam on 04/05/2018.
 *
 * <p>Iterable over the records collected by crawling an HTTP folder listing. The constructor
 * starts a background thread that walks the listing from the start url and pushes one string per
 * collected record into a bounded queue (capacity 100); {@link #iterator()} hands that queue to a
 * {@code HttpWithFileNameCollectorIterator}. The end of the crawl is signalled by enqueueing
 * {@code HttpWithFileNameCollectorIterator.TERMINATOR}.
 */
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {

    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);

    /** Format of the placeholder record returned when a json record cannot be converted to xml. */
    private static final String JUNK = "%sJUNK";

    public static final String APP_JSON = "application/json";
    public static final String APP_XML = "application/xml";
    public static final String TEXT_HTML = "text/html";

    /** Hand-over buffer between the crawling thread (producer) and the iterator (consumer). */
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);

    /** Semicolon separated list of strings; a metadata file containing any of them is discarded. */
    private final String filterParam;

    /** Number of non-empty metadata files downloaded so far (written by the crawling thread). */
    int total = 0;

    /** Number of metadata files discarded because of the filter (written by the crawling thread). */
    int filtered = 0;

    /**
     * Starts crawling immediately in a new thread.
     *
     * @param startUrl url of the first folder/record to visit; when null or empty nothing is
     *                 crawled and only the terminator is enqueued
     * @param filter   semicolon separated filter strings, may be null or empty (no filtering)
     */
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter) {
        this.filterParam = filter;
        Thread ft = new Thread(new FillMetaQueue(startUrl));
        ft.start();
    }

    /**
     * @return an iterator reading from the queue filled by the crawling thread
     */
    @Override
    public Iterator<String> iterator() {
        return new HttpWithFileNameCollectorIterator(queue);
    }

    /**
     * Producer side of the collection: visits urls breadth-first, giving precedence to the
     * metadata files already discovered over the urls still to be explored.
     */
    private class FillMetaQueue implements Runnable {

        final Connector c = new Connector();

        /** Urls ending in .json/.xml found in folder listings, still to be downloaded. */
        private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());

        /** Every other url found (or the start url), still to be visited. */
        private final List<String> urls = Collections.synchronizedList(new ArrayList<String>());

        public FillMetaQueue(String startUrl) {
            if (startUrl != null && !startUrl.isEmpty()) {
                urls.add(startUrl);
            }
        }

        /**
         * Drains the two work lists, enqueueing one record per collected metadata file, then
         * enqueues the terminator.
         *
         * <p>If the thread is interrupted while waiting for room in the queue the crawl stops:
         * the terminator is offered without blocking and the interrupt flag is restored.
         *
         * @throws IllegalStateException if the thread is interrupted while enqueueing the
         *                               terminator at the end of a complete crawl
         */
        public void fillQueue() {
            try {
                while (!metas.isEmpty() || !urls.isEmpty()) {
                    log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " + queue.size());
                    if (!metas.isEmpty()) {
                        collectMetadata(metas.remove(0));
                    } else {
                        collectUrl(urls.remove(0));
                    }
                }
            } catch (InterruptedException e) {
                signalEndAfterInterrupt();
                return;
            }

            try {
                queue.put(HttpWithFileNameCollectorIterator.TERMINATOR);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
            }
        }

        /** Best-effort end-of-stream signal used when the crawl has been interrupted. */
        private void signalEndAfterInterrupt() {
            // offer(E) neither blocks nor reacts to interruption, so it can be used here
            if (!queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR)) {
                log.warn("crawl interrupted and queue full: terminator not enqueued");
            }
            // let whoever owns this thread see that it was interrupted
            Thread.currentThread().interrupt();
        }

        /**
         * Downloads the given url with the shared connector.
         *
         * @return true only when the request completed without exception and the connector
         *         reports an ok status
         */
        private boolean fetch(String url) {
            try {
                c.get(url);
            } catch (CollectorServiceException e) {
                log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
                // NOTE(review): Connector is not visible from this file; after a failed get its
                // status/response are assumed not to describe this url, so they are not read.
                return false;
            }
            return c.isStatusOk();
        }

        /** Downloads a metadata file and enqueues it unless it is empty or matches the filter. */
        private void collectMetadata(String url) throws InterruptedException {
            if (!fetch(url)) {
                return;
            }
            String ret = c.getResponse();
            if (ret == null || ret.isEmpty()) {
                return;
            }
            if (containsFilter(ret)) {
                filtered++;
            } else {
                enqueue(addFilePath(ret, url, url.endsWith(".json")), url);
            }
            total++;
        }

        /** Visits a url: html pages are scanned for links, json/xml responses are enqueued. */
        private void collectUrl(String url) throws InterruptedException {
            if (!fetch(url)) {
                return;
            }
            if (c.responseTypeContains(TEXT_HTML)) {
                recurFolder(c.getResponse(), url);
            } else if (c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)) {
                enqueue(addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON)), url);
            }
        }

        /**
         * Blocks until the element can be added to the queue.
         *
         * @throws InterruptedException if interrupted while waiting; the failure is logged first
         */
        private void enqueue(String element, String url) throws InterruptedException {
            if (element == null) {
                // ArrayBlockingQueue rejects null elements with a NullPointerException
                log.warn("skipping null element for url " + url);
                return;
            }
            try {
                queue.put(element);
            } catch (InterruptedException e) {
                log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage());
                throw e;
            }
        }

        /**
         * @return true when a filter is configured and the metadata contains at least one of its
         *         (non-empty) semicolon separated tokens
         */
        private boolean containsFilter(String meta) {
            if (filterParam == null || filterParam.isEmpty()) {
                return false;
            }
            for (String item : filterParam.split(";")) {
                // an empty token (e.g. ";x" or "a;;b") would match every record
                if (!item.isEmpty() && meta.contains(item)) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Adds to the record a {@code downloadFileUrl} entry pointing at the pdf: the record url
         * with "metadata" replaced by "pdf" and the .json/.xml extension replaced by .pdf.
         * Json records are then wrapped in a {@code resource} object and converted to xml.
         *
         * @param meta   the downloaded record
         * @param url    the url the record was downloaded from
         * @param isJson true to treat the record as json, false to treat it as xml
         * @return the enriched record as xml; the record without the extra entry when the url
         *         lacks the expected extension; a junk placeholder when the json cannot be
         *         converted to xml
         */
        private String addFilePath(String meta, String url, boolean isJson) {
            String path = url.replace("metadata", "pdf");

            try {
                if (isJson) {
                    // drop the closing brace and append the new property
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
                } else {
                    // NOTE(review): the markup literals of this branch were lost in the copy of
                    // the file under review; they have been reconstructed from the surviving
                    // statement structure and the json branch. Verify against version control.
                    if (meta.contains("<!DOCTYPE")) {
                        // discard everything up to and including the doctype declaration
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
                        meta = meta.substring(meta.indexOf(">") + 1);
                    }
                    // insert the new element right before the last closing tag
                    int index = meta.lastIndexOf("</");
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
                }
            } catch (Exception ex) {
                // typically an index failure because the url has no .json/.xml extension
                log.info("not file with extension .json or .xml");
            }

            if (isJson) {
                try {
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
                } catch (Exception e) {
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
                    final String junk = String.format(JUNK, url);
                    log.warn("returning " + junk);
                    return junk;
                }
            }
            return meta;
        }

        /**
         * Scans an html page for anchors and schedules their targets: hrefs ending in .json or
         * .xml go to the metadata list, every other href to the url list. Anchors whose text is
         * "../" are ignored. The href is appended to the page url as is.
         */
        private void recurFolder(String text, String url) {
            if (text == null) {
                return;
            }
            Document doc = Jsoup.parse(text);
            Elements links = doc.select("a");
            for (Element e : links) {
                if (e.text().equals("../")) {
                    continue;
                }
                String file = e.attr("href");
                if (file.endsWith(".json") || file.endsWith(".xml")) {
                    metas.add(url + file);
                } else {
                    urls.add(url + file);
                }
            }
        }

        @Override
        public void run() {
            fillQueue();
        }
    }
}