package eu.dnetlib.data.collector.plugins.schemaorg; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.DocumentException; import org.dom4j.io.SAXReader; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathFactory; import java.io.*; import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.zip.GZIPInputStream; public class Utils { private static final Log log = LogFactory.getLog(Utils.class); public static List collectAsStrings(String xml, String xpath) throws Exception{ DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(new InputSource(new StringReader(xml))); return Utils.collectAsStrings(doc, xpath); } public static List collectAsStrings(File file, String xpath) throws Exception{ DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(file); return Utils.collectAsStrings(doc, xpath); } public static List collectAsStrings(Document doc, String xpath) throws Exception{ XPathFactory xPathfactory = XPathFactory.newInstance(); XPath path = xPathfactory.newXPath(); XPathExpression expr = path.compile(xpath); NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET); List values = new ArrayList<>(); for (int i = 0; i < nodes.getLength(); i++) values.add(nodes.item(i).getNodeValue()); return values; } public static void decompressGZipTo(File input, File output) throws Exception { try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input))){ try (FileOutputStream out = new FileOutputStream(output)){ byte[] buffer = new byte[1024]; int len; while((len = in.read(buffer)) != -1){ out.write(buffer, 0, len); } } } } public static String getAsString(HashMap map, String key, String defaultValue) { String value = map.get(key); if(value == null) return defaultValue; return value; } public static List getAsStringCsv(HashMap map, String key, List defaultValue) { String value = map.get(key); if(value == null) return defaultValue; String[] splits = value.split(","); List curated = new ArrayList<>(); for(String item : splits){ if(item == null || item.trim().length() == 0) continue; curated.add(item.trim()); } return curated; } public static int getAsInt(HashMap map, String key, int defaultValue) { String value = map.get(key); if(value == null) return defaultValue; try { return Integer.parseInt(value); } catch (NumberFormatException e) { return defaultValue; } } public static long getAsLong(HashMap map, String key, long defaultValue) { String value = map.get(key); if(value == null) return defaultValue; try { return Long.parseLong(value); } catch (NumberFormatException e) { return defaultValue; } } public static > E getAsEnum(HashMap map, String key, E defaultValue, Class clazz) { //EnumSet values = EnumSet.allOf(defaultValue.getClass()); EnumSet values = EnumSet.allOf(clazz); String value = map.get(key); if (value == null) return defaultValue; for(E val : values){ if(!val.name().equalsIgnoreCase(value)) continue; return val; } return defaultValue; } public static Boolean getAsBoolean(HashMap map, String key, Boolean defaultValue) { String value = map.get(key); if (value == null) return defaultValue; return Boolean.parseBoolean(value); } public static Charset getAsCharset(HashMap map, String key, Charset defaultValue) { String value = map.get(key); if(value == null) return defaultValue; try { return Charset.forName(value); } catch (UnsupportedCharsetException e) { return defaultValue; } } public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException { int retry =0; while(retry < retryCount) { try { return IOUtils.toString(endpoint, charset); } catch (Exception ex) { retry += 1; if (retry < retryCount) { log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds"); try { Thread.sleep(waitBetweenRetriesMillis); } catch (Exception e) { } } else{ log.debug("problem accessing url " + endpoint + ". throwing"); throw ex; } } } return null; } public static Boolean validateXml(String xml){ try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); InputSource is = new InputSource(new StringReader(xml)); builder.parse(is); return true; }catch(Exception ex){ return false; } } public static void writeFiles(final Iterable iterable, final String outDir) throws DocumentException, IOException { int skipped = 0; int count = 0; for(String item : iterable) { final org.dom4j.Document doc = new SAXReader().read(new StringReader(item)); if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) { log.info(item); String fileName = outDir + "/" + count++; try(BufferedWriter w = new BufferedWriter(new FileWriter(fileName))) { w.write(item); } log.info("wrote " + fileName); } else { skipped++; } if (skipped % 100 == 0) { log.info("skipped so far " + skipped); } if (count % 100 == 0) { log.info("stored so far " + count); } } log.info(String.format("Done! skipped %s, stored %s", skipped, count)); } }