209 lines
6.4 KiB
Java
209 lines
6.4 KiB
Java
|
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||
|
|
||
|
import org.apache.commons.io.IOUtils;
|
||
|
import org.apache.commons.lang3.StringUtils;
|
||
|
import org.apache.commons.logging.Log;
|
||
|
import org.apache.commons.logging.LogFactory;
|
||
|
import org.dom4j.DocumentException;
|
||
|
import org.dom4j.io.SAXReader;
|
||
|
import org.w3c.dom.Document;
|
||
|
import org.w3c.dom.NodeList;
|
||
|
import org.xml.sax.InputSource;
|
||
|
|
||
|
import javax.xml.parsers.DocumentBuilder;
|
||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||
|
import javax.xml.xpath.XPath;
|
||
|
import javax.xml.xpath.XPathConstants;
|
||
|
import javax.xml.xpath.XPathExpression;
|
||
|
import javax.xml.xpath.XPathFactory;
|
||
|
import java.io.*;
|
||
|
import java.net.URL;
|
||
|
import java.nio.charset.Charset;
|
||
|
import java.nio.charset.UnsupportedCharsetException;
|
||
|
import java.util.ArrayList;
|
||
|
import java.util.EnumSet;
|
||
|
import java.util.HashMap;
|
||
|
import java.util.List;
|
||
|
import java.util.zip.GZIPInputStream;
|
||
|
|
||
|
public class Utils {
|
||
|
// Shared Apache Commons Logging logger for the static helpers in this class.
private static final Log log = LogFactory.getLog(Utils.class);
|
||
|
|
||
|
public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
|
||
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||
|
DocumentBuilder builder = factory.newDocumentBuilder();
|
||
|
Document doc = builder.parse(new InputSource(new StringReader(xml)));
|
||
|
return Utils.collectAsStrings(doc, xpath);
|
||
|
}
|
||
|
|
||
|
public static List<String> collectAsStrings(File file, String xpath) throws Exception{
|
||
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||
|
DocumentBuilder builder = factory.newDocumentBuilder();
|
||
|
Document doc = builder.parse(file);
|
||
|
return Utils.collectAsStrings(doc, xpath);
|
||
|
}
|
||
|
|
||
|
public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
|
||
|
XPathFactory xPathfactory = XPathFactory.newInstance();
|
||
|
XPath path = xPathfactory.newXPath();
|
||
|
XPathExpression expr = path.compile(xpath);
|
||
|
NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
|
||
|
|
||
|
List<String> values = new ArrayList<>();
|
||
|
|
||
|
for (int i = 0; i < nodes.getLength(); i++)
|
||
|
values.add(nodes.item(i).getNodeValue());
|
||
|
|
||
|
return values;
|
||
|
}
|
||
|
|
||
|
public static void decompressGZipTo(File input, File output) throws Exception {
|
||
|
try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input))){
|
||
|
try (FileOutputStream out = new FileOutputStream(output)){
|
||
|
byte[] buffer = new byte[1024];
|
||
|
int len;
|
||
|
while((len = in.read(buffer)) != -1){
|
||
|
out.write(buffer, 0, len);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
|
||
|
{
|
||
|
String value = map.get(key);
|
||
|
if(value == null) return defaultValue;
|
||
|
return value;
|
||
|
}
|
||
|
|
||
|
public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
|
||
|
{
|
||
|
String value = map.get(key);
|
||
|
if(value == null) return defaultValue;
|
||
|
String[] splits = value.split(",");
|
||
|
List<String> curated = new ArrayList<>();
|
||
|
for(String item : splits){
|
||
|
if(item == null || item.trim().length() == 0) continue;
|
||
|
curated.add(item.trim());
|
||
|
}
|
||
|
return curated;
|
||
|
}
|
||
|
|
||
|
public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
|
||
|
{
|
||
|
String value = map.get(key);
|
||
|
if(value == null) return defaultValue;
|
||
|
try {
|
||
|
return Integer.parseInt(value);
|
||
|
} catch (NumberFormatException e) {
|
||
|
return defaultValue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
|
||
|
{
|
||
|
String value = map.get(key);
|
||
|
if(value == null) return defaultValue;
|
||
|
try {
|
||
|
return Long.parseLong(value);
|
||
|
} catch (NumberFormatException e) {
|
||
|
return defaultValue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
|
||
|
//EnumSet<E> values = EnumSet.allOf(defaultValue.getClass());
|
||
|
EnumSet<E> values = EnumSet.allOf(clazz);
|
||
|
String value = map.get(key);
|
||
|
if (value == null) return defaultValue;
|
||
|
for(E val : values){
|
||
|
if(!val.name().equalsIgnoreCase(value)) continue;
|
||
|
return val;
|
||
|
}
|
||
|
return defaultValue;
|
||
|
}
|
||
|
|
||
|
public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
|
||
|
String value = map.get(key);
|
||
|
if (value == null) return defaultValue;
|
||
|
return Boolean.parseBoolean(value);
|
||
|
}
|
||
|
|
||
|
public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
|
||
|
{
|
||
|
String value = map.get(key);
|
||
|
if(value == null) return defaultValue;
|
||
|
try {
|
||
|
return Charset.forName(value);
|
||
|
} catch (UnsupportedCharsetException e) {
|
||
|
return defaultValue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException {
|
||
|
int retry =0;
|
||
|
while(retry < retryCount) {
|
||
|
try {
|
||
|
return IOUtils.toString(endpoint, charset);
|
||
|
} catch (Exception ex) {
|
||
|
retry += 1;
|
||
|
if (retry < retryCount) {
|
||
|
log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds");
|
||
|
try {
|
||
|
Thread.sleep(waitBetweenRetriesMillis);
|
||
|
} catch (Exception e) {
|
||
|
}
|
||
|
}
|
||
|
else{
|
||
|
log.debug("problem accessing url " + endpoint + ". throwing");
|
||
|
throw ex;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
public static Boolean validateXml(String xml){
|
||
|
try {
|
||
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||
|
DocumentBuilder builder = factory.newDocumentBuilder();
|
||
|
InputSource is = new InputSource(new StringReader(xml));
|
||
|
builder.parse(is);
|
||
|
return true;
|
||
|
}catch(Exception ex){
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
|
||
|
|
||
|
int skipped = 0;
|
||
|
int count = 0;
|
||
|
|
||
|
for(String item : iterable) {
|
||
|
|
||
|
final org.dom4j.Document doc = new SAXReader().read(new StringReader(item));
|
||
|
|
||
|
if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) {
|
||
|
log.info(item);
|
||
|
String fileName = outDir + "/" + count++;
|
||
|
|
||
|
try(BufferedWriter w = new BufferedWriter(new FileWriter(fileName))) {
|
||
|
w.write(item);
|
||
|
}
|
||
|
log.info("wrote " + fileName);
|
||
|
} else {
|
||
|
skipped++;
|
||
|
}
|
||
|
if (skipped % 100 == 0) {
|
||
|
log.info("skipped so far " + skipped);
|
||
|
}
|
||
|
if (count % 100 == 0) {
|
||
|
log.info("stored so far " + count);
|
||
|
}
|
||
|
}
|
||
|
log.info(String.format("Done! skipped %s, stored %s", skipped, count));
|
||
|
}
|
||
|
}
|