dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/Utils.java

209 lines
6.4 KiB
Java

package eu.dnetlib.data.collector.plugins.schemaorg;
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Static helpers for the schema.org collector plugin: XPath extraction from XML, gzip
 * decompression, typed look-ups in a string-to-string parameter map, remote access with
 * retries, XML well-formedness checking and dumping of records to files.
 */
public class Utils {

    private static final Log log = LogFactory.getLog(Utils.class);

    /** SAX/Xerces feature names switched off so that parsing never dereferences external resources. */
    private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES = "http://xml.org/sax/features/external-general-entities";
    private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES = "http://xml.org/sax/features/external-parameter-entities";
    private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    /**
     * Parses the given XML text and evaluates an XPath expression against it.
     *
     * @param xml   the XML document as a string
     * @param xpath the XPath expression selecting a node-set
     * @return the node values of the selected nodes, in document order
     * @throws Exception if the XML cannot be parsed or the XPath cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(String xml, String xpath) throws Exception {
        final Document doc = newHardenedDocumentBuilder().parse(new InputSource(new StringReader(xml)));
        return collectAsStrings(doc, xpath);
    }

    /**
     * Parses the given XML file and evaluates an XPath expression against it.
     *
     * @param file  the XML file to parse
     * @param xpath the XPath expression selecting a node-set
     * @return the node values of the selected nodes, in document order
     * @throws Exception if the file cannot be parsed or the XPath cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(File file, String xpath) throws Exception {
        final Document doc = newHardenedDocumentBuilder().parse(file);
        return collectAsStrings(doc, xpath);
    }

    /**
     * Evaluates an XPath expression against an already parsed DOM document.
     *
     * @param doc   the DOM document
     * @param xpath the XPath expression; it must evaluate to a node-set
     * @return one entry per selected node holding {@code Node.getNodeValue()}; the entry is
     *         {@code null} for node types that have no value (for example element nodes), so
     *         expressions normally end in {@code text()} or select attributes
     * @throws Exception if the XPath cannot be compiled or evaluated
     */
    public static List<String> collectAsStrings(Document doc, String xpath) throws Exception {
        final XPathExpression expr = XPathFactory.newInstance().newXPath().compile(xpath);
        final NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        final int total = nodes.getLength();
        final List<String> values = new ArrayList<>(total);
        for (int i = 0; i < total; i++) {
            values.add(nodes.item(i).getNodeValue());
        }
        return values;
    }

    /**
     * Decompresses a gzip file into another file.
     *
     * @param input  the gzip-compressed source file
     * @param output the destination file; it is created or overwritten
     * @throws Exception if reading, decompressing or writing fails
     */
    public static void decompressGZipTo(File input, File output) throws Exception {
        // The raw file stream is its own resource: the GZIPInputStream constructor reads the
        // header and throws on non-gzip input, which would otherwise leak the file handle.
        try (FileInputStream raw = new FileInputStream(input);
             GZIPInputStream in = new GZIPInputStream(raw);
             FileOutputStream out = new FileOutputStream(output)) {
            final byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

    /**
     * Looks up a string parameter.
     *
     * @return the mapped value, or {@code defaultValue} when the key is absent or mapped to {@code null}
     */
    public static String getAsString(HashMap<String, String> map, String key, String defaultValue) {
        final String value = map.get(key);
        return value != null ? value : defaultValue;
    }

    /**
     * Looks up a comma-separated parameter and splits it into trimmed, non-empty items.
     *
     * @return the list of items (possibly empty), or {@code defaultValue} when the key is absent
     *         or mapped to {@code null}
     */
    public static List<String> getAsStringCsv(HashMap<String, String> map, String key, List<String> defaultValue) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        final List<String> curated = new ArrayList<>();
        for (String item : value.split(",")) {
            final String trimmed = item.trim();
            if (!trimmed.isEmpty()) {
                curated.add(trimmed);
            }
        }
        return curated;
    }

    /**
     * Looks up an integer parameter; surrounding whitespace is ignored.
     *
     * @return the parsed value, or {@code defaultValue} when the key is absent or the value is
     *         not a valid {@code int}
     */
    public static int getAsInt(HashMap<String, String> map, String key, int defaultValue) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * Looks up a long parameter; surrounding whitespace is ignored.
     *
     * @return the parsed value, or {@code defaultValue} when the key is absent or the value is
     *         not a valid {@code long}
     */
    public static long getAsLong(HashMap<String, String> map, String key, long defaultValue) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        try {
            return Long.parseLong(value.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * Looks up an enum parameter, matching constant names case-insensitively and ignoring
     * surrounding whitespace.
     *
     * @param clazz the enum type whose constants are candidates
     * @return the matching constant, or {@code defaultValue} when the key is absent or no
     *         constant matches
     */
    public static <E extends Enum<E>> E getAsEnum(HashMap<String, String> map, String key, E defaultValue, Class<E> clazz) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        final String wanted = value.trim();
        for (E candidate : EnumSet.allOf(clazz)) {
            if (candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return defaultValue;
    }

    /**
     * Looks up a boolean parameter; surrounding whitespace is ignored.
     *
     * @return {@code defaultValue} when the key is absent; otherwise {@code true} only if the
     *         value equals "true" ignoring case
     */
    public static Boolean getAsBoolean(HashMap<String, String> map, String key, Boolean defaultValue) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        return Boolean.parseBoolean(value.trim());
    }

    /**
     * Looks up a charset parameter by name; surrounding whitespace is ignored.
     *
     * @return the named charset, or {@code defaultValue} when the key is absent or the name is
     *         syntactically illegal or not supported by this JVM
     */
    public static Charset getAsCharset(HashMap<String, String> map, String key, Charset defaultValue) {
        final String value = map.get(key);
        if (value == null) {
            return defaultValue;
        }
        try {
            return Charset.forName(value.trim());
        } catch (IllegalArgumentException e) {
            // Covers both UnsupportedCharsetException and IllegalCharsetNameException.
            return defaultValue;
        }
    }

    /**
     * Downloads the content of a URL as a string, retrying on failure.
     *
     * @param retryCount               total number of attempts to make
     * @param waitBetweenRetriesMillis pause between two attempts; non-positive values mean no pause
     * @param endpoint                 the URL to read
     * @param charset                  the charset used to decode the response
     * @return the content, or {@code null} when {@code retryCount} is not positive (no attempt is made)
     * @throws IOException if the last attempt fails, or (as {@link InterruptedIOException}) if the
     *                     thread is interrupted while waiting for the next attempt
     */
    public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException {
        for (int attempt = 1; attempt <= retryCount; attempt++) {
            try {
                return IOUtils.toString(endpoint, charset);
            } catch (IOException | RuntimeException ex) {
                if (attempt >= retryCount) {
                    log.debug("problem accessing url " + endpoint + ". throwing", ex);
                    throw ex;
                }
                log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds", ex);
                if (waitBetweenRetriesMillis > 0) {
                    try {
                        Thread.sleep(waitBetweenRetriesMillis);
                    } catch (InterruptedException interrupted) {
                        // Keep the interrupt visible to the caller and stop retrying.
                        Thread.currentThread().interrupt();
                        final InterruptedIOException aborted =
                                new InterruptedIOException("interrupted while waiting to retry access to " + endpoint);
                        aborted.initCause(ex);
                        throw aborted;
                    }
                }
            }
        }
        return null;
    }

    /**
     * Checks whether a string is well-formed XML.
     *
     * @return {@code true} if the string parses without error, {@code false} on any failure
     *         (including a {@code null} argument)
     */
    public static Boolean validateXml(String xml) {
        try {
            newHardenedDocumentBuilder().parse(new InputSource(new StringReader(xml)));
            return true;
        } catch (Exception ex) {
            return false;
        }
    }

    /**
     * Writes every record that has a non-blank {@code /dataset/identifier} text (namespaces
     * ignored) to its own file in {@code outDir}; files are named with a running counter
     * starting at 0. Records without such an identifier are skipped.
     *
     * @param iterable the XML records
     * @param outDir   the target directory
     * @throws DocumentException if a record cannot be parsed
     * @throws IOException       if a file cannot be written
     */
    public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
        final SAXReader reader = newHardenedSaxReader();
        int skipped = 0;
        int count = 0;
        for (String item : iterable) {
            final org.dom4j.Document doc = reader.read(new StringReader(item));
            if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) {
                log.info(item);
                final String fileName = outDir + "/" + count++;
                // Explicit charset so the output does not depend on the platform default.
                // NOTE(review): assumes records are meant to be stored as UTF-8 — confirm.
                try (BufferedWriter w = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8))) {
                    w.write(item);
                }
                log.info("wrote " + fileName);
                // Progress is reported only when the counter has just reached a multiple of 100.
                if (count % 100 == 0) {
                    log.info("stored so far " + count);
                }
            } else {
                skipped++;
                if (skipped % 100 == 0) {
                    log.info("skipped so far " + skipped);
                }
            }
        }
        log.info(String.format("Done! skipped %s, stored %s", skipped, count));
    }

    /** Creates a DOM builder that does not resolve external entities or external DTDs. */
    private static DocumentBuilder newHardenedDocumentBuilder() throws ParserConfigurationException {
        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        setFeatureIfSupported(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        setFeatureIfSupported(factory, FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
        setFeatureIfSupported(factory, FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
        setFeatureIfSupported(factory, FEATURE_LOAD_EXTERNAL_DTD, false);
        return factory.newDocumentBuilder();
    }

    /** Sets a parser feature, tolerating parser implementations that do not know it. */
    private static void setFeatureIfSupported(DocumentBuilderFactory factory, String feature, boolean enabled) {
        try {
            factory.setFeature(feature, enabled);
        } catch (ParserConfigurationException e) {
            log.debug("XML parser does not support feature " + feature, e);
        }
    }

    /** Creates a dom4j reader that does not resolve external entities or external DTDs. */
    private static SAXReader newHardenedSaxReader() {
        final SAXReader reader = new SAXReader();
        final String[] features = {
                FEATURE_EXTERNAL_GENERAL_ENTITIES,
                FEATURE_EXTERNAL_PARAMETER_ENTITIES,
                FEATURE_LOAD_EXTERNAL_DTD
        };
        for (String feature : features) {
            try {
                reader.setFeature(feature, false);
            } catch (SAXException e) {
                log.debug("SAX parser does not support feature " + feature, e);
            }
        }
        return reader;
    }
}