package eu.dnetlib.dhp.utils;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;

import net.minidev.json.JSONArray;

import scala.collection.JavaConverters;
import scala.collection.Seq;

public class DHPUtils {

    private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
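
    /** Converts a Java {@link List} into a Scala {@link Seq}, as expected by Spark's Scala-facing APIs. */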
    public static Seq<String> toSeq(List<String> list) {
        return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
    }
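
    /**
     * Computes the MD5 digest of the given string and returns it as a lowercase hex string,
     * or null when the digest cannot be computed.
     */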
    public static String md5(final String s) {
        try {
            final MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(s.getBytes(StandardCharsets.UTF_8));
            return new String(Hex.encodeHex(md.digest()));
        } catch (final Exception e) {
            log.error("Error creating id from {}", s, e);
            return null;
        }
    }
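
    /**
     * Builds an identifier of the form {@code nsPrefix::md5(originalId)},
     * e.g. {@code doi::900f...} (illustrative prefix and hash).
     */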
    public static String generateIdentifier(final String originalId, final String nsPrefix) {
        return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
    }
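
    /**
     * GZIP-compresses the input and returns it as a Base64-encoded string, or null on failure.
     * Inverse of {@link #decompressString(String)}.
     */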
    public static String compressString(final String input) {
        try (ByteArrayOutputStream out = new ByteArrayOutputStream();
            Base64OutputStream b64os = new Base64OutputStream(out)) {
            GZIPOutputStream gzip = new GZIPOutputStream(b64os);
            gzip.write(input.getBytes(StandardCharsets.UTF_8));
            // closing the GZIP stream cascades to the Base64 stream,
            // flushing the trailing encoder state into the byte buffer before we read it
            gzip.close();
            return out.toString();
        } catch (Exception e) {
            return null;
        }
    }
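
    /** Decodes a Base64 string and GZIP-decompresses it back to the original text, or null on failure. */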
    public static String decompressString(final String input) {
        byte[] byteArray = Base64.decodeBase64(input.getBytes());
        int len;
        try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray));
            ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) {
            byte[] buffer = new byte[1024];
            while ((len = gis.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
            return bos.toString();
        } catch (Exception e) {
            return null;
        }
    }
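
    /**
     * Evaluates a JsonPath expression against a JSON document and returns the result as a string:
     * the matched value itself, the first element of a matched array, or "" when the path
     * cannot be resolved.
     */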
    public static String getJPathString(final String jsonPath, final String json) {
        try {
            Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String)
                return (String) o;
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
                return (String) ((JSONArray) o).get(0);
            return o.toString();
        } catch (Exception e) {
            return "";
        }
    }
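
    /** Shared Jackson mapper for JSON (de)serialization. */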
    public static final ObjectMapper MAPPER = new ObjectMapper();
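
    /** Writes the given content, UTF-8 encoded, to an HDFS file at the given path. */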
    public static void writeHdfsFile(final Configuration conf, final String content, final String path)
        throws IOException {

        log.info("writing file {}, size {}", path, content.length());
        try (FileSystem fs = FileSystem.get(conf);
            BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
            os.write(content.getBytes(StandardCharsets.UTF_8));
            os.flush();
        }
    }
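
    /**
     * Reads the HDFS file at the given path into a string; throws
     * {@link FileNotFoundException} when the path does not exist.
     */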
    public static String readHdfsFile(Configuration conf, String path) throws IOException {
        log.info("reading file {}", path);

        try (FileSystem fs = FileSystem.get(conf)) {
            final Path p = new Path(path);
            if (!fs.exists(p)) {
                throw new FileNotFoundException(path);
            }
            // read through try-with-resources so the HDFS input stream is always closed
            try (InputStream is = fs.open(p)) {
                return IOUtils.toString(is, StandardCharsets.UTF_8);
            }
        }
    }
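
    /** Reads the HDFS file at the given path and deserializes its JSON content into an instance of {@code clazz}. */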
    public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
        return MAPPER.readValue(readHdfsFile(conf, path), clazz);
    }
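
    /** Persists the dataset to the target path as parquet, overwriting any existing output. */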
    public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
        log.info("saving dataset in: {}", targetPath);
        mdstore
            .write()
            .mode(SaveMode.Overwrite)
            .format("parquet")
            .save(targetPath);
    }
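
    /** Creates a Hadoop {@link Configuration} with {@code fs.defaultFS} pointing at the given name node. */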
    public static Configuration getHadoopConfiguration(String nameNode) {
        Configuration conf = new Configuration();
        // set the FileSystem URI
        conf.set("fs.defaultFS", nameNode);
        // bind the hdfs:// and file:// schemes explicitly: shaded/assembled jars (e.g. built with Maven)
        // may lose the META-INF service entries that normally register these implementations
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        System.setProperty("hadoop.home.dir", "/");
        return conf;
    }
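
    /**
     * Exposes the given key/value pairs to the enclosing Oozie workflow by writing them to the
     * properties file named by the {@code oozie.action.output.properties} system property.
     */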
    public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
        File file = new File(System.getProperty("oozie.action.output.properties"));
        Properties props = new Properties();
        report.forEach(props::setProperty);

        try (OutputStream os = new FileOutputStream(file)) {
            props.store(os, "");
        }
    }
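
    /** Convenience overload of {@link #populateOOZIEEnv(Map)} for a single key/value pair. */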
    public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
        Map<String, String> report = Maps.newHashMap();
        report.put(paramName, value);

        populateOOZIEEnv(report);
    }
}