package eu.dnetlib.dhp.utils;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import net.minidev.json.JSONArray;
import scala.collection.JavaConverters;
import scala.collection.Seq;

/**
 * Utility methods shared across the dnet-hadoop modules: hashing, identifier generation,
 * vocabulary lookups, HDFS and Spark I/O helpers, mdstore enumeration and Oozie support.
 */
public class DHPUtils {

    private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);

    private DHPUtils() {
    }

    /**
     * Converts a Java {@link List} of strings into a Scala {@link Seq}.
     *
     * @param list the list to convert
     * @return the equivalent Scala sequence
     */
    public static Seq<String> toSeq(List<String> list) {
        return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
    }
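
    // Usage sketch (illustrative values), for bridging a Java collection into Scala code
    // that expects a Seq:
    //
    //   Seq<String> paths = DHPUtils.toSeq(Arrays.asList("/data/a", "/data/b"));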

    /**
     * Computes the MD5 hash of the given string, returning it as a 32-character lowercase
     * hex string.
     *
     * @param s the string to hash
     * @return the hex-encoded digest, or null if the digest could not be computed
     */
    public static String md5(final String s) {
        try {
            final MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(s.getBytes(StandardCharsets.UTF_8));
            return new String(Hex.encodeHex(md.digest()));
        } catch (final Exception e) {
            log.error("Error creating id from {}", s, e);
            return null;
        }
    }
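
    // Example (standard MD5 test vector):
    //
    //   DHPUtils.md5("abc"); // -> "900150983cd24fb0d6963f7d28e17f72"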

    private static Pair<Qualifier, Qualifier> searchTypeInVocabularies(final String aType,
        final VocabularyGroup vocabularies) {
        if (StringUtils.isNotBlank(aType)) {
            final Qualifier typeQualifier = vocabularies
                .getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, aType);
            if (typeQualifier != null)
                return new ImmutablePair<>(
                    typeQualifier,
                    vocabularies
                        .getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid()));
        }
        return null;
    }

    /**
     * Retrieves the OAF type pair (instance type, result typology) from the vocabularies.
     *
     * This method tries to find the correct OAF general and instance type from the
     * vocabularies, searching in this order:
     * 1 - search a vocabulary synonym from subResourceType
     * 2 - search a vocabulary synonym from otherResourceType
     * 3 - search a vocabulary synonym from resourceTypeGeneral
     *
     * @param resourceTypeGeneral the resource type general
     * @param subResourceType     the sub resource type
     * @param otherResourceType   the other resource type
     * @param vocabularies        the vocabularies
     * @return the pair of qualifiers, or null when no match is found
     */
    public static Pair<Qualifier, Qualifier> retrieveOAFTypeFromVocabulary(final String resourceTypeGeneral,
        final String subResourceType, final String otherResourceType, final VocabularyGroup vocabularies) {

        if (StringUtils.isNotBlank(subResourceType)) {
            final Pair<Qualifier, Qualifier> result = searchTypeInVocabularies(subResourceType, vocabularies);
            if (result != null)
                return result;
        }

        if (StringUtils.isNotBlank(otherResourceType)) {
            final Pair<Qualifier, Qualifier> result = searchTypeInVocabularies(otherResourceType, vocabularies);
            if (result != null)
                return result;
        }

        if (StringUtils.isNotBlank(resourceTypeGeneral)) {
            return searchTypeInVocabularies(resourceTypeGeneral, vocabularies);
        }

        return null;
    }
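
    // Usage sketch (the type labels are hypothetical; actual matches depend on the
    // configured VocabularyGroup):
    //
    //   Pair<Qualifier, Qualifier> types = DHPUtils
    //       .retrieveOAFTypeFromVocabulary("Text", "Journal Article", null, vocabularies);
    //   // left: the instance type resolved via ModelConstants.DNET_PUBLICATION_RESOURCE
    //   // right: the result typology resolved via ModelConstants.DNET_RESULT_TYPOLOGIES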

    /**
     * Retrieves from the metadata store manager application the list of paths associated with the
     * mdstores characterized by the given format, layout and interpretation.
     *
     * @param mdstoreManagerUrl the URL of the mdstore manager service
     * @param format            the mdstore format
     * @param layout            the mdstore layout
     * @param interpretation    the mdstore interpretation
     * @param includeEmpty      whether to include empty mdstores
     * @return the set of HDFS paths
     * @throws IOException in case of HTTP communication issues
     */
    public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
        final String format,
        final String layout,
        final String interpretation,
        boolean includeEmpty) throws IOException {
        final String url = mdstoreManagerUrl + "/mdstores/";
        final ObjectMapper objectMapper = new ObjectMapper();

        final HttpGet req = new HttpGet(url);

        try (final CloseableHttpClient client = HttpClients.createDefault()) {
            try (final CloseableHttpResponse response = client.execute(req)) {
                final String json = IOUtils.toString(response.getEntity().getContent(), StandardCharsets.UTF_8);
                final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
                return Arrays
                    .stream(mdstores)
                    .filter(md -> md.getFormat().equalsIgnoreCase(format))
                    .filter(md -> md.getLayout().equalsIgnoreCase(layout))
                    .filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
                    .filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
                    .filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
                    .filter(md -> includeEmpty || md.getSize() > 0)
                    .map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
                    .collect(Collectors.toSet());
            }
        }
    }
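
    // Usage sketch (the service URL and the format/layout/interpretation triple are
    // illustrative):
    //
    //   Set<String> paths = DHPUtils
    //       .mdstorePaths("http://services.example.org/mdstoremanager", "ODF", "store", "cleaned", false);
    //   // e.g. { "/data/md-1/v-3/store", ... }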

    /**
     * Generates an identifier in the form {@code nsPrefix::md5(originalId)}.
     *
     * @param originalId the original id
     * @param nsPrefix   the namespace prefix
     * @return the generated identifier
     */
    public static String generateIdentifier(final String originalId, final String nsPrefix) {
        return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
    }
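
    // Example (the namespace prefix is illustrative):
    //
    //   DHPUtils.generateIdentifier("oai:repo.example.org:1234", "od______1234");
    //   // -> "od______1234::" + md5("oai:repo.example.org:1234")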

    /**
     * Generates an unresolved identifier in the form {@code unresolved::pid::pidtype},
     * normalizing the pid value first.
     *
     * @param pid     the pid value
     * @param pidType the pid type
     * @return the unresolved identifier
     */
    public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
        final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
        return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
    }
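
    // Example (assuming the DOI value is already in its normalized form):
    //
    //   DHPUtils.generateUnresolvedIdentifier("10.1000/182", "DOI");
    //   // -> "unresolved::10.1000/182::doi"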

    /**
     * Evaluates a JsonPath expression against a JSON document and returns the result as a string.
     * When the expression matches an array, the first element is returned; on any error the
     * empty string is returned.
     *
     * @param jsonPath the JsonPath expression
     * @param json     the JSON document
     * @return the matched value as a string, or "" when nothing matches
     */
    public static String getJPathString(final String jsonPath, final String json) {
        try {
            Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String)
                return (String) o;
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
                return (String) ((JSONArray) o).get(0);
            return o.toString();
        } catch (Exception e) {
            return "";
        }
    }
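
    // Examples:
    //
    //   DHPUtils.getJPathString("$.id", "{\"id\":\"p1\"}");            // -> "p1"
    //   DHPUtils.getJPathString("$..id", "{\"a\":{\"id\":\"p1\"}}");   // -> "p1" (first match)
    //   DHPUtils.getJPathString("$.missing", "{}");                    // -> ""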

    /**
     * Shared Jackson {@link ObjectMapper} instance.
     */
    public static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Writes the given string content to a file on HDFS, overwriting it if it already exists.
     *
     * @param conf    the Hadoop configuration
     * @param content the content to write
     * @param path    the destination path
     * @throws IOException in case of I/O errors
     */
    public static void writeHdfsFile(final Configuration conf, final String content, final String path)
        throws IOException {
        log.info("writing file {}, size {}", path, content.length());
        try (FileSystem fs = FileSystem.get(conf);
            BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
            os.write(content.getBytes(StandardCharsets.UTF_8));
            os.flush();
        }
    }

    /**
     * Reads the content of an HDFS file as a UTF-8 string.
     *
     * @param conf the Hadoop configuration
     * @param path the path of the file to read
     * @return the file content
     * @throws IOException in case of I/O errors, including when the file does not exist
     */
    public static String readHdfsFile(Configuration conf, String path) throws IOException {
        log.info("reading file {}", path);

        try (FileSystem fs = FileSystem.get(conf)) {
            final Path p = new Path(path);
            if (!fs.exists(p)) {
                throw new FileNotFoundException(path);
            }

            return IOUtils.toString(fs.open(p), StandardCharsets.UTF_8);
        }
    }

    /**
     * Reads an HDFS file and deserializes its JSON content as an instance of the given class.
     *
     * @param <T>   the target type
     * @param conf  the Hadoop configuration
     * @param path  the path of the file to read
     * @param clazz the target class
     * @return the deserialized object
     * @throws IOException in case of I/O or parsing errors
     */
    public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
        return MAPPER.readValue(readHdfsFile(conf, path), clazz);
    }
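
    // Usage sketch for the HDFS helpers above (the namenode URI and paths are illustrative;
    // MDStoreVersion stands in for any JSON-mapped class):
    //
    //   Configuration conf = DHPUtils.getHadoopConfiguration("hdfs://nameservice1");
    //   DHPUtils.writeHdfsFile(conf, "{\"id\":\"md-1234\"}", "/tmp/version.json");
    //   String raw = DHPUtils.readHdfsFile(conf, "/tmp/version.json");
    //   MDStoreVersion v = DHPUtils.readHdfsFileAs(conf, "/tmp/version.json", MDStoreVersion.class);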

    /**
     * Saves a Spark {@link Dataset} to the given path in parquet format, overwriting any
     * existing data.
     *
     * @param <T>        the dataset type
     * @param mdstore    the dataset to save
     * @param targetPath the target path
     */
    public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
        log.info("saving dataset in: {}", targetPath);
        mdstore
            .write()
            .mode(SaveMode.Overwrite)
            .format("parquet")
            .save(targetPath);
    }
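
    // Usage sketch (the target path is illustrative):
    //
    //   Dataset<String> ds = spark.createDataset(records, Encoders.STRING());
    //   DHPUtils.saveDataset(ds, "/data/output/records");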

    /**
     * Builds a Hadoop configuration pointing at the given name node.
     *
     * @param nameNode the name node URI
     * @return the Hadoop configuration
     */
    public static Configuration getHadoopConfiguration(String nameNode) {
        // ====== Init HDFS File System Object
        Configuration conf = new Configuration();
        // Set FileSystem URI
        conf.set("fs.defaultFS", nameNode);
        // Because of Maven
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        System.setProperty("hadoop.home.dir", "/");
        return conf;
    }
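
    // Usage sketch: the returned configuration can be tuned further before opening the
    // file system (the URI and property value are illustrative):
    //
    //   Configuration conf = DHPUtils.getHadoopConfiguration("hdfs://nameservice1");
    //   conf.set("dfs.replication", "2");
    //   FileSystem fs = FileSystem.get(conf);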

    /**
     * Writes the given key/value pairs to the Oozie action output properties file, making them
     * available to subsequent actions in the workflow.
     *
     * @param report the properties to publish
     * @throws IOException in case of I/O errors
     */
    public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
        File file = new File(System.getProperty("oozie.action.output.properties"));
        Properties props = new Properties();
        report.forEach(props::setProperty);

        try (OutputStream os = new FileOutputStream(file)) {
            props.store(os, "");
        }
    }

    /**
     * Writes a single key/value pair to the Oozie action output properties file.
     *
     * @param paramName the property name
     * @param value     the property value
     * @throws IOException in case of I/O errors
     */
    public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
        Map<String, String> report = Maps.newHashMap();
        report.put(paramName, value);

        populateOOZIEEnv(report);
    }
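
    // Usage sketch (the property name and value are illustrative). From within an Oozie
    // java action, the published value can then be read in the workflow definition as
    // ${wf:actionData('action-name')['mdStoreVersion']}:
    //
    //   DHPUtils.populateOOZIEEnv("mdStoreVersion", "md-1234::v-5678");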
}