dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java

67 lines
1.8 KiB
Java

package eu.dnetlib.dhp.oa.graph.dump.skgif;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
* @Date 16/02/24
*/
public class Utils implements Serializable {
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Utils() {
}
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
if (!Optional.ofNullable(pid).isPresent())
return null;
if (pid.size() == 0)
return null;
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
return new Tuple2<>(p.getValue(), Boolean.TRUE);
}
}
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
return new Tuple2<>(p.getValue(), Boolean.FALSE);
}
}
return null;
}
public static String getIdentifier(Prefixes entity, String id) {
return entity.label + DHPUtils.md5(id);
}
}