dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/Utils.java

167 lines
5.2 KiB
Java

package eu.dnetlib.dhp.oa.graph.dump.skgif;
import java.io.Serializable;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.MinGrant;
import eu.dnetlib.dhp.skgif.model.MinOrganization;
import eu.dnetlib.dhp.skgif.model.MinProduct;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;

import scala.Tuple2;
/**
 * Static helper methods used by the {@code skgif} dump package: reading JSON datasets,
 * removing output paths, and building the minimal ("Min*") representations of
 * organizations, projects and results.
 *
 * <p>The class is not instantiable; all members are static.
 *
 * @author miriam.baglioni
 * @Date 16/02/24
 */
public class Utils implements Serializable {

    /** Jackson mapper used by {@link #readPath} to deserialise each input line. */
    public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private Utils() {
    }

    /**
     * Removes {@code path} by delegating to {@code HdfsSupport.remove} with the Hadoop
     * configuration of the given session.
     *
     * @param spark the active Spark session
     * @param path  the path to remove
     */
    public static void removeOutputDir(SparkSession spark, String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }

    /**
     * Reads {@code inputPath} as text and maps every line to an instance of {@code clazz}
     * using {@link #OBJECT_MAPPER}.
     *
     * @param spark     the active Spark session
     * @param inputPath location of the text input, one JSON document per line
     * @param clazz     bean class each line is deserialised into
     * @param <R>       type of the dataset elements
     * @return a dataset of {@code R} encoded with {@code Encoders.bean(clazz)}
     */
    public static <R> Dataset<R> readPath(
        SparkSession spark, String inputPath, Class<R> clazz) {
        return spark
            .read()
            .textFile(inputPath)
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }

    /**
     * Picks the ORCID out of a list of persistent identifiers.
     *
     * <p>An identifier whose classid equals {@code ModelConstants.ORCID} always wins over one
     * whose classid equals {@code ModelConstants.ORCID_PENDING}, regardless of list order.
     * Entries that are {@code null} or have no qualifier/classid are skipped.
     *
     * @param pid the identifiers to inspect; may be {@code null} or empty
     * @return a pair (value, {@code TRUE}) for an ORCID match, (value, {@code FALSE}) for an
     *         ORCID_PENDING match, or {@code null} when neither is found
     */
    public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
        if (pid == null || pid.isEmpty()) {
            return null;
        }
        // Remember the first pending entry while still looking for an ORCID one,
        // so the list is walked only once.
        StructuredProperty firstPending = null;
        for (StructuredProperty p : pid) {
            final String classid = classidOf(p);
            if (classid == null) {
                continue;
            }
            if (classid.equals(ModelConstants.ORCID)) {
                return new Tuple2<>(p.getValue(), Boolean.TRUE);
            }
            if (firstPending == null && classid.equals(ModelConstants.ORCID_PENDING)) {
                firstPending = p;
            }
        }
        if (firstPending != null) {
            return new Tuple2<>(firstPending.getValue(), Boolean.FALSE);
        }
        return null;
    }

    /**
     * Builds an identifier by concatenating the label of {@code entity} with the value
     * returned by {@code DHPUtils.md5(id)}.
     *
     * @param entity the prefix whose {@code label} is prepended
     * @param id     the identifier to hash
     * @return {@code entity.label + DHPUtils.md5(id)}
     */
    public static String getIdentifier(Prefixes entity, String id) {
        return entity.label + DHPUtils.md5(id);
    }

    /**
     * Extracts the text of the first {@code //funder/name} node from a funding tree XML
     * fragment.
     *
     * @param fundingtree the XML to parse; may be {@code null}
     * @return the funder name, or {@code null} when {@code fundingtree} is {@code null} or
     *         contains no {@code //funder/name} node
     * @throws DocumentException if {@code fundingtree} cannot be parsed as XML
     */
    public static String getFunderName(String fundingtree) throws DocumentException {
        if (fundingtree == null) {
            return null;
        }
        final Document doc = new SAXReader().read(new StringReader(fundingtree));
        // selectSingleNode yields null (instead of an out-of-bounds failure) when nothing matches.
        final org.dom4j.Node funderName = doc.selectSingleNode("//funder/name");
        return funderName == null ? null : funderName.getText();
    }

    /**
     * Builds the minimal representation of an organization.
     *
     * <p>The local identifier is the organization id as-is (it is not rewritten through
     * {@link #getIdentifier}). The name is taken from the legal name when present, and the
     * ror / isni / fundref / ringgold / wikidata identifiers are copied from the pids whose
     * classid matches (case-insensitively).
     *
     * @param o the source organization
     * @return the populated {@link MinOrganization}
     */
    public static MinOrganization getMinOrganization(Organization o) {
        final MinOrganization mo = new MinOrganization();
        mo.setLocal_identifier(o.getId());
        if (o.getLegalname() != null) {
            mo.setName(o.getLegalname().getValue());
        }
        if (o.getPid() != null) {
            for (StructuredProperty pid : o.getPid()) {
                final String classid = classidOf(pid);
                if (classid == null) {
                    continue;
                }
                // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
                switch (classid.toLowerCase(Locale.ROOT)) {
                    case "ror":
                        mo.setRor(pid.getValue());
                        break;
                    case "isni":
                        mo.setIsni(pid.getValue());
                        break;
                    case "fundref":
                        mo.setFundRef(pid.getValue());
                        break;
                    case "ringgold":
                        mo.setRinGold(pid.getValue());
                        break;
                    case "wikidata":
                        mo.setWikidata(pid.getValue());
                        break;
                    default:
                        // other identifier schemes are not part of MinOrganization
                        break;
                }
            }
        }
        return mo;
    }

    /**
     * Builds the minimal representation of a project.
     *
     * <p>The local identifier is the project id as-is (it is not rewritten through
     * {@link #getIdentifier}). The funder is read from the first funding tree only. The
     * title is the acronym when the project has one, otherwise the project title.
     *
     * @param p the source project
     * @return the populated {@link MinGrant}
     * @throws DocumentException if the first funding tree cannot be parsed as XML
     */
    public static MinGrant getMinGrant(Project p) throws DocumentException {
        final MinGrant mg = new MinGrant();
        mg.setLocal_identifier(p.getId());
        if (p.getCode() != null) {
            mg.setCode(p.getCode().getValue());
        }
        if (p.getFundingtree() != null && !p.getFundingtree().isEmpty()) {
            mg.setFunder(getFunderName(p.getFundingtree().get(0).getValue()));
        }
        if (p.getAcronym() != null) {
            mg.setTitle(p.getAcronym().getValue());
        } else if (p.getTitle() != null) {
            mg.setTitle(p.getTitle().getValue());
        }
        return mg;
    }

    /**
     * Builds the minimal representation of a result.
     *
     * <p>The local identifier is the result id as-is (it is not rewritten through
     * {@link #getIdentifier}). The title is taken from the titles whose classid is
     * "main title" (case-insensitive); when several match, the last one is kept. The
     * doi / pmcid / arxiv / pmid identifiers are copied from the pids whose classid matches
     * (case-insensitively). Titles and pids without a qualifier/classid are skipped.
     *
     * @param r   the source result
     * @param <R> concrete result type
     * @return the populated {@link MinProduct}
     * @throws JsonProcessingException declared for compatibility with existing callers;
     *                                 nothing in this method raises it
     */
    public static <R extends Result> MinProduct getMinProduct(R r) throws JsonProcessingException {
        final MinProduct mp = new MinProduct();
        mp.setLocal_identifier(r.getId());
        if (r.getTitle() != null) {
            for (StructuredProperty title : r.getTitle()) {
                if ("main title".equalsIgnoreCase(classidOf(title))) {
                    mp.setTitle(title.getValue());
                }
            }
        }
        if (r.getPid() != null) {
            for (StructuredProperty pid : r.getPid()) {
                final String classid = classidOf(pid);
                if (classid == null) {
                    continue;
                }
                // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
                switch (classid.toLowerCase(Locale.ROOT)) {
                    case "doi":
                        mp.setDoi(pid.getValue());
                        break;
                    case "pmcid":
                        mp.setPmcid(pid.getValue());
                        break;
                    case "arxiv":
                        mp.setArxivid(pid.getValue());
                        break;
                    case "pmid":
                        mp.setPmid(pid.getValue());
                        break;
                    default:
                        // other identifier schemes are not part of MinProduct
                        break;
                }
            }
        }
        return mp;
    }

    /** Returns the qualifier classid of {@code property}, or {@code null} if any link in the chain is missing. */
    private static String classidOf(StructuredProperty property) {
        if (property == null || property.getQualifier() == null) {
            return null;
        }
        return property.getQualifier().getClassid();
    }
}