package eu.dnetlib.dhp.oa.graph.dump.skgif;

import java.io.Serializable;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.skgif.model.MinGrant;
import eu.dnetlib.dhp.skgif.model.MinOrganization;
import eu.dnetlib.dhp.skgif.model.MinProduct;
import eu.dnetlib.dhp.skgif.model.Prefixes;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;

/**
 * Static helpers shared by the SKG-IF dump: reading JSON datasets, building identifiers and
 * mapping graph entities to their minimal SKG-IF representations.
 *
 * @author miriam.baglioni
 * @Date 16/02/24
 */
public class Utils implements Serializable {

    /** Shared Jackson mapper used to deserialize the JSON records read by {@link #readPath}. */
    public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private Utils() {
    }

    /**
     * Removes {@code path} through {@link HdfsSupport#remove}, using the Hadoop configuration of
     * the given Spark session.
     *
     * @param spark the active Spark session
     * @param path the path to remove
     */
    public static void removeOutputDir(SparkSession spark, String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }

    /**
     * Reads {@code inputPath} as text and deserializes every line into an instance of
     * {@code clazz} with {@link #OBJECT_MAPPER}.
     *
     * @param spark the active Spark session
     * @param inputPath the location of the text input (one JSON document per line)
     * @param clazz the bean class each line is mapped to
     * @param <R> the type of the records in the returned dataset
     * @return a dataset of {@code clazz} instances encoded with {@link Encoders#bean}
     */
    public static <R> Dataset<R> readPath(SparkSession spark, String inputPath, Class<R> clazz) {
        return spark
            .read()
            .textFile(inputPath)
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }

    /**
     * Picks the ORCID out of a list of persistent identifiers.
     * <p>
     * An identifier whose class id equals {@link ModelConstants#ORCID} always wins over one whose
     * class id equals {@link ModelConstants#ORCID_PENDING}, regardless of their order in the list.
     * Entries that are null, or that lack a qualifier or class id, are skipped.
     *
     * @param pid the identifiers to inspect, may be null or empty
     * @return the identifier value paired with {@code TRUE} for an ORCID match or {@code FALSE}
     *         for an ORCID_PENDING match; {@code null} when neither is found
     */
    public static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
        if (pid == null || pid.isEmpty()) {
            return null;
        }
        // First pass: an ORCID takes precedence over any pending one.
        for (StructuredProperty p : pid) {
            if (ModelConstants.ORCID.equals(classidOf(p))) {
                return new Tuple2<>(p.getValue(), Boolean.TRUE);
            }
        }
        // Second pass: fall back to a pending ORCID.
        for (StructuredProperty p : pid) {
            if (ModelConstants.ORCID_PENDING.equals(classidOf(p))) {
                return new Tuple2<>(p.getValue(), Boolean.FALSE);
            }
        }
        return null;
    }

    /**
     * Builds an identifier by appending {@link DHPUtils#md5} of {@code id} to the label of the
     * given prefix.
     *
     * @param entity the prefix whose label opens the identifier
     * @param id the value to hash
     * @return {@code entity.label} followed by the md5 of {@code id}
     */
    public static String getIdentifier(Prefixes entity, String id) {
        return entity.label + DHPUtils.md5(id);
    }

    /**
     * Extracts the text of the first {@code //funder/name} node from a funding tree.
     *
     * @param fundingtree the funding tree as an XML string
     * @return the funder name, or {@code null} when the tree has no {@code funder/name} node
     * @throws DocumentException if {@code fundingtree} cannot be parsed as XML
     */
    public static String getFunderName(String fundingtree) throws DocumentException {
        // NOTE(review): the reader is used with its default configuration; if funding trees can
        // come from untrusted sources, consider disabling DTDs/external entities (XXE) here.
        final Document doc = new SAXReader().read(new StringReader(fundingtree));
        final List<?> nameNodes = doc.selectNodes("//funder/name");
        if (nameNodes.isEmpty()) {
            // No funder name available: report absence instead of failing on get(0).
            return null;
        }
        return ((org.dom4j.Node) nameNodes.get(0)).getText();
    }

    /**
     * Maps an {@link Organization} to its minimal representation.
     * <p>
     * The organization id is used as local identifier as-is. The legal name, when present,
     * becomes the name. Identifiers are matched on their lower-cased class id: {@code ror},
     * {@code isni}, {@code fundref}, {@code ringgold} and {@code wikidata} are copied, anything
     * else is ignored.
     *
     * @param o the organization to convert
     * @return the populated {@link MinOrganization}
     */
    public static MinOrganization getMinOrganization(Organization o) {
        MinOrganization mo = new MinOrganization();
        mo.setLocal_identifier(o.getId());
        if (o.getLegalname() != null) {
            mo.setName(o.getLegalname().getValue());
        }
        if (o.getPid() != null) {
            for (StructuredProperty pid : o.getPid()) {
                switch (lowerCaseClassid(pid)) {
                    case "ror":
                        mo.setRor(pid.getValue());
                        break;
                    case "isni":
                        mo.setIsni(pid.getValue());
                        break;
                    case "fundref":
                        mo.setFundRef(pid.getValue());
                        break;
                    case "ringgold":
                        mo.setRinGold(pid.getValue());
                        break;
                    case "wikidata":
                        mo.setWikidata(pid.getValue());
                        break;
                    default:
                        // Other (or missing) identifier types are not part of MinOrganization.
                        break;
                }
            }
        }
        return mo;
    }

    /**
     * Maps a {@link Project} to its minimal grant representation.
     * <p>
     * The project id is used as local identifier as-is. The funder is taken from the first
     * funding tree only. The acronym is preferred as title; the project title is used only when
     * there is no acronym.
     *
     * @param p the project to convert
     * @return the populated {@link MinGrant}
     * @throws DocumentException if the first funding tree cannot be parsed as XML
     */
    public static MinGrant getMinGrant(Project p) throws DocumentException {
        MinGrant mg = new MinGrant();
        mg.setLocal_identifier(p.getId());
        if (p.getCode() != null) {
            mg.setCode(p.getCode().getValue());
        }
        // Guard the element and its value too: a null entry would otherwise raise an NPE.
        if (p.getFundingtree() != null && !p.getFundingtree().isEmpty()
            && p.getFundingtree().get(0) != null
            && p.getFundingtree().get(0).getValue() != null) {
            mg.setFunder(getFunderName(p.getFundingtree().get(0).getValue()));
        }
        if (p.getAcronym() != null) {
            mg.setTitle(p.getAcronym().getValue());
        } else if (p.getTitle() != null) {
            mg.setTitle(p.getTitle().getValue());
        }
        return mg;
    }

    /**
     * Maps a research result to its minimal product representation.
     * <p>
     * The result id is used as local identifier as-is. The title is the value of the title whose
     * class id equals {@code "main title"} ignoring case (the last one, if there are several).
     * Identifiers are matched on their lower-cased class id: {@code doi}, {@code pmcid},
     * {@code arxiv} and {@code pmid} are copied, anything else is ignored.
     *
     * @param r the result to convert
     * @param <R> the concrete result type
     * @return the populated {@link MinProduct}
     * @throws JsonProcessingException declared for callers; not raised by the current body
     */
    public static <R extends Result> MinProduct getMinProduct(R r) throws JsonProcessingException {
        MinProduct mp = new MinProduct();
        mp.setLocal_identifier(r.getId());
        if (r.getTitle() != null) {
            for (StructuredProperty title : r.getTitle()) {
                if ("main title".equalsIgnoreCase(classidOf(title))) {
                    mp.setTitle(title.getValue());
                }
            }
        }
        if (r.getPid() != null) {
            for (StructuredProperty pid : r.getPid()) {
                switch (lowerCaseClassid(pid)) {
                    case "doi":
                        mp.setDoi(pid.getValue());
                        break;
                    case "pmcid":
                        mp.setPmcid(pid.getValue());
                        break;
                    case "arxiv":
                        mp.setArxivid(pid.getValue());
                        break;
                    case "pmid":
                        mp.setPmid(pid.getValue());
                        break;
                    default:
                        // Other (or missing) identifier types are not part of MinProduct.
                        break;
                }
            }
        }
        return mp;
    }

    /** Returns the qualifier class id of {@code property}, or null if any link in the chain is null. */
    private static String classidOf(StructuredProperty property) {
        return Optional
            .ofNullable(property)
            .map(p -> p.getQualifier())
            .map(q -> q.getClassid())
            .orElse(null);
    }

    /**
     * Returns the qualifier class id of {@code property} lower-cased with {@link Locale#ROOT}, so
     * that matching does not depend on the JVM default locale; empty string when it is missing.
     */
    private static String lowerCaseClassid(StructuredProperty property) {
        String classid = classidOf(property);
        return classid == null ? "" : classid.toLowerCase(Locale.ROOT);
    }
}