dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/skgif/DumpResult.java


package eu.dnetlib.dhp.oa.graph.dump.skgif;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.EmitPerManifestation;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.PartialResearchProduct;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.RelationPerProduct;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
 * Dumps the research products of the graph in the SKG-IF format: selects the relevant relations,
 * merges them with the manifestations (instances) of each result and writes the resulting
 * ResearchProduct entities.
 *
 * @author miriam.baglioni
 * @Date 06/02/24
 */
public class DumpResult implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpResult.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpResult.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/dump_result_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, workingDir + "aggrelation");
mapResult(spark, inputPath, workingDir, outputPath);
});
}
// for each result we emit the id + the journal (if any) + the instance + the hostedby of the instance
public static <R extends Result> void mapResult(SparkSession spark, String inputPath,
String workingDir, String outputPath) {
// selection of the relevant relations from the results towards the other entities. Only the semantically
// relevant ones are considered
selectRelations(spark, inputPath, workingDir);
// merge of relations and manifestation for the same result
getRelationAndManifestation(spark, workingDir, inputPath);
// dump of the result and enrichment with relevant information for relations and manifestations
dumpResult(spark, inputPath, workingDir, outputPath);
}
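/**
 * Joins the manifestations emitted for each result with the datasources of type "Journal archive"
 * (to resolve the venue) and with the aggregated relations, producing a PartialResearchProduct per result.
 */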
private static void getRelationAndManifestation(SparkSession spark, String workingDir, String inputPath) {
Dataset<RelationPerProduct> aggRelations = Utils
.readPath(spark, workingDir + "aggrelation", RelationPerProduct.class);
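// action forcing the evaluation of the aggregated relations before they are reused for every result type below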
aggRelations.count();
ModelSupport.entityTypes
.keySet()
.stream()
.filter(ModelSupport::isResult)
.forEach(e -> {
Utils.removeOutputDir(spark, workingDir + e.name() + "/partialresearchproduct");
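// datasources of type "Journal archive" are used to spot the manifestations published in a journal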
Dataset<Datasource> datasource = Utils
.readPath(spark, inputPath + "/datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) d -> Optional.ofNullable(d.getEoscdatasourcetype()).isPresent() &&
d.getEoscdatasourcetype().getClassid().equalsIgnoreCase("Journal archive"));
Dataset<EmitPerManifestation> man = Utils
.readPath(spark, workingDir + e.name() + "/manifestation", EmitPerManifestation.class);
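// left join: a manifestation may or may not be hosted by a journal archive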
Dataset<PartialResearchProduct> partialResearchProduct = man
.joinWith(datasource, man.col("instance.hostedby.key").equalTo(datasource.col("id")), "left")
.groupByKey(
(MapFunction<Tuple2<EmitPerManifestation, Datasource>, String>) t2 -> t2._1().getResultId(),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, Tuple2<EmitPerManifestation, Datasource>, PartialResearchProduct>) (
k, v) -> {
PartialResearchProduct prp = new PartialResearchProduct();
prp.setResultId(k);
List<Manifestation> manifestationList = new ArrayList<>();
while (v.hasNext())
manifestationList.add(getManifestation(v.next()));
prp.setManifestations(manifestationList);
return prp;
}, Encoders.bean(PartialResearchProduct.class));
partialResearchProduct
.joinWith(
aggRelations, partialResearchProduct.col("resultId").equalTo(aggRelations.col("resultId")),
"left")
.map(
(MapFunction<Tuple2<PartialResearchProduct, RelationPerProduct>, PartialResearchProduct>) t2 -> {
PartialResearchProduct prp = t2._1();
if (Optional.ofNullable(t2._2()).isPresent()) {
prp.setRelated_products(t2._2().getRelatedProduct());
prp.setRelevant_organizations(t2._2().getOrganizations());
prp.setFunding(t2._2().getFunding());
}
return prp;
}, Encoders.bean(PartialResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/partialresearchproduct");
});
}
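/**
 * Maps an instance (plus, when available, the journal-archive datasource hosting it) to a SKG-IF
 * Manifestation: product type, dates, peer review, access right, licence, url, pid, biblio, venue
 * and hosting datasource.
 */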
private static Manifestation getManifestation(Tuple2<EmitPerManifestation, Datasource> t2) {
// if the datasource side of the join is present we also have the biblio and the venue
// otherwise only the other values are set
EmitPerManifestation epm = t2._1();
Manifestation manifestation = new Manifestation();
manifestation.setProduct_local_type(epm.getInstance().getInstancetype().getClassname());
manifestation.setProduct_local_type_schema(epm.getInstance().getInstancetype().getSchemename());
if (Optional.ofNullable(epm.getInstance().getDateofacceptance()).isPresent())
manifestation
.setDates(
Arrays
.asList(
Dates.newInstance(epm.getInstance().getDateofacceptance().getValue(), "publishing")));
if (Optional.ofNullable(epm.getInstance().getRefereed()).isPresent())
switch (epm.getInstance().getRefereed().getClassid()) {
case "0000":
manifestation.setPeer_review(PeerReview.UNAVAILABLE.label);
break;
case "0001":
manifestation.setPeer_review(PeerReview.PEER_REVIEWED.label);
break;
case "0002":
manifestation.setPeer_review(PeerReview.NON_PEER_REVIEWED.label);
break;
}
manifestation.setMetadata_curation("unavailable");
if (Optional.ofNullable(epm.getInstance().getAccessright()).isPresent())
switch (epm.getInstance().getAccessright().getClassid()) {
case "OPEN":
case "OPEN DATA":
case "OPEN SOURCE":
manifestation.setAccess_right(AccessRight.OPEN.label);
break;
case "CLOSED":
manifestation.setAccess_right(AccessRight.CLOSED.label);
break;
case "RESTRICTED":
manifestation.setAccess_right(AccessRight.RESTRICTED.label);
break;
case "EMBARGO":
case "12MONTHS":
case "6MONTHS":
manifestation.setAccess_right(AccessRight.EMBARGO.label);
break;
default:
manifestation.setAccess_right(AccessRight.UNAVAILABLE.label);
}
manifestation
.setLicence(
Optional
.ofNullable(epm.getInstance().getLicense())
.map(value -> value.getValue())
.orElse(null));
manifestation
.setUrl(
Optional
.ofNullable(epm.getInstance().getUrl())
.filter(value -> !value.isEmpty())
.map(value -> value.get(0))
.orElse(null));
if (Optional.ofNullable(epm.getInstance().getPid()).isPresent() && epm.getInstance().getPid().size() > 0) {
manifestation.setPid(epm.getInstance().getPid().get(0).getValue());
}
if (Optional.ofNullable(t2._2()).isPresent()) {
manifestation.setBiblio(getBiblio(epm));
// the venue is identified by the ISSN of the journal hosted by the journal-archive datasource;
// check and value are taken from the same side of the join
Journal journal = t2._2().getJournal();
if (Optional.ofNullable(journal).isPresent()) {
if (Optional.ofNullable(journal.getIssnPrinted()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, journal.getIssnPrinted()));
else if (Optional.ofNullable(journal.getIssnOnline()).isPresent())
manifestation.setVenue(Utils.getIdentifier(Prefixes.VENUE, journal.getIssnOnline()));
}
}
manifestation
.setHosting_datasource(Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()));
return manifestation;
}
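/**
 * Builds the bibliographic information (edition, issue, publisher, volume, start/end page) from the
 * journal information emitted with the manifestation.
 */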
private static Biblio getBiblio(EmitPerManifestation epm) {
Biblio biblio = new Biblio();
biblio.setEdition(epm.getJournal().getEdition());
biblio.setIssue(epm.getJournal().getIss());
biblio.setPublisher(epm.getPublisher());
biblio.setVolume(epm.getJournal().getVol());
biblio.setEnd_page(epm.getJournal().getEp());
biblio.setStart_page(epm.getJournal().getSp());
return biblio;
}
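/**
 * For each result type, joins the results with the corresponding PartialResearchProducts and maps
 * them to ResearchProducts; the per-type dumps are then merged into a single output. The result
 * types are processed in parallel on the driver via parallelStream().
 */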
private static <R extends Result> void dumpResult(SparkSession spark, String inputPath, String workingDir,
String outputPath) {
ModelSupport.entityTypes
.keySet()
.parallelStream()
.filter(ModelSupport::isResult)
.forEach(e -> {
Class<R> resultClazz = ModelSupport.entityTypes.get(e);
Utils.removeOutputDir(spark, workingDir + e.name() + "/researchproduct");
Dataset<R> results = Utils.readPath(spark, inputPath + e.name(), resultClazz);
Dataset<PartialResearchProduct> prr = Utils
.readPath(spark, workingDir + e.name() + "/partialresearchproduct", PartialResearchProduct.class);
results
.joinWith(prr, results.col("id").equalTo(prr.col("resultId")), "left")
.map((MapFunction<Tuple2<R, PartialResearchProduct>, ResearchProduct>) t2 -> {
ResearchProduct rp = ResultMapper.map(t2._1());
if (Optional.ofNullable(t2._2()).isPresent()) {
if (Optional.ofNullable(t2._2().getRelated_products()).isPresent())
rp.setRelated_products(t2._2().getRelated_products());
if (Optional.ofNullable(t2._2().getFunding()).isPresent())
rp.setFunding(t2._2().getFunding());
if (Optional.ofNullable(t2._2().getRelevant_organizations()).isPresent())
rp.setRelevant_organizations(t2._2().getRelevant_organizations());
if (Optional.ofNullable(t2._2().getManifestations()).isPresent())
rp.setManifestations(t2._2().getManifestations());
}
return rp;
}, Encoders.bean(ResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + e.name() + "/researchproduct");
});
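// merge the per-type dumps into a single ResearchProduct dataset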
Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
for (EntityType e : ModelSupport.entityTypes.keySet()) {
if (ModelSupport.isResult(e))
researchProducts = researchProducts
.union(Utils.readPath(spark, workingDir + e.name() + "/researchproduct", ResearchProduct.class));
}
researchProducts
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "ResearchProduct");
}
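/**
 * Selects the semantically relevant relations (affiliation, funding, supplement, documents, part,
 * version, citation), discarding the deleted-by-inference and invisible ones, and aggregates them
 * per source result into a RelationPerProduct.
 */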
private static void selectRelations(SparkSession spark, String inputPath, String workingDir) {
Dataset<Relation> relation = Utils
.readPath(
spark,
inputPath + "relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible())
.filter(
(FilterFunction<Relation>) r -> r
.getRelClass()
.equalsIgnoreCase(RelationType.RESULT_AFFILIATIED_TO_ORGANIZATION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.RESULT_OUTCOME_FUNDING.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.SUPPLEMENT.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.DOCUMENTS.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.PART.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.VERSION.label) ||
r.getRelClass().equalsIgnoreCase(RelationType.CITATION.label));
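// aggregates the selected relations by source result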
relation
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Relation, RelationPerProduct>) (k, v) -> {
RelationPerProduct rpp = new RelationPerProduct();
rpp.setResultId(k);
Map<String, List<String>> remainingRelations = new HashMap<>();
while (v.hasNext()) {
Relation rel = v.next();
String target = rel.getTarget();
String relClass = rel.getRelClass();
switch (relClass.toLowerCase()) {
case "hasauthorinstitution":
rpp.getOrganizations().add(Utils.getIdentifier(Prefixes.ORGANIZATION, target));
break;
case "isproducedby":
rpp.getFunding().add(Utils.getIdentifier(Prefixes.GRANT, target));
break;
default:
remainingRelations
.computeIfAbsent(relClass, x -> new ArrayList<>())
.add(Utils.getIdentifier(Prefixes.RESEARCH_PRODUCT, target));
}
}
for (String key : remainingRelations.keySet())
rpp.getRelatedProduct().add(Relations.newInstance(key, remainingRelations.get(key)));
return rpp;
}, Encoders.bean(RelationPerProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir + "aggrelation");
}
}