package eu.dnetlib.dhp.oa.graph.dump.skgif;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.filterentities.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.dump.skgif.beans.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.skgif.model.*;
import eu.dnetlib.dhp.skgif.model.AccessRight;
import scala.Tuple2;

/**
 * Dumps the graph results (publications, datasets, software, other research products) as SKG-IF ResearchProducts.
 *
 * @author miriam.baglioni
 * @Date 06/02/24
 */
public class DumpResearchProduct implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(DumpResearchProduct.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				DumpResearchProduct.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/dump/skgif/emit_biblio_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String workingDir = parser.get("workingDir");
		log.info("workingDir: {}", workingDir);

		final String masterDuplicatePath = parser.get("masterDuplicatePath");
		log.info("masterDuplicatePath: {}", masterDuplicatePath);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath + "products");
				emitFromResult(spark, inputPath, outputPath, workingDir, masterDuplicatePath);
			});
	}

	// for each result, emit its id, the journal (if any), the instances and the hostedBy of each instance
	public static void emitFromResult(SparkSession spark, String inputPath, String outputPath,
		String workingDir, String masterDuplicatePath) {
		dumpResearchProduct(spark, inputPath, workingDir, masterDuplicatePath);
		moveDumpedProducts(spark, workingDir, outputPath);
	}

	// merges the per-entity-type dumps produced in the working directory into the final "products" output
	private static void moveDumpedProducts(SparkSession spark, String workingDir, String outputPath) {
		Dataset<ResearchProduct> researchProducts = spark.emptyDataset(Encoders.bean(ResearchProduct.class));
		for (EntityType e : ModelSupport.entityTypes.keySet()) {
			if (ModelSupport.isResult(e))
				researchProducts = researchProducts
					.union(
						Utils
							.readPath(
								spark, workingDir + "products" + e.name() + "/researchproduct",
								ResearchProduct.class));
		}
		researchProducts
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "products");
	}

	// dumps each result type and enriches the products with organizations, funding and related products
	private static <R extends Result> void dumpResearchProduct(SparkSession spark, String inputPath,
		String workingDir, String masterDuplicatePath) {
		List<MasterDuplicate> masterDuplicateList = Utils
			.readPath(spark, masterDuplicatePath, MasterDuplicate.class)
			.collectAsList();
		ModelSupport.entityTypes.keySet().forEach(e -> {
			if (ModelSupport.isResult(e)) {
				Class<R> resultClazz = ModelSupport.entityTypes.get(e);
				if (e.name().equalsIgnoreCase("publication")) {
					dumpPublication(spark, inputPath, workingDir, e, resultClazz, masterDuplicateList);
				} else {
					dumpOtherResults(spark, inputPath, workingDir, e, resultClazz, masterDuplicateList);
				}
				includeRelevantOrganization(spark, workingDir, e);
				includeFunding(spark, workingDir, e);
				includeRelatedProducts(spark, workingDir, e);
			}
		});
	}

	// adds the related products (result-result relations) and writes the final per-type researchproduct dump
	private static void includeRelatedProducts(SparkSession spark, String workingDir, EntityType e) {
		Dataset<ResearchProduct> pprWitGrants = spark
			.read()
			.schema(Encoders.bean(ResearchProduct.class).schema())
			.json(workingDir + "products" + e.name() + "/temp_researchproductgrant")
			.as(Encoders.bean(ResearchProduct.class));
		Dataset<ProductsRelation> relatedResults = Utils
			.readPath(spark, workingDir + "/relations/related_products", ProductsRelation.class);

		pprWitGrants
			.joinWith(
				relatedResults, pprWitGrants.col("local_identifier").equalTo(relatedResults.col("resultId")),
				"left")
			.map(
				(MapFunction<Tuple2<ResearchProduct, ProductsRelation>, ResearchProduct>) t2 -> {
					if (t2._2() == null)
						return t2._1();
					t2._1().setRelated_products(t2._2().getRelated_products());
					return t2._1();
				}, Encoders.bean(ResearchProduct.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "products" + e.name() + "/researchproduct");

		Utils.removeOutputDir(spark, workingDir + "products" + e.name() + "/temp_researchproductgrant");
	}

	// adds the funding information (result-grant relations) to the affiliation-enriched products
	private static void includeFunding(SparkSession spark, String workingDir, EntityType e) {
		Dataset<ResearchProduct> prrWithAffiliation = spark
			.read()
			.schema(Encoders.bean(ResearchProduct.class).schema())
			.json(workingDir + "products" + e.name() + "/temp_researchproductaff")
			.as(Encoders.bean(ResearchProduct.class));

		Dataset<GrantRelation> grants = Utils
			.readPath(spark, workingDir + "relations/funding", GrantRelation.class);

		// Dataset<ResearchProduct> pprWitGrants =
		prrWithAffiliation
			.joinWith(
				grants, prrWithAffiliation.col("local_identifier").equalTo(grants.col("resultId")), "left")
			.map((MapFunction<Tuple2<ResearchProduct, GrantRelation>, ResearchProduct>) t2 -> {
				if (t2._2() == null)
					return t2._1();
				t2._1().setFunding(t2._2().getFunding());
				return t2._1();
			}, Encoders.bean(ResearchProduct.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "products" + e.name() + "/temp_researchproductgrant");

		Utils.removeOutputDir(spark, workingDir + "products" + e.name() + "/temp_researchproductaff");
	}

	// adds the relevant organizations (affiliations) to the freshly dumped products
	private static void includeRelevantOrganization(SparkSession spark, String workingDir, EntityType e) {
		Dataset<ExtendingOrganization> affiliations = Utils
			.readPath(
				spark, workingDir + "relations/result_relevant_organizations", ExtendingOrganization.class);

		Dataset<ResearchProduct> partialResearchProduct = spark
			.read()
			.schema(Encoders.bean(ResearchProduct.class).schema())
			.json(workingDir + "products" + e.name() + "/temp_researchProduct")
			.as(Encoders.bean(ResearchProduct.class));

		// Dataset<ResearchProduct> prrWithAffiliation =
		partialResearchProduct
			.joinWith(
				affiliations,
				partialResearchProduct.col("local_identifier").equalTo(affiliations.col("entityId")), "left")
			.map(
				(MapFunction<Tuple2<ResearchProduct, ExtendingOrganization>, ResearchProduct>) t2 -> {
					if (t2._2() == null)
						return t2._1();
					t2._1().setRelevant_organizations(t2._2().getRelevant_organization());
					return t2._1();
				}, Encoders.bean(ResearchProduct.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingDir + "products" + e.name() + "/temp_researchproductaff");

		Utils
			.removeOutputDir(
				spark, workingDir + "products" +
e.name() + "/temp_researchProduct"); } private static void dumpOtherResults(SparkSession spark, String inputPath, String workingDir, EntityType e, Class resultClazz, List masterDuplicateList) { Dataset results = Utils.readPath(spark, inputPath + e.name(), resultClazz); results.map((MapFunction) r -> { ArrayList journalHbIds = new ArrayList<>(); ResearchProduct rp = ResultMapper.map(r); rp .setManifestations( r .getInstance() .stream() .map(i -> getManifestation(i, journalHbIds, r, masterDuplicateList)) .collect(Collectors.toList())); return rp; }, Encoders.bean(ResearchProduct.class)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(workingDir + "products" + e.name() + "/temp_researchProduct"); } private static void dumpPublication(SparkSession spark, String inputPath, String workingDir, EntityType e, Class resultClazz, List masterDuplicateList) { Dataset> resultHostedBy = Utils .readPath(spark, inputPath + e.name(), resultClazz) .flatMap( (FlatMapFunction>) p -> p .getInstance() .stream() .map(i -> new Tuple2<>(p.getId(), i.getHostedby().getKey())) .collect(Collectors.toList()) .iterator(), Encoders.tuple(Encoders.STRING(), Encoders.STRING())); Dataset journalIds = spark .read() .schema(Encoders.bean(Datasource.class).schema()) .json(inputPath + "/datasource") .filter( "datainfo.deletedbyinference != true and " + "eoscdatasourcetype.classid == 'Journal archive' ") .select("id"); Dataset journalHostedByPerResult = resultHostedBy .join( journalIds, resultHostedBy.col("_2").equalTo(journalIds.col("id")), "leftsemi") .selectExpr("_1 as id", "_2 as journalHostedBy"); Dataset results = Utils.readPath(spark, inputPath + e.name(), Publication.class); results .joinWith( journalHostedByPerResult, results .col("id") .equalTo(journalHostedByPerResult.col("id")), "left") .groupByKey( (MapFunction, String>) t2 -> t2._1().getId(), Encoders.STRING()) .mapGroups((MapGroupsFunction, ResearchProduct>) (k, v) -> { ArrayList journalHbIds = new ArrayList<>(); Tuple2 first = v.next(); if (Optional.ofNullable(first._2()).isPresent()) journalHbIds.add(first._2().getAs("journalHostedBy")); v.forEachRemaining(value -> journalHbIds.add(value._2().getAs("journalHostedBy"))); Publication p = first._1(); ResearchProduct rp = ResultMapper.map(p); rp .setManifestations( p .getInstance() .stream() .map(i -> getManifestation(i, journalHbIds, p, masterDuplicateList)) .collect(Collectors.toList())); return rp; }, Encoders.bean(ResearchProduct.class)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(workingDir + "products" + e.name() + "/temp_researchProduct"); } @NotNull private static Manifestation getManifestation(Instance i, ArrayList journalHbIds, R p, List eoscDatasourceIdMap) { Manifestation m = new Manifestation(); m.setProduct_local_type(i.getInstancetype().getClassname()); m.setProduct_local_type_schema(i.getInstancetype().getSchemename()); m.setPeer_review(getPeerReviewd(i)); m.setAccess_right(getAccessRigth(i)); m .setLicence( getLicence(i)); if (Optional.ofNullable(i.getUrl()).isPresent() && i.getUrl().size() > 0) m.setUrl(i.getUrl().get(0)); else m.setUrl(null); if (Optional.ofNullable(i.getPid()).isPresent() && i.getPid().size() > 0) { m.setPid(i.getPid().get(0).getValue()); } if (Optional.ofNullable(i.getDateofacceptance()).isPresent()) m .setDates( Arrays .asList( Dates.newInstance(i.getDateofacceptance().getValue(), "publishing"))); if (p instanceof Publication) { if (journalHbIds.contains(i.getHostedby().getKey()) && Optional.ofNullable(((Publication) 
					p).getJournal()).isPresent()) {
				Biblio biblio = getBiblio(((Publication) p).getJournal());
				if (Optional.ofNullable(p.getPublisher()).isPresent())
					biblio.setPublisher(p.getPublisher().getValue());
				m.setBiblio(biblio);
				if (Optional.ofNullable(((Publication) p).getJournal().getIssnPrinted()).isPresent())
					m
						.setVenue(
							MinVenue
								.newInstance(
									Utils
										.getIdentifier(
											Prefixes.VENUE, ((Publication) p).getJournal().getIssnPrinted()),
									i.getHostedby().getValue()));
				else if (Optional.ofNullable(((Publication) p).getJournal().getIssnOnline()).isPresent())
					m
						.setVenue(
							MinVenue
								.newInstance(
									Utils
										.getIdentifier(
											Prefixes.VENUE, ((Publication) p).getJournal().getIssnOnline()),
									i.getHostedby().getValue()));
			}
		}
		// EOSC identifiers of the hosting or collecting datasource, when present in the master-duplicate mapping
		List<MasterDuplicate> eoscDsIds = eoscDatasourceIdMap
			.stream()
			.filter(
				dm -> dm.getGraphId().equals(i.getHostedby().getKey())
					|| dm.getGraphId().equals(i.getCollectedfrom().getKey()))
			.collect(Collectors.toList());
		if (eoscDsIds.size() > 0) {
			m
				.setEoscId(
					eoscDsIds
						.stream()
						.map(dm -> dm.getEoscId())
						.collect(Collectors.toList()));
		}
		m
			.setHosting_datasource(
				MinVenue
					.newInstance(
						// Utils.getIdentifier(Prefixes.DATASOURCE, epm.getInstance().getHostedby().getKey()),
						i.getHostedby().getKey(),
						i.getHostedby().getValue()));
		return m;
	}

	private static Biblio getBiblio(Journal epm) {
		Biblio biblio = new Biblio();
		if (Optional.ofNullable(epm.getEdition()).isPresent())
			biblio.setEdition(epm.getEdition());
		if (Optional.ofNullable(epm.getIss()).isPresent())
			biblio.setIssue(epm.getIss());
		if (Optional.ofNullable(epm.getVol()).isPresent())
			biblio.setVolume(epm.getVol());
		if (Optional.ofNullable(epm.getEp()).isPresent())
			biblio.setEnd_page(epm.getEp());
		if (Optional.ofNullable(epm.getSp()).isPresent())
			biblio.setStart_page(epm.getSp());
		return biblio;
	}

	@Nullable
	private static String getLicence(Instance i) {
		return Optional
			.ofNullable(i.getLicense())
			.map(value -> value.getValue())
			.orElse(null);
	}

	// maps the OpenAIRE access right classid onto the SKG-IF access right labels
	private static String getAccessRigth(Instance i) {
		if (Optional.ofNullable(i.getAccessright()).isPresent())
			switch (i.getAccessright().getClassid()) {
				case "OPEN":
				case "OPEN DATA":
				case "OPEN SOURCE":
					return AccessRight.OPEN.label;
				case "CLOSED":
					return AccessRight.CLOSED.label;
				case "RESTRICTED":
					return AccessRight.RESTRICTED.label;
				case "EMBARGO":
				case "12MONTHS":
				case "6MONTHS":
					return AccessRight.EMBARGO.label;
				default:
					return AccessRight.UNAVAILABLE.label;
			}
		return AccessRight.UNAVAILABLE.label;
	}

	// maps the refereed classid onto the SKG-IF peer review labels
	private static String getPeerReviewd(Instance i) {
		if (Optional.ofNullable(i.getRefereed()).isPresent())
			switch (i.getRefereed().getClassid()) {
				case "0000":
					return PeerReview.UNAVAILABLE.label;
				case "0001":
					return PeerReview.PEER_REVIEWED.label;
				case "0002":
					return PeerReview.NON_PEER_REVIEWED.label;
			}
		return PeerReview.UNAVAILABLE.label;
	}
}