[SKG-IF-EOSC] added the possibility to filter for LOT1 constraints and for results related to specific communities

Miriam Baglioni 2024-04-08 10:26:48 +02:00
parent f85db930d9
commit 87370338bb
5 changed files with 283 additions and 16 deletions

View File

@@ -0,0 +1,88 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Result;

/**
 * @author miriam.baglioni
 * @Date 20/03/24
 */
public class SelectCommunityEntities implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SelectCommunityEntities.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                FilterEntities.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/skgif/eosc_entities_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String filterPath = parser.get("filterPath");
        log.info("filterPath: {}", filterPath);

        final String communityId = parser.get("communityId");
        log.info("communityId: {}", communityId);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> selectEntities(spark, inputPath, filterPath, communityId));
    }

    // For each result type, select the identifiers of the visible, non-deleted results
    // that are associated to the given community context.
    private static void selectEntities(SparkSession spark, String inputPath, String filterPath, String communityId) {
        ModelSupport.entityTypes.keySet().forEach(e -> {
            if (ModelSupport.isResult(e)) {
                spark
                    .read()
                    .schema(Encoders.bean(Result.class).schema())
                    .json(inputPath + e.name())
                    .where("datainfo.deletedbyinference != true and datainfo.invisible != true")
                    .select("id", "context")
                    // keep only results whose context contains the requested community id;
                    // communityId is assumed not to contain quote characters
                    .where("array_contains(context.id, '" + communityId + "')")
                    .drop("context")
                    .distinct()
                    .write()
                    .mode(SaveMode.Overwrite)
                    .option("compression", "gzip")
                    .parquet(filterPath + e.name() + "_ids");
            }
        });
    }
}
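The community selection above reduces to a single array_contains predicate on the context ids. A minimal local sketch of that predicate follows; it is not part of this commit, the class name CommunityFilterSketch and the sample community ids are made up for illustration, and only spark-sql is assumed on the classpath:

// Hypothetical local smoke test for the community predicate used above.
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CommunityFilterSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        // Two toy results: r1 is tagged with the 'dh-ch' community context, r2 is not.
        List<String> json = Arrays.asList(
            "{\"id\":\"r1\",\"context\":[{\"id\":\"dh-ch\"},{\"id\":\"eosc\"}]}",
            "{\"id\":\"r2\",\"context\":[{\"id\":\"enermaps\"}]}");
        Dataset<Row> results = spark.read().json(spark.createDataset(json, Encoders.STRING()));

        // Same predicate as in selectEntities: keep the results whose context
        // array contains the requested community identifier.
        results
            .where("array_contains(context.id, 'dh-ch')")
            .select("id")
            .show(); // expected output: only r1

        spark.stop();
    }
}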

View File

@@ -28,7 +28,6 @@ import scala.Tuple2;
 */
public class SelectEOSCEntities implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SelectEOSCEntities.class);
    private static final String B2FIND_IDENTIFIER = "10|re3data_____::730f562f9efe8a3b3742d2da510d4335";
    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
@@ -68,17 +67,6 @@ public class SelectEOSCEntities implements Serializable {
    private static <R extends Result> void selectEntities(SparkSession spark, String inputPath, String filterPath) {
        ModelSupport.entityTypes.keySet().forEach(e -> {
            if (ModelSupport.isResult(e)) {
//                Utils
//                    .readPath(spark, inputPath + e.name(), ModelSupport.entityTypes.get(e))
//                    .filter(
//                        (FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference()
//                            && !r.getDataInfo().getInvisible()
//                            && (r.getContext().stream().anyMatch(c -> c.getId().equals("eosc")) ||
//                                r
//                                    .getCollectedfrom()
//                                    .stream()
//                                    .anyMatch(cf -> cf.getValue().equalsIgnoreCase("B2FIND"))))
//                    .map((MapFunction<R, String>) r -> r.getId(), Encoders.STRING())
                spark
                    .read()
@@ -94,7 +82,6 @@ public class SelectEOSCEntities implements Serializable {
                    .option("compression", "gzip")
                    .parquet(filterPath + e.name() + "_ids");
//
            }
        });

View File

@@ -0,0 +1,133 @@
package eu.dnetlib.dhp.oa.graph.dump.filterentities;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.expr;
import static org.apache.spark.sql.functions.max;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;

/**
 * @author miriam.baglioni
 * @Date 20/03/24
 */
public class SelectLOT1Entities implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SelectLOT1Entities.class);

    // Flags with 1 the instances carrying a Creative Commons license
    private static final String CC_LICENSE = "CASE WHEN instance.license.value LIKE 'CC%' OR instance.license.value LIKE '%/creativecommons.org/%' THEN 1 ELSE 0 END";

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                FilterEntities.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/skgif/eosc_entities_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String filterPath = parser.get("filterPath");
        log.info("filterPath: {}", filterPath);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> selectEntities(spark, inputPath, filterPath));
    }

    // For each result type, select the identifiers of the visible, non-deleted results
    // having an accepted pid type, and flag with CCL = 1 those with at least one
    // Creative Commons licensed instance.
    private static void selectEntities(SparkSession spark, String inputPath, String filterPath) {
        selectPublications(spark, inputPath, filterPath);
        selectDataset(spark, inputPath, filterPath);
        selectSoftware(spark, inputPath, filterPath);
        selectOthers(spark, inputPath, filterPath);
    }

    private static void selectOthers(SparkSession spark, String inputPath, String filterPath) {
        spark
            .read()
            .schema(Encoders.bean(OtherResearchProduct.class).schema())
            .json(inputPath + "otherresearchproduct")
            .where("datainfo.deletedbyinference != true AND datainfo.invisible != true")
            .selectExpr("id", "instance", "explode(pid) as pid")
            .where("pid.qualifier.classid IN ('doi', 'handle')") // filter by pid type
            .selectExpr("id", "explode(instance) as instance")
            .withColumn("CCL", expr(CC_LICENSE))
            .groupBy("id")
            .agg(max(col("CCL")).as("CCL"))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .parquet(filterPath + "otherresearchproduct_ids");
    }

    private static void selectSoftware(SparkSession spark, String inputPath, String filterPath) {
        spark
            .read()
            .schema(Encoders.bean(Software.class).schema())
            .json(inputPath + "software")
            .where("datainfo.deletedbyinference != true AND datainfo.invisible != true")
            .selectExpr("id", "instance", "explode(pid) as pid")
            .where("pid.qualifier.classid IN ('doi', 'swhid')") // filter by pid type
            .selectExpr("id", "explode(instance) as instance")
            .withColumn("CCL", expr(CC_LICENSE))
            .groupBy("id")
            .agg(max(col("CCL")).as("CCL"))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .parquet(filterPath + "software_ids");
    }

    private static void selectDataset(SparkSession spark, String inputPath, String filterPath) {
        spark
            .read()
            .schema(Encoders.bean(Dataset.class).schema())
            .json(inputPath + "dataset")
            .where("datainfo.deletedbyinference != true AND datainfo.invisible != true")
            .selectExpr("id", "instance", "explode(pid) as pid")
            .where("pid.qualifier.classid IN ('doi', 'handle', 'pdb', 'ena', 'uniprot')") // filter by pid type
            .selectExpr("id", "explode(instance) as instance")
            .withColumn("CCL", expr(CC_LICENSE))
            .groupBy("id")
            .agg(max(col("CCL")).as("CCL"))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .parquet(filterPath + "dataset_ids");
    }

    private static void selectPublications(SparkSession spark, String inputPath, String filterPath) {
        spark
            .read()
            .schema(Encoders.bean(Publication.class).schema())
            .json(inputPath + "publication")
            .where("datainfo.deletedbyinference != true AND datainfo.invisible != true")
            .selectExpr("id", "instance", "explode(pid) as pid")
            .where("pid.qualifier.classid IN ('doi', 'arXiv', 'pmid', 'handle')") // filter by pid type
            .selectExpr("id", "explode(instance) as instance")
            // keep only instances of the accepted publication types
            .where("instance.instancetype.classname IN ('Book', 'Article', 'Journal', 'Data Paper', 'Software Paper', 'Preprint', 'Part of book or chapter of book', 'Thesis', 'Master thesis', 'Bachelor thesis', 'Doctoral thesis', 'Conference object', 'Research', 'Other literature type')")
            .withColumn("CCL", expr(CC_LICENSE))
            .groupBy("id")
            .agg(max(col("CCL")).as("CCL"))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .parquet(filterPath + "publication_ids");
    }
}
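The CCL column computed by each of the four methods above ends up as 1 for a result exactly when at least one of its exploded instances carries a Creative Commons license, since the aggregation takes the maximum per id. A plain-Java sketch of the same CASE WHEN expression, useful to read its intent; the helper class is hypothetical and not part of this commit:

// Hypothetical helper mirroring the CC_LICENSE expression used in the LOT1 selection.
public class CreativeCommonsFlag {

    // Java equivalent of:
    // CASE WHEN instance.license.value LIKE 'CC%'
    //      OR instance.license.value LIKE '%/creativecommons.org/%' THEN 1 ELSE 0 END
    static int ccl(String licenseValue) {
        if (licenseValue == null) {
            return 0; // a NULL license never satisfies the LIKE conditions
        }
        return licenseValue.startsWith("CC") || licenseValue.contains("/creativecommons.org/") ? 1 : 0;
    }

    public static void main(String[] args) {
        System.out.println(ccl("CC-BY-4.0"));                                    // 1
        System.out.println(ccl("https://creativecommons.org/licenses/by/4.0/")); // 1
        System.out.println(ccl("All rights reserved"));                          // 0
    }
}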

View File

@@ -16,5 +16,10 @@
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  }, {
    "paramName": "ci",
    "paramLongName": "communityId",
    "paramDescription": "the identifier of the community whose results should be selected",
    "paramRequired": false
  }
]

View File

@@ -77,7 +77,9 @@
    <decision name="select_subset">
        <switch>
            <!-- This branch is taken when we have to select the results as we do now for the EOSC future portal -->
            <case to="select_eosc_results">${wf:conf('filter') eq 'EOSC'}</case>
            <case to="select_lot1_results">${wf:conf('filter') eq 'LOT1'}</case>
            <case to="select_community_results">${wf:conf('filter') eq 'Community'}</case>
            <!-- The default takes the identifiers of the results matching different criteria, computed outside this code -->
            <default to="filter"/>
        </switch>
@@ -113,7 +115,59 @@
        <ok to="filter"/>
        <error to="Kill"/>
    </action>
    <action name="select_lot1_results">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Selecting graph results ids relevant for LOT1</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.filterentities.SelectLOT1Entities</class>
            <jar>dump-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=4
                --executor-memory=4G
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=5G
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--filterPath</arg><arg>${filterPath}/eoscIds/</arg>
        </spark>
        <ok to="filter"/>
        <error to="Kill"/>
    </action>
    <action name="select_community_results">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Selecting graph results ids relevant for Communities</name>
            <class>eu.dnetlib.dhp.oa.graph.dump.filterentities.SelectCommunityEntities</class>
            <jar>dump-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-cores=4
                --executor-memory=4G
                --driver-memory=${sparkDriverMemory}
                --conf spark.executor.memoryOverhead=5G
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
                --conf spark.sql.shuffle.partitions=15000
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--filterPath</arg><arg>${filterPath}/eoscIds/</arg>
            <arg>--communityId</arg><arg>${communityId}</arg>
        </spark>
        <ok to="filter"/>
        <error to="Kill"/>
    </action>
    <action name="filter">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>