package eu.dnetlib.dhp.subjecttoresultfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Subject;
import scala.Tuple2;

/**
 * @author miriam.baglioni
 * @Date 04/10/22
 *
 * This is for the selection of results having at least one subject in subjectClassList.
 */
public class PrepareResultResultStep1 implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(PrepareResultResultStep1.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultResultStep1.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		final Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		final String resultType = parser.get("resultType");
		log.info("resultType: {}", resultType);

		final List<String> subjectClassList = Arrays
			.asList(parser.get("subjectlist").split(";"))
			.stream()
			.map(s -> s.toLowerCase())
			.collect(Collectors.toList());
		log.info("subjectClassList: {}", subjectClassList);

		final List<String> allowedSemRel = Arrays
			.asList(parser.get("allowedsemrels").split(";"))
			.stream()
			.map(s -> s.toLowerCase())
			.collect(Collectors.toList());
		log.info("allowedSemRel: {}", allowedSemRel);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeOutputDir(spark, outputPath);
				prepareInfo(spark, inputPath, outputPath, subjectClassList, allowedSemRel, resultClazz, resultType);
			});
	}

	private static <R extends Result> void prepareInfo(SparkSession spark,
		String inputPath,
		String outputPath,
		List<String> subjectClassList,
		List<String> allowedSemRel,
		Class<R> resultClazz,
		String resultType) {

		// results that are not deleted by inference, not invisible, and carry at least one
		// subject whose classid is in subjectClassList
		Dataset<R> result = readPath(spark, inputPath + "/" + resultType, resultClazz)
			.filter(
				(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
					!r.getDataInfo().getInvisible() &&
					Optional.ofNullable(r.getSubject()).isPresent() &&
					r
						.getSubject()
						.stream()
						.anyMatch(s -> subjectClassList.contains(s.getQualifier().getClassid().toLowerCase())));

		// relations that are not deleted by inference and whose relClass is among the allowed
		// semantic relations
		Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class)
			.filter(
				(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
					allowedSemRel.contains(r.getRelClass().toLowerCase()));

		// right join, so every relation is kept even without a matching source result;
		// the (result, relation) pairs are then grouped by the relation target
		result
			.joinWith(relation, result.col("id").equalTo(relation.col("source")), "right")
			.groupByKey(
				(MapFunction<Tuple2<R, Relation>, String>) t2 -> t2._2().getTarget(),
				Encoders.STRING())
			.mapGroups(
				(MapGroupsFunction<String, Tuple2<R, Relation>, ResultSubjectList>) (k, it) -> getResultSubjectList(
					subjectClassList, k, it),
				Encoders.bean(ResultSubjectList.class))
			.filter(Objects::nonNull)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/" + resultType);
	}
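	/**
	 * Builds the ResultSubjectList for one relation target: k is the target identifier shared by
	 * the grouped relations, and the subjects are accumulated from every joined source result in
	 * the group, deduplicated by subject value. Note that only the first tuple of the group is
	 * checked for a missing source: when the right join produced no filtered source result for
	 * it, the whole group is discarded by returning null.
	 */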
	private static <R extends Result> ResultSubjectList getResultSubjectList(List<String> subjectClassList, String k,
		Iterator<Tuple2<R, Relation>> it) {
		Tuple2<R, Relation> first = it.next();
		if (!Optional.ofNullable(first._1()).isPresent()) {
			return null;
		}
		ResultSubjectList rsl = new ResultSubjectList();
		rsl.setResId(k);
		List<SubjectInfo> sbjInfo = new ArrayList<>();
		Set<String> subjectSet = new HashSet<>();
		extracted(subjectClassList, first._1().getSubject(), sbjInfo, subjectSet);
		it.forEachRemaining(t2 -> {
			if (Optional.ofNullable(t2._1()).isPresent())
				extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet);
		});
		rsl.setSubjectList(sbjInfo);
		return rsl;
	}

	// collects the subjects whose classid is in subjectClassList, adding each subject value to
	// sbjList only the first time it is seen
	private static void extracted(List<String> subjectClassList, List<Subject> resultSubject,
		List<SubjectInfo> sbjList,
		Set<String> subjectSet) {
		resultSubject
			.stream()
			.filter(s -> subjectClassList.contains(s.getQualifier().getClassid().toLowerCase()))
			.forEach(s -> {
				if (!subjectSet.contains(s.getValue()))
					sbjList
						.add(
							SubjectInfo
								.newInstance(
									s.getQualifier().getClassid(), s.getQualifier().getClassname(),
									s.getValue()));
				subjectSet.add(s.getValue());
			});
	}
}