package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.scholix.CheckRelation.logger
import org.apache.spark.sql.functions.{count, desc}

/** Spark application that reads JSON-serialized [[Relation]] records (one per line)
  * and prints, per relation class, how many relations survive a set of filters.
  *
  * @param args command-line arguments, forwarded to [[AbstractScalaApplication]]
  * @param log  logger forwarded to [[AbstractScalaApplication]]
  */
class CheckRelation(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args: Array[String], log: Logger) {

  /** Decides whether a relation is kept in the per-class statistics.
    *
    * A relation is rejected when:
    *  - its relation class is one of the excluded classes (case-insensitive), or
    *  - it has no `collectedfrom` entries (null or empty list), or
    *  - its only `collectedfrom` entry has the value "OpenCitations" (case-insensitive).
    *
    * Every other relation is kept, including any relation with more than one
    * `collectedfrom` entry.
    *
    * @param r the relation to test
    * @return `true` if the relation must be kept, `false` if it must be discarded
    */
  def filterRelations(r: Relation): Boolean = {
    val collectedFrom = r.getCollectedfrom

    if (CheckRelation.ExcludedRelClasses.exists(_.equalsIgnoreCase(r.getRelClass))) false
    else if (collectedFrom == null || collectedFrom.size() == 0) false
    else if (collectedFrom.size() > 1) true
    else {
      // Literal on the left: a null provenance value no longer throws; such a relation is
      // kept, because its single source is not OpenCitations.
      !"OpenCitations".equalsIgnoreCase(collectedFrom.get(0).getValue)
    }
  }

  /** Entry point of the Spark application: reads the `path` argument and
    * runs the relation statistics on it.
    *
    * @throws IllegalArgumentException if the `path` argument is null or empty
    */
  override def run(): Unit = {
    val path = argumentMap("path")
    logger.warn(s"path properties is $path")
    if (path == null || path.isEmpty)
      throw new IllegalArgumentException("missing path arguments.properties -path when launch file, check if it is inside the arguments.properties")
    countRelation(path, spark)
  }

  /** Logs how many relations are not deleted by inference and shows the number of
    * surviving relations grouped by relation class, in descending order of count.
    *
    * @param path  location of the text input, one JSON-serialized [[Relation]] per line
    * @param spark the active Spark session
    */
  def countRelation(path: String, spark: SparkSession): Unit = {
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val lines = spark.read.text(path).as[String]
    val mapper = new ObjectMapper()
    val total = lines.count

    // Single definition of the parsing step, shared by both actions below.
    val relations = lines.map(s => mapper.readValue(s, classOf[Relation]))

    // One predicate for both pipelines. The null guard prevents an NPE on records without
    // dataInfo; `== false` (rather than `!`) also tolerates a null flag.
    // NOTE(review): assumes getDeletedbyinference returns a boxed Boolean — confirm.
    val notDeletedByInference: Relation => Boolean =
      r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false

    val notDeletedRelations = relations.filter(notDeletedByInference)

    val notDeletedCount = notDeletedRelations.count()
    logger.warn(s"Total number of relations not deleted by Inference: $notDeletedCount/$total")

    notDeletedRelations
      // NOTE(review): "50" is presumably the id prefix of the entity type of interest — confirm.
      .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
      .filter(r => filterRelations(r))
      .map(r => r.getRelClass)
      .as[String]
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .orderBy(desc("Total"))
      .show(300, truncate = false)
  }
}

object CheckRelation {

  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /** Relation classes excluded from the statistics (compared case-insensitively).
    * Kept here so the list is built once instead of once per tested relation.
    */
  private val ExcludedRelClasses: List[String] = List(
    "merges",
    "isMergedIn",
    "HasAmongTopNSimilarDocuments",
    "IsAmongTopNSimilarDocuments"
  )

  def main(args: Array[String]): Unit = {
    new CheckRelation(args, logger).initialize().run()
  }
}