package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

class CheckSummaries(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Keeps only relations that are relevant for Scholix:
    * drops dedup/similarity relations and relations collected exclusively from OpenCitations.
    */
  def filterRelations(r: Relation): Boolean = {
    val relClassFilter = List(
      "merges",
      "isMergedIn",
      "HasAmongTopNSimilarDocuments",
      "IsAmongTopNSimilarDocuments"
    )
    if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
      false
    else {
      if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
        false
      else if (r.getCollectedfrom.size() > 1)
        true
      else if (
        r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null &&
        "OpenCitations".equalsIgnoreCase(r.getCollectedfrom.get(0).getValue)
      )
        false
      else
        true
    }
  }

  /** Extracts the String value of the given top-level JSON field from the input record. */
  def extractSourceTarget(input: String, path: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    (json \ path).extract[String]
  }

  /** Counts how many distinct result identifiers referenced by the Scholix relations
    * correspond to a publication entity under basePath.
    */
  def countSummaries(basePath: String, spark: SparkSession): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
    import spark.implicits._

    val relPath = "/tmp/beta_provision/scholix/relation"
    val pubPath = s"$basePath/entities/publication"

//    val ds: Dataset[ScholixSummary] = spark.read.load(path).as[ScholixSummary]
//    ds.map(s => s.getTypology.toString).groupBy("value").agg(count("value").alias("Total")).show(300, truncate = false)

    val mapper = new ObjectMapper()

    val df = spark.read.load(relPath).as[Relation]

    // Distinct identifiers of relation endpoints that refer to results (identifier prefix "50").
    val totalIDS = df
      .flatMap(r => List(r.getSource, r.getTarget))
      .filter(s => s.startsWith("50"))
      .distinct()

    // Distinct publication identifiers read from the entity path.
    val pubId = spark.read.load(pubPath).as[OafEntity].map(o => o.getId).distinct()

    // Count the relation endpoints that match a publication identifier.
    val idPubsTotal = pubId
      .joinWith(totalIDS, pubId("value").equalTo(totalIDS("value")), "inner")
      .count()

    log.warn(s"Total ids in input Relation of type publication $idPubsTotal")
  }

  /** Every Spark application runs this method: the whole logic of the Spark node is defined here. */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"path property is $path")
    if (path == null || path.isEmpty)
      throw new IllegalArgumentException(
        "missing 'path' argument: check that -path is defined in the arguments.properties file"
      )
    countSummaries(path, spark)
  }
}

object CheckSummaries {

  val logger: Logger = LoggerFactory.getLogger(CheckSummaries.getClass.getName)

  def main(args: Array[String]): Unit = {
    new CheckSummaries(args, logger).initialize().run()
  }
}
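
/*
 * Hypothetical usage sketch, not part of the original job: it only illustrates the
 * json4s pattern used by CheckSummaries.extractSourceTarget (parse the record, walk
 * one JSON field, extract it as a String). The object name and the sample JSON
 * record are assumptions made for the example.
 */
object ExtractSourceTargetExample {

  def main(args: Array[String]): Unit = {
    implicit val formats: DefaultFormats.type = DefaultFormats

    // Assumed minimal relation-like JSON record, for illustration only.
    val record = """{"source":"50|doi_________::abc","target":"50|pmid________::def","relClass":"Cites"}"""

    // Same steps as extractSourceTarget: parse, select a top-level field, extract a String.
    val json = parse(record)
    println((json \ "source").extract[String]) // prints 50|doi_________::abc
    println((json \ "target").extract[String]) // prints 50|pmid________::def
  }
}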