package eu.dnetlib.scholix

import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.SparkSession
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

class CheckMDStoreContent(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Returns the typology of a JSON record: "Relation" if the record carries a
   *  "source" field (regardless of its relClass), otherwise the classname of the
   *  first instancetype found under "instance", or null if none is present.
   */
  def get_type(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      // A record with a "source" field is a relation.
      "Relation"
    } else {
      val l: List[String] = for {
        JObject(instance) <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname)) <- instancetype
      } yield classname
      // Guard against records without any instancetype.
      l.headOption.orNull
    }
  }

  /** Extracts the (source, target) identifiers of a relation record,
   *  or an empty list if the record is not a relation.
   */
  def filter_relationId(input: String): List[String] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      val target = (json \ "target").extract[String]
      List(source, target)
    } else List()
  }

  /** Extracts the (id, resulttype classid) pair of an entity record,
   *  or null if the record has no "id" field.
   */
  def filter_entity_id(input: String): (String, String) = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val result_type = (json \ "resulttype" \ "classid").extractOrElse[String](null)
    val id = (json \ "id").extractOrElse[String](null)
    if (id == null) null else (id, result_type)
  }

  /** Reads the MDStore dump at the given path and logs how many distinct
   *  unresolved PubMed identifiers appear among the imported relations.
   */
  def show_typologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._
    val df = spark.read.text(path).as[String]
    val id_rels = df
      .flatMap(s => filter_relationId(s))
      .filter(s => s.startsWith("unresolved::") && s.contains("pmid"))
      .distinct()
    log.warn(s"Total PubMed publications imported in Scholexplorer: ${id_rels.count}")

    //    df.map(s => filter_entity_id(s))
    //      .filter(s => s != null)
    //      .map(_._2)
    //      .groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)

    //    val id_datacite = df.map(s => filter_entity_id(s))
    //      .filter(s => s != null)
    //      .filter(s => "publication".equalsIgnoreCase(s._2))
    //      .map(_._1)
    //      .distinct()
    //
    //    val total_pubs = id_datacite.joinWith(id_rels, id_datacite("value").equalTo(id_rels("value")), "inner").count()
    //
    //    log.warn(s"total doi rel in datacite : $total_pubs")
  }

  /** All Spark applications run this method:
   *  the whole logic of the Spark node is defined here.
   */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"Path is $path")
    show_typologies(spark, path)
  }
}

object CheckMDStoreContent {
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    new CheckMDStoreContent(args, log).initialize().run()
  }
}
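
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original job): a minimal, self-contained
// example of the json4s traversal used by get_type() and filter_relationId()
// above. The object name and the JSON strings below are invented for
// illustration only and assume records follow the two shapes those methods
// distinguish (a relation with source/target/relClass, and an entity with
// instance/instancetype/classname).
// ---------------------------------------------------------------------------
object CheckMDStoreContentExample {

  import org.json4s.DefaultFormats
  import org.json4s.JsonAST.{JField, JObject, JString}
  import org.json4s.jackson.JsonMethods.parse

  def main(args: Array[String]): Unit = {
    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats

    // A relation-like record: it carries a "source" field, so get_type() would
    // classify it as "Relation" and filter_relationId() would return List(source, target).
    val relationJson =
      """{"source":"unresolved::pmid::0000000","target":"someTargetId","relClass":"References"}"""
    val relation = parse(relationJson)
    println((relation \ "source").extractOrElse[String](null)) // unresolved::pmid::0000000

    // An entity-like record: the typology is taken from
    // instance -> instancetype -> classname, as in get_type().
    val entityJson =
      """{"id":"someEntityId","resulttype":{"classid":"publication"},
        |"instance":[{"instancetype":{"classname":"Article"}}]}""".stripMargin
    val entity = parse(entityJson)
    val typologies = for {
      JObject(instance) <- entity \\ "instance"
      JField("instancetype", JObject(instancetype)) <- instance
      JField("classname", JString(classname)) <- instancetype
    } yield classname
    println(typologies.headOption.getOrElse("unknown")) // Article
  }
}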