DHP-Explorer/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala

package eu.dnetlib.scholix

import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.SparkSession
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

class CheckMDStoreContent(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {
  /** Infers the typology of a single JSON record: records with a "source" field are
    * relations, otherwise the classname of the first instance type is returned.
    */
  def get_type(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      // relations also carry a "relClass" field, but it is not needed to detect the typology
      "Relation"
    } else {
      val l: List[String] = for {
        JObject(instance) <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname)) <- instancetype
      } yield classname
      l.head
    }
  }
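
  /* Illustrative only: a minimal sketch of the two record shapes get_type distinguishes,
   * inferred from the fields this parser reads (not taken from an actual MDStore dump;
   * all identifier values below are hypothetical):
   *
   *   relation record : {"source": "unresolved::12345678::pmid", "target": "50|someId", "relClass": "References"}
   *   result record   : {"id": "50|someId", "resulttype": {"classid": "publication"},
   *                      "instance": [{"instancetype": {"classname": "Article"}}]}
   */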
  /** If the record is a relation, returns its source and target identifiers, otherwise an empty list. */
  def filter_relationId(input: String): List[String] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      val target = (json \ "target").extract[String]
      List(source, target)
    } else
      List()
  }
  /** If the record is an entity, returns its (id, resulttype classid) pair, otherwise null. */
  def filter_entity_id(input: String): (String, String) = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val result_type = (json \ "resulttype" \ "classid").extractOrElse[String](null)
    val id = (json \ "id").extractOrElse[String](null)
    if (id == null)
      null
    else
      (id, result_type)
  }
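
  /* Usage sketch (hypothetical values, shown only to document the return contract):
   *   filter_entity_id("""{"id":"50|someId","resulttype":{"classid":"publication"}}""")
   *     // returns ("50|someId", "publication")
   *   filter_entity_id("""{"source":"unresolved::12345678::pmid"}""")
   *     // returns null, since the record has no "id" field
   */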
  /** Counts the distinct unresolved PMID identifiers referenced by relations in the dump at the given path. */
  def show_typologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._
    val df = spark.read.text(path).as[String]
    val id_rels = df
      .flatMap(s => filter_relationId(s))
      .filter(s => s.startsWith("unresolved::") && s.contains("pmid"))
      .distinct()
    log.warn(s"Total pubmed pubs imported in scholexplorer ${id_rels.count}")

    // The aggregations below are kept for reference; re-enabling them requires
    // import org.apache.spark.sql.functions.{count, desc}
    // df.map(s => filter_entity_id(s))
    //   .filter(s => s != null)
    //   .map(_._2)
    //   .groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)

    // val id_datacite = df.map(s => filter_entity_id(s))
    //   .filter(s => s != null)
    //   .filter(s => "publication".equalsIgnoreCase(s._2))
    //   .map(_._1)
    //   .distinct()
    //
    // val total_pubs = id_datacite.joinWith(id_rels, id_datacite("value").equalTo(id_rels("value")), "inner").count()
    //
    // log.warn(s"total doi rel in datacite : $total_pubs")
  }
  /** Entry point invoked by every Spark application: the whole logic of the Spark node is defined here. */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"Path is $path")
    show_typologies(spark, path)
  }
}

object CheckMDStoreContent {
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  def main(args: Array[String]): Unit = {
    new CheckMDStoreContent(args, log).initialize().run()
  }
}
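
// Invocation sketch, assuming AbstractScalaApplication parses command-line arguments into
// argumentMap under a "path" key; the jar name, flag syntax, and HDFS location are placeholders,
// not taken from the project:
//
//   spark-submit --class eu.dnetlib.scholix.CheckMDStoreContent \
//     dhp-explorer.jar --path hdfs:///path/to/mdstore/dump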