// DHP-Explorer/src/main/java/eu/dnetlib/scholix/CheckEBIStats.scala

package eu.dnetlib.scholix
import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.functions.{count, desc}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
class CheckEBIStats(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Extracts the pid qualifier class id (e.g. "doi", "pmid") from a single JSON record.
    *
    * Records carrying a top-level "source" field (presumably relation/scholix
    * records rather than entities — confirm against the input schema) are
    * skipped by returning null. For the remaining records, the first
    * `pid.qualifier.classid` found anywhere in the document is returned,
    * or null when the record has no pid at all.
    *
    * @param input one JSON record serialized as a string
    * @return the first pid classid, or null when the record is skipped or has no pid
    */
  def extractPidSchema(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      // Record has a "source" field: nothing to report for the pid histogram.
      null
    } else {
      // Collect every pid.qualifier.classid reachable anywhere in the document.
      val classids: List[String] = for {
        JObject(pids) <- json \\ "pid"
        JField("qualifier", JObject(qualifier)) <- pids
        JField("classid", JString(classid)) <- qualifier
      } yield classid
      // headOption.orNull: `head` alone would throw NoSuchElementException on
      // records without any pid; null keeps the method's existing contract
      // (callers already filter nulls out).
      classids.headOption.orNull
    }
  }

  /** Prints the distribution of pid schemas found in the text dataset at `path`,
    * ordered by descending frequency (up to 300 rows, untruncated).
    *
    * @param spark active Spark session
    * @param path  input path readable by `spark.read.text`
    */
  def listPidType(spark: SparkSession, path: String): Unit = {
    import spark.implicits._
    val df: Dataset[String] = spark.read.text(path).as[String]
    df.map(extractPidSchema)
      .filter(s => s != null)
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .orderBy(desc("Total"))
      .show(300, false)
  }

  /** Application entry point: reads the "path" argument and prints the pid-type stats. */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"path is $path")
    listPidType(spark, path)
  }
}
object CheckEBIStats {

  /** Logger handed to the application instance at construction time. */
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  /** JVM entry point: builds the application, initializes it, then runs it. */
  def main(args: Array[String]): Unit = {
    val app = new CheckEBIStats(args, log)
    app.initialize().run()
  }
}