60 lines
1.6 KiB
Scala
60 lines
1.6 KiB
Scala
package eu.dnetlib.scholix
|
|
|
|
import com.sandro.app.AbstractScalaApplication
|
|
import org.apache.spark.sql.functions.{count, desc}
|
|
import org.apache.spark.sql.{Dataset, SparkSession}
|
|
import org.json4s.DefaultFormats
|
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
|
import org.json4s.jackson.JsonMethods.parse
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
|
|
/**
  * Application that counts how often each PID schema (`pid.qualifier.classid`) occurs
  * in a text file of JSON records and prints the counts, most frequent first.
  *
  * `argumentMap` and `spark` are not defined in this file; they are presumably inherited
  * from `AbstractScalaApplication` (confirm against that class).
  *
  * @param args command-line arguments, forwarded to `AbstractScalaApplication`
  * @param log  logger, forwarded to `AbstractScalaApplication` and used by [[run]]
  */
class CheckEBIStats(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /**
    * Extracts the first `classid` found under any `pid` -> `qualifier` object of a JSON record.
    *
    * Records that carry a string-valued top-level `source` field are skipped (`null` is returned).
    * Records without any `pid.qualifier.classid` also yield `null` instead of throwing, so a
    * single record without a PID cannot abort the whole Spark job; callers filter `null` out.
    *
    * @param input one JSON document serialized as a string
    * @return the first `classid` found, or `null` when the record has a `source` field
    *         or contains no `pid.qualifier.classid`
    */
  def extractPidSchema(input: String): String = {
    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
    val json: org.json4s.JValue = parse(input)

    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      // Records with a `source` field are deliberately ignored.
      null
    } else {
      val classIds: List[String] = for {
        JObject(pids)                           <- json \\ "pid"
        JField("qualifier", JObject(qualifier)) <- pids
        JField("classid", JString(classid))     <- qualifier
      } yield classid
      // headOption instead of head: an empty list must map to null, not to an exception.
      classIds.headOption.orNull
    }
  }

  /**
    * Reads the text file at `path` (one JSON record per line), extracts the PID schema of
    * every record and prints up to 300 schemas with their occurrence count, in descending
    * order of count, without truncating the column values.
    *
    * NOTE(review): `extractPidSchema` is an instance method, so the `map` closure may capture
    * this application object; confirm it is serializable in the deployment environment.
    *
    * @param spark active Spark session
    * @param path  location of the text file to analyse
    */
  def listPidType(spark: SparkSession, path: String): Unit = {
    import spark.implicits._

    val df: Dataset[String] = spark.read.text(path).as[String]

    df.map(extractPidSchema)
      .filter(s => s != null) // drop skipped records and records without a PID schema
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .orderBy(desc("Total"))
      .show(300, truncate = false)
  }

  /** Reads the `path` argument, logs it and prints the PID schema statistics for that path. */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"path is $path")
    listPidType(spark, path)
  }
}
|
|
|
|
/** Companion object holding the shared logger and the JVM entry point of the job. */
object CheckEBIStats {

  /** Logger named after this object's runtime class; passed to every job instance. */
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  /**
    * Builds the application from the command-line arguments, initializes it and runs it.
    *
    * @param args command-line arguments forwarded unchanged to the application
    */
  def main(args: Array[String]): Unit = {
    val application = new CheckEBIStats(args, log)
    application.initialize().run()
  }
}
|