package eu.dnetlib.scholix

import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.SparkSession
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
/** Spark application that inspects the content of an MDStore dump and logs
  * some simple statistics (currently: how many unresolved PubMed identifiers
  * appear as relation endpoints).
  *
  * @param args raw command-line arguments, parsed by the base class
  * @param log  application logger
  */
class CheckMDStoreContent(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Classifies one JSON record of the dump.
    *
    * A record that carries a top-level "source" field is treated as a
    * relation; any other record is treated as an entity whose typology is
    * the first `instancetype.classname` found under an "instance" node.
    *
    * @param input raw JSON string of one record
    * @return "Relation" for relation records, otherwise the instance-type
    *         class name; throws NoSuchElementException when an entity record
    *         has no instance type — TODO confirm that cannot happen upstream
    */
  def get_type(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)

    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      // The mere presence of "source" marks the record as a relation.
      // (A dead `relClass` extraction was removed here: its value was never
      // used and it only caused an accidental throw on records without it.)
      "Relation"
    } else {
      // Collect every instancetype.classname under any "instance" node.
      val classNames: List[String] = for {
        JObject(instance) <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname)) <- instancetype
      } yield classname
      // Fails (as the original did) when no instance type is present.
      classNames.head
    }
  }

  /** Extracts the endpoint identifiers of a relation record.
    *
    * @param input raw JSON string of one record
    * @return List(source, target) when the record is a relation
    *         (i.e. has a "source" field), an empty list otherwise
    */
  def filter_relationId(input: String): List[String] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)

    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      val target = (json \ "target").extract[String]
      List(source, target)
    } else
      List()
  }

  /** Extracts (id, result-type classid) from an entity record.
    *
    * @param input raw JSON string of one record
    * @return (id, resultType) — resultType may be null when absent — or
    *         null when the record has no "id"; callers filter out nulls
    */
  def filter_entity_id(input: String): (String, String) = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)

    val result_type = (json \ "resulttype" \ "classid").extractOrElse[String](null)
    val id = (json \ "id").extractOrElse[String](null)
    if (id == null)
      null
    else
      (id, result_type)
  }

  /** Reads the dump at `path` and logs the number of distinct unresolved
    * PubMed identifiers that occur as relation endpoints.
    *
    * @param spark active Spark session
    * @param path  path of the text dump (one JSON record per line)
    */
  def show_typologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._
    val df = spark.read.text(path).as[String]

    // Relation endpoints that are still unresolved PubMed ids.
    val id_rels = df.flatMap(s => filter_relationId(s))
      .filter(s => s.startsWith("unresolved::") && s.contains("pmid"))
      .distinct()

    log.warn(s"Total pubmed pubs imported in scholexplorer ${id_rels.count}")

    // Kept for ad-hoc exploration: typology histogram of the entities.
    //    df.map(s =>filter_entity_id(s))
    //      .filter(s =>s!=null)
    //      .map(_._2)
    //      .groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)

    // Kept for ad-hoc exploration: overlap with Datacite publications.
    //    val id_datacite = df.map(s =>filter_entity_id(s))
    //      .filter(s =>s!=null)
    //      .filter(s => "publication".equalsIgnoreCase(s._2))
    //      .map(_._1)
    //      .distinct()
    //
    //    val total_pubs = id_datacite.joinWith(id_rels, id_datacite("value").equalTo(id_rels("value")), "inner").count()
    //
    //    log.warn(s"total doi rel in datacite : $total_pubs")
  }

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"Path is $path")
    show_typologies(spark, path)
  }
}
/** Companion object: wires up logging and provides the JVM entry point. */
object CheckMDStoreContent {

  /** Logger shared with the application instance. */
  val log: Logger = LoggerFactory.getLogger(getClass.getName)

  /** Entry point: builds the application, initializes it, then runs it. */
  def main(args: Array[String]): Unit = {
    val application = new CheckMDStoreContent(args, log).initialize()
    application.run()
  }
}