renamed function to check content on mdstore

This commit is contained in:
parent 1977b9e732
commit 0dc33def41

pom.xml | 2

pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>

     <groupId>com.sandro</groupId>
-    <artifactId>ZeppelinNotebook</artifactId>
+    <artifactId>dhp-explorer</artifactId>
     <version>1.0-SNAPSHOT</version>

     <build>
@@ -0,0 +1,64 @@
+package eu.dnetlib.scholix
+
+import com.sandro.app.AbstractScalaApplication
+import org.apache.spark.sql.SparkSession
+import org.slf4j.{Logger, LoggerFactory}
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
+import org.apache.spark.sql.functions.{count, desc}
+
+class CheckMDStoreContent(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {
+
+  // Classify one JSON record: a record with a "source" field is a relation,
+  // labelled with its relClass; anything else is an entity, labelled with the
+  // first instancetype classname found under "instance".
+  def get_type(input: String): String = {
+
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(input)
+
+    val source = (json \ "source").extractOrElse[String](null)
+    if (source != null) {
+      val rel = (json \ "relClass").extract[String]
+      s"Relation:$rel"
+    } else {
+      val l: List[String] = for {
+        JObject(instance) <- json \\ "instance"
+        JField("instancetype", JObject(instancetype)) <- instance
+        JField("classname", JString(classname)) <- instancetype
+      } yield classname
+      l.head
+    }
+  }
+
+  // Print a frequency table of the record typologies found at the given path.
+  def show_typologies(spark: SparkSession, path: String): Unit = {
+    import spark.implicits._
+    val df = spark.read.text(path).as[String]
+
+    df.map(s => get_type(s))
+      .groupBy("value")
+      .agg(count("value").alias("Total"))
+      .orderBy(desc("Total"))
+      .show(300, truncate = false)
+  }
+
+  /** All the Spark applications run this method,
+    * where the whole logic of the Spark node is defined.
+    */
+  override def run(): Unit = {
+    val path = argumentMap("path")
+    log.warn(s"Path is $path")
+    show_typologies(spark, path)
+  }
+}
+
+object CheckMDStoreContent {
+
+  val log: Logger = LoggerFactory.getLogger(getClass.getName)
+
+  def main(args: Array[String]): Unit = {
+    new CheckMDStoreContent(args, log).initialize().run()
+  }
+}
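For readers unfamiliar with the json4s lookup pattern at the heart of get_type, here is a minimal standalone sketch of the same logic. The GetTypeDemo object, the two sample records, and the headOption fallback are illustrative assumptions, not part of the commit; note that the committed l.head would throw a NoSuchElementException on an entity record with no instancetype.

import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

object GetTypeDemo {

  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats

  // Same classification logic as get_type above, but with headOption so a
  // record lacking any instancetype yields "unknown" instead of throwing.
  def getType(input: String): String = {
    val json = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) s"Relation:${(json \ "relClass").extract[String]}"
    else {
      val classnames: List[String] = for {
        JObject(instance) <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname)) <- instancetype
      } yield classname
      classnames.headOption.getOrElse("unknown")
    }
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical record shapes, invented only for this demo.
    val relation = """{"source":"50|a","target":"50|b","relClass":"IsRelatedTo"}"""
    val entity = """{"instance":[{"instancetype":{"classname":"Article"}}]}"""
    println(getType(relation)) // prints Relation:IsRelatedTo
    println(getType(entity))   // prints Article
  }
}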
@@ -6,6 +6,7 @@ import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import eu.dnetlib.dhp.schema.oaf.Relation
 import eu.dnetlib.scholix.CheckRelation.logger
+import org.apache.spark.sql.functions.{count, desc}

 class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {

@@ -55,13 +56,15 @@ class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaAppl

-    val total_rels_from_scholexplorer = df.map(s => mapper.readValue(s, classOf[Relation]))
+    df.map(s => mapper.readValue(s, classOf[Relation]))
       .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
       .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
       .filter(r => filterRelations(r))
-      .count()
+      .map(r => r.getSubRelType).as[String].groupBy("value")
+      .agg(count("value").alias("Total"))
+      .orderBy(desc("Total"))
+      .show(300, truncate = false)

-    logger.warn(s"Relation used by Scholexplorer $total_rels_from_scholexplorer/$not_del_rel")
   }
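Both show_typologies in the new class and this reworked block lean on the same Spark idiom: mapping a Dataset down to one String per record produces a single column named "value", which can then be grouped and counted. The sketch below isolates that idiom under stated assumptions; the SubRelTypeDemo object, the SubRel case class, the local master, and the sample rows are invented stand-ins for eu.dnetlib.dhp.schema.oaf.Relation and the real input.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{count, desc}

object SubRelTypeDemo {

  // Toy stand-in for the Relation schema class; only for illustration.
  case class SubRel(subRelType: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SubRelTypeDemo").getOrCreate()
    import spark.implicits._

    val rels = Seq(SubRel("citation"), SubRel("citation"), SubRel("supplement")).toDS()

    // Mapping to String yields a Dataset whose single column is named "value";
    // grouping on it counts occurrences per subRelType, most frequent first.
    rels.map(_.subRelType)
      .groupBy("value")
      .agg(count("value").alias("Total"))
      .orderBy(desc("Total"))
      .show(truncate = false)

    spark.stop()
  }
}

Net effect of the hunk: the previous single count() (and the now-removed total_rels_from_scholexplorer log line) is replaced by a per-subRelType frequency table, mirroring what show_typologies does for the mdstore content.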