diff --git a/pom.xml b/pom.xml
index 39b659b..96a28b0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.sandro</groupId>
-    <artifactId>ZeppelinNotebook</artifactId>
+    <artifactId>dhp-explorer</artifactId>
     <version>1.0-SNAPSHOT</version>
diff --git a/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala b/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala
new file mode 100644
index 0000000..8539e74
--- /dev/null
+++ b/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala
@@ -0,0 +1,64 @@
+package eu.dnetlib.scholix
+
+import com.sandro.app.AbstractScalaApplication
+import org.apache.spark.sql.SparkSession
+import org.slf4j.{Logger, LoggerFactory}
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
+import org.apache.spark.sql.functions.{count,desc}
+
+class CheckMDStoreContent( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
+
+
+  def get_type(input:String):String = {
+
+
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(input)
+
+    val source = (json \ "source").extractOrElse[String](null)
+    if (source != null) {
+      val rel =(json \"relClass").extract[String]
+      s"Relation:$rel"
+    }
+    else {
+      val l: List[String] = for {
+        JObject(instance) <- json \\ "instance"
+        JField("instancetype", JObject(instancetype)) <- instance
+        JField("classname", JString(classname)) <- instancetype
+      } yield classname
+      l.head
+    }
+  }
+
+
+  def show_typologies(spark:SparkSession, path:String): Unit = {
+
+    import spark.implicits._
+    val df = spark.read.text(path).as[String]
+
+    df.map(s =>get_type(s)).groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)
+
+  }
+
+
+  /** Here all the spark applications runs this method
+   * where the whole logic of the spark node is defined
+   */
+  override def run(): Unit = {
+    val path = argumentMap("path")
+    log.warn(s"Path is $path")
+    show_typologies(spark, path)
+  }
+}
+
+
+object CheckMDStoreContent {
+
+  val log:Logger = LoggerFactory.getLogger(getClass.getName)
+
+  def main(args: Array[String]): Unit = {
+    new CheckMDStoreContent(args,log).initialize().run()
+  }
+}
diff --git a/src/main/java/eu/dnetlib/scholix/CheckRelation.scala b/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
index b5963a3..89943ee 100644
--- a/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
+++ b/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
@@ -6,6 +6,7 @@ import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
 import eu.dnetlib.dhp.schema.oaf.Relation
 import eu.dnetlib.scholix.CheckRelation.logger
+import org.apache.spark.sql.functions.{count, desc}
 
 class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
 
@@ -55,13 +56,15 @@ class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaAppl
 
 
 
-    val total_rels_from_scholexplorer = df.map(s=> mapper.readValue(s, classOf[Relation]))
+    df.map(s=> mapper.readValue(s, classOf[Relation]))
       .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
       .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
       .filter(r => filterRelations(r))
-      .count()
+      .map(r => r.getSubRelType).as[String].groupBy("value")
+      .agg(count("value").alias("Total"))
+      .orderBy(desc("Total"))
+      .show(300, truncate = false)
 
-    logger.warn(s"Relation used by Scholexplorer $total_rels_from_scholexplorer/$not_del_rel")
 
 
   }
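
Note (not part of the diff above): the following is a minimal, self-contained sketch of the json4s classification pattern that CheckMDStoreContent.get_type applies to each record, assuming records shaped like the OAF JSON the job reads. The object name GetTypeSketch, the two sample payloads and the "Unknown" fallback are illustrative assumptions; only the field names "source", "relClass", "instance", "instancetype" and "classname" come from the code in the diff.

import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

object GetTypeSketch {

  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats

  // Mirrors the logic of get_type: a record carrying a "source" field is treated
  // as a Relation (labelled with its relClass), anything else is classified by the
  // classname of its instancetype.
  def classify(input: String): String = {
    val json = parse(input)
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) s"Relation:${(json \ "relClass").extract[String]}"
    else {
      val classnames = for {
        JObject(instance) <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname)) <- instancetype
      } yield classname
      // get_type uses l.head; headOption is used here only to keep the sketch total.
      classnames.headOption.getOrElse("Unknown")
    }
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical payloads, shaped like the records the job is expected to read.
    val relation = """{"source":"50|doi_a","target":"50|doi_b","relClass":"Cites"}"""
    val publication = """{"instance":[{"instancetype":{"classid":"0001","classname":"Article"}}]}"""
    println(classify(relation))    // Relation:Cites
    println(classify(publication)) // Article
  }
}

With json4s-jackson on the classpath this should print "Relation:Cites" and "Article", i.e. the kind of label that show_typologies then groups, counts and orders per typology.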