diff --git a/pom.xml b/pom.xml
index 39b659b..96a28b0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.sandro</groupId>
-    <artifactId>ZeppelinNotebook</artifactId>
+    <artifactId>dhp-explorer</artifactId>
     <version>1.0-SNAPSHOT</version>
diff --git a/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala b/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala
new file mode 100644
index 0000000..8539e74
--- /dev/null
+++ b/src/main/java/eu/dnetlib/scholix/CheckMDStoreContent.scala
@@ -0,0 +1,64 @@
+package eu.dnetlib.scholix
+
+import com.sandro.app.AbstractScalaApplication
+import org.apache.spark.sql.SparkSession
+import org.slf4j.{Logger, LoggerFactory}
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.{JField, JObject, JString}
+import org.json4s.jackson.JsonMethods.parse
+import org.apache.spark.sql.functions.{count, desc}
+
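+/** Spark application that inspects the content of an MDStore dump:
+  * it reads the records at the given path as JSON text and prints
+  * the distribution of record typologies (relation class or instance type).
+  */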
+class CheckMDStoreContent(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {
+
+
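+  /** Derives the typology of a single JSON record:
+    * a record carrying a "source" field is treated as a relation and labelled "Relation:<relClass>",
+    * any other record is classified by the classname of its first instance type.
+    */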
+  def get_type(input: String): String = {
+
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    lazy val json: org.json4s.JValue = parse(input)
+
+    val source = (json \ "source").extractOrElse[String](null)
+    if (source != null) {
+      val rel = (json \ "relClass").extract[String]
+      s"Relation:$rel"
+    } else {
+      val l: List[String] = for {
+        JObject(instance) <- json \\ "instance"
+        JField("instancetype", JObject(instancetype)) <- instance
+        JField("classname", JString(classname)) <- instancetype
+      } yield classname
+      l.head
+    }
+  }
+
+
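+  /** Reads the records at the given path as plain text, maps each record to its typology
+    * and prints the number of records per typology in descending order.
+    */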
+  def show_typologies(spark: SparkSession, path: String): Unit = {
+
+    import spark.implicits._
+    val df = spark.read.text(path).as[String]
+
+    df.map(s => get_type(s))
+      .groupBy("value")
+      .agg(count("value").alias("Total"))
+      .orderBy(desc("Total"))
+      .show(300, truncate = false)
+  }
+
+
+  /** Every Spark application runs this method,
+    * where the whole logic of the Spark node is defined.
+    */
+ override def run(): Unit = {
+ val path = argumentMap("path")
+ log.warn(s"Path is $path")
+ show_typologies(spark, path)
+ }
+}
+
+
+object CheckMDStoreContent {
+
+  val log: Logger = LoggerFactory.getLogger(getClass.getName)
+
+ def main(args: Array[String]): Unit = {
+ new CheckMDStoreContent(args,log).initialize().run()
+ }
+}
diff --git a/src/main/java/eu/dnetlib/scholix/CheckRelation.scala b/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
index b5963a3..89943ee 100644
--- a/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
+++ b/src/main/java/eu/dnetlib/scholix/CheckRelation.scala
@@ -6,6 +6,7 @@ import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.scholix.CheckRelation.logger
+import org.apache.spark.sql.functions.{count, desc}
class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
@@ -55,13 +56,15 @@ class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaAppl
- val total_rels_from_scholexplorer = df.map(s=> mapper.readValue(s, classOf[Relation]))
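+    // instead of logging a single count, show the distribution of the relations
+    // used by Scholexplorer grouped by subRelType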
+    df.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => filterRelations(r))
- .count()
+ .map(r => r.getSubRelType).as[String].groupBy("value")
+ .agg(count("value").alias("Total"))
+ .orderBy(desc("Total"))
+ .show(300, truncate = false)
- logger.warn(s"Relation used by Scholexplorer $total_rels_from_scholexplorer/$not_del_rel")
}