package eu.dnetlib.graph.raw

import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.doiboost.crossref.CrossrefUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{count, desc}
import org.slf4j.{Logger, LoggerFactory}

class CheckOpenAireFailure(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Counts publication records per collectedFrom value (as extracted by
    * CrossrefUtils.extractCF) in the two graph base paths and prints both breakdowns.
    */
  def count_collectedFromByEntity(spark: SparkSession, base_path_1: String, base_path_2: String): Unit = {
    import spark.implicits._

    // val l_types = List("dataset", "datasource", "organization", "otherresearchproduct", "project", "publication", "software")

    println(s"Publication in $base_path_1")
    spark.read
      .text(s"$base_path_1/publication")
      .as[String]
      .flatMap(s => CrossrefUtils.extractCF(s))
      .groupBy("_1")
      .agg(count("_2").alias("Total"))
      .orderBy(desc("Total"))
      .show(100, false)

    println(s"Publication in $base_path_2")
    spark.read
      .text(s"$base_path_2/publication")
      .as[String]
      .flatMap(s => CrossrefUtils.extractCF(s))
      .groupBy("_1")
      .agg(count("_2").alias("Total"))
      .orderBy(desc("Total"))
      .show(100, false)
  }

  /** Every Spark application runs this method,
    * where the whole logic of the Spark node is defined.
    */
  override def run(): Unit = {
    count_collectedFromByEntity(
      spark,
      "/tmp/beta_provision/graph/00_prod_graph_aggregator",
      "/tmp/prod_provision/graph/00_graph_aggregator"
    )
  }
}

object CheckOpenAireFailure {

  val log: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    new CheckOpenAireFailure(args, log).initialize().run()
  }
}