DHP-Explorer/src/main/java/eu/dnetlib/graph/raw/CheckOpenAireFailure.scala

package eu.dnetlib.graph.raw
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.doiboost.crossref.CrossrefUtils
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.{Dataset, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions.{count, desc}
class CheckOpenAireFailure(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Counts publication records per collectedfrom source in two graph dumps and prints both aggregations for comparison. */
  def count_collectedFromByEntity(spark: SparkSession, base_path_1: String, base_path_2: String): Unit = {
    import spark.implicits._
    // val l_types = List("dataset", "datasource", "organization", "otherresearchproduct", "project", "publication", "software")
    println(s"Publication in $base_path_1")
    spark.read.text(s"$base_path_1/publication").as[String]
      .flatMap(s => CrossrefUtils.extractCF(s))
      .groupBy("_1").agg(count("_2").alias("Total")).orderBy(desc("Total"))
      .show(100, false)
    println(s"Publication in $base_path_2")
    spark.read.text(s"$base_path_2/publication").as[String]
      .flatMap(s => CrossrefUtils.extractCF(s))
      .groupBy("_1").agg(count("_2").alias("Total")).orderBy(desc("Total"))
      .show(100, false)
  }
  /** All Spark applications run this method; the whole logic of the Spark node is defined here. */
  override def run(): Unit = {
    // Paths of the beta and production aggregator graphs to compare
    count_collectedFromByEntity(
      spark,
      "/tmp/beta_provision/graph/00_prod_graph_aggregator",
      "/tmp/prod_provision/graph/00_graph_aggregator"
    )
  }
}
object CheckOpenAireFailure {

  val log: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    new CheckOpenAireFailure(args, log).initialize().run()
  }
}
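
/* Note: CrossrefUtils.extractCF is defined elsewhere (eu.dnetlib.doiboost.crossref). From its usage
 * above it is assumed to parse a JSON-encoded graph record and return its collectedfrom entries as
 * (name, id) tuples, so that groupBy("_1") aggregates by datasource name. A minimal hypothetical
 * sketch using json4s follows; the field paths and helper name are assumptions, not the actual
 * implementation:
 *
 *   import org.json4s.jackson.JsonMethods.parse
 *   import org.json4s.{DefaultFormats, JValue}
 *
 *   def extractCF(record: String): List[(String, String)] = {
 *     implicit val formats: DefaultFormats.type = DefaultFormats
 *     val json: JValue = parse(record)
 *     // assume each record carries a "collectedfrom" array of {key, value} pairs,
 *     // where key is the datasource id and value is the datasource name
 *     (json \ "collectedfrom").extractOrElse[List[Map[String, String]]](Nil)
 *       .map(cf => (cf.getOrElse("value", ""), cf.getOrElse("key", "")))
 *   }
 */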