46 lines
1.6 KiB
Scala
46 lines
1.6 KiB
Scala
package eu.dnetlib.graph.raw
|
|
|
|
import com.sandro.app.AbstractScalaApplication
|
|
import eu.dnetlib.doiboost.crossref.CrossrefUtils
|
|
import org.apache.commons.cli.MissingArgumentException
|
|
import org.apache.spark.sql.{Dataset, SparkSession}
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
import org.apache.spark.sql.functions.{count, desc}
|
|
|
|
/** Diagnostic Spark application that prints, for two graph locations, how many
  * records `CrossrefUtils.extractCF` yields per distinct key in the
  * `publication` folder, so the two locations can be compared by eye.
  *
  * @param args command-line arguments, passed on to `AbstractScalaApplication`
  * @param log  logger, passed on to `AbstractScalaApplication`
  */
class CheckOpenAireFailure(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Prints the per-key publication counts for two graph locations, one after the other.
    *
    * Only the `publication` sub-folder is inspected; the other entity types are not read.
    * NOTE(review): judging by the method name, the grouping key is presumably the
    * `collectedFrom` value produced by `CrossrefUtils.extractCF` — confirm against that helper.
    *
    * @param spark       active Spark session used to read the data
    * @param base_path_1 root folder of the first graph; `publication` is appended to it
    * @param base_path_2 root folder of the second graph; `publication` is appended to it
    */
  def count_collectedFromByEntity(spark: SparkSession, base_path_1: String, base_path_2: String): Unit = {
    // val l_types = List("dataset", "datasource","organization","otherresearchproduct","project","publication","software")
    showPublicationCounts(spark, base_path_1)
    showPublicationCounts(spark, base_path_2)
  }

  /** Reads `<basePath>/publication` as text lines, expands every line with
    * `CrossrefUtils.extractCF`, groups the result by its `_1` column and prints
    * the count of `_2` per group, largest first (up to 100 rows, cells not truncated).
    *
    * @param spark    active Spark session used to read the data
    * @param basePath root folder that contains the `publication` sub-folder
    */
  private def showPublicationCounts(spark: SparkSession, basePath: String): Unit = {
    import spark.implicits._

    // One name for both the alias and the sort key: the original aliased "Total" but
    // sorted by "total", which only resolves while spark.sql.caseSensitive is false.
    val totalColumn = "Total"

    println(s"Publication in $basePath")
    spark.read
      .text(s"$basePath/publication")
      .as[String]
      .flatMap(s => CrossrefUtils.extractCF(s))
      .groupBy("_1")
      .agg(count("_2").alias(totalColumn))
      .orderBy(desc(totalColumn))
      .show(100, truncate = false)
  }

  /** Entry point of the application logic: compares the publication counts of
    * the two hard-coded graph locations below.
    */
  override def run(): Unit = {
    count_collectedFromByEntity(
      spark,
      "/tmp/beta_provision/graph/00_prod_graph_aggregator",
      "/tmp/prod_provision/graph/00_graph_aggregator"
    )
  }
}
|
|
|
|
|
|
/** Command-line launcher for the `CheckOpenAireFailure` application. */
object CheckOpenAireFailure {

  /** Logger handed to the application instance created by `main`. */
  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Creates the application from the raw arguments, initializes it and runs it.
    *
    * @param args command-line arguments, passed unchanged to the application
    */
  def main(args: Array[String]): Unit = {
    val application = new CheckOpenAireFailure(args, log)
    application.initialize().run()
  }
}
|