package eu.dnetlib.graph.raw

import com.sandro.app.AbstractScalaApplication
import com.sandro.app.fs.OAFInfo
import eu.dnetlib.scholix.{DHPUtils, Measurement}
import org.apache.commons.cli.MissingArgumentException
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory}

class GenerateMDStoreStats(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {

  /** Counts the hostedBy entries that point to a known repository,
    * ignoring the "unknown repository" placeholder.
    * Returns 0 for a null or empty list.
    */
  def hasHostedBy(o: List[String]): Int = {
    if (o == null || o.isEmpty) 0
    else o.count(h => !h.equalsIgnoreCase("unknown repository"))
  }

  /** Computes the per-datasource measurements from the stats database
    * at statsDBPath and writes them to targetPath:
    *  - total number of items
    *  - number of items related to a project
    *  - number of hostedBy entries pointing to a known repository
    */
  def generateMDStoreStats(spark: SparkSession, statsDBPath: String, ts: Long, targetPath: String): Unit = {
    import spark.implicits._
    val df: Dataset[OAFInfo] = spark.read.load(statsDBPath).as[OAFInfo]

    log.info("Generating Total Item measurement")
    df.groupBy("datasourcePrefix")
      .agg(count("datasourcePrefix").as("Total"))
      .map(r =>
        Measurement(
          name = "Total Item",
          nsprefix = r.getAs[String]("datasourcePrefix"),
          timestamp = ts,
          value = r.getAs[Long]("Total")
        )
      )
      .write.mode(SaveMode.Overwrite).save(targetPath)

    log.info("Generating Total Item related to a project measurement")
    // count("projectid") only counts rows where projectid is not null,
    // i.e. items that are related to a project
    df.groupBy("datasourcePrefix")
      .agg(count("projectid").alias("ItemWithProject"))
      .map(r =>
        Measurement(
          name = "Project Relation count",
          nsprefix = r.getAs[String]("datasourcePrefix"),
          timestamp = ts,
          value = r.getAs[Long]("ItemWithProject")
        )
      )
      .write.mode(SaveMode.Append).save(targetPath)

    log.info("Generating Hosted By Record count measurement")
    df.map(o => (o.datasourcePrefix, hasHostedBy(o.hostedBy)))
      .groupBy("_1")
      .agg(sum("_2").as("hostedByCount"))
      .map(r =>
        Measurement(
          name = "Hosted By Record count",
          nsprefix = r.getString(0),
          timestamp = ts,
          value = r.getAs[Long]("hostedByCount")
        )
      )
      .write.mode(SaveMode.Append).save(targetPath)
  }

  /** Parses the mdstore sequence files under basePath into OAFInfo
    * records and saves them as the stats database at statsDBPath.
    */
  def generateInfoOaf(spark: SparkSession, basePath: String, statsDBPath: String): Unit = {
    val sc = spark.sparkContext
    import spark.implicits._
    log.info(s"base Path is $basePath")
    val mdstores: RDD[OAFInfo] = sc
      .sequenceFile(basePath, classOf[Text], classOf[Text])
      .map(x => x._2.toString)
      .map(x => DHPUtils.convertTOOAFStat(x))
    val df: Dataset[OAFInfo] = spark.createDataset(mdstores)
    df.write.mode(SaveMode.Overwrite).save(statsDBPath)
  }

  /** Entry point of the Spark application: the whole logic of the
    * Spark node is defined here.
    */
  override def run(): Unit = {
    val path: String = argumentMap.get("path").orNull
    if (path == null) throw new MissingArgumentException("Missing argument path")
    log.info(s"base Path is $path")
    generateInfoOaf(spark, path, "/user/sandro.labruzzo/prod_for_beta_stats")
    generateMDStoreStats(
      spark,
      "/user/sandro.labruzzo/prod_for_beta_stats",
      System.currentTimeMillis(),
      "/user/sandro.labruzzo/prod_for_beta_mesaurement"
    )
    spark.close()
  }
}

object GenerateMDStoreStats {
  val log: Logger = LoggerFactory.getLogger(GenerateMDStoreStats.getClass)

  def main(args: Array[String]): Unit = {
    new GenerateMDStoreStats(args, log).initialize().run()
  }
}
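
/*
 * A minimal usage sketch. The spark-submit invocation, the jar name and the
 * "--path" flag syntax are assumptions (the exact argument format depends on
 * how AbstractScalaApplication populates argumentMap); the class name and the
 * "path" argument come from the code above:
 *
 *   spark-submit --class eu.dnetlib.graph.raw.GenerateMDStoreStats \
 *     mdstore-stats.jar --path hdfs:///path/to/mdstore/sequence/files
 *
 * The measurements written by generateMDStoreStats can then be read back,
 * assuming the Measurement fields used above (name, nsprefix, timestamp, value):
 *
 *   import spark.implicits._
 *   val measurements = spark.read.load(targetPath).as[Measurement]
 *   measurements.filter($"name" === "Total Item").show()
 */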