DHP-Explorer/src/main/java/eu/dnetlib/graph/raw/GenerateMDStoreStats.scala

package eu.dnetlib.graph.raw

import com.sandro.app.AbstractScalaApplication
import com.sandro.app.fs.{OAFInfo, OafStat}
import eu.dnetlib.scholix.{DHPUtils, Measurement}
import org.apache.commons.cli.MissingArgumentException
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory}

class GenerateMDStoreStats(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {
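
  /** Counts how many hostedBy entries of a record reference a concrete repository,
    * i.e. anything other than the "unknown repository" placeholder.
    * Returns 0 for a null or empty list.
    */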
  def hasHostedBy(o: List[String]): Int = {
    if (o == null || o.isEmpty)
      0
    else
      o.count(h => !h.equalsIgnoreCase("unknown repository"))
  }
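
  /** Computes per-datasource measurements from the stats Dataset and writes them to targetPath:
    *   - "Total Item": number of records per datasource prefix
    *   - "Project Relation count": number of records with a non-null projectid
    *   - "Hosted By Record count": total hostedBy entries referencing a concrete repository
    *
    * Assumes statsDBPath holds the Dataset[OAFInfo] previously written by
    * generateInfoOaf (Spark's default parquet format).
    */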
  def generateMDStoreStats(spark: SparkSession, statsDBPath: String, ts: Long, targetPath: String): Unit = {
    import spark.implicits._
    val df: Dataset[OAFInfo] = spark.read.load(statsDBPath).as[OAFInfo]

    log.info("Generating Total Item measurement")
    df.groupBy("datasourcePrefix")
      .agg(count("datasourcePrefix").as("Total"))
      .map(r => Measurement(name = "Total Item", nsprefix = r.getAs[String]("datasourcePrefix"), timestamp = ts, value = r.getAs[Long]("Total")))
      .write.mode(SaveMode.Overwrite).save(targetPath)

    log.info("Generating Total Item related to a project measurement")
    df.groupBy("datasourcePrefix")
      .agg(count("projectid").alias("ItemWithProject"))
      .map(r => Measurement(name = "Project Relation count", nsprefix = r.getAs[String]("datasourcePrefix"), timestamp = ts, value = r.getAs[Long]("ItemWithProject")))
      .write.mode(SaveMode.Append).save(targetPath)
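
    // For each record, count how many of its hostedBy entries reference a concrete
    // repository, then sum those counts per datasource prefix.
    log.info("Generating Hosted By measurement")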
    df.map(o => (o.datasourcePrefix, hasHostedBy(o.hostedBy)))
      .groupBy("_1")
      .agg(sum("_2"))
      .map(r => Measurement(name = "Hosted By Record count", nsprefix = r.getString(0), timestamp = ts, value = r.getAs[Long](1)))
      .write.mode(SaveMode.Append).save(targetPath)
  }
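
  /** Reads the raw mdstore sequence files under basePath, converts each record to an
    * OAFInfo bean via DHPUtils.convertTOOAFStat, and saves the resulting Dataset to statsDBPath.
    */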
  def generateInfoOaf(spark: SparkSession, basePath: String, statsDBPath: String): Unit = {
    val sc = spark.sparkContext
    import spark.implicits._
    log.info(s"base Path is $basePath")
    val mdstores: RDD[OAFInfo] = sc
      .sequenceFile(basePath, classOf[Text], classOf[Text])
      .map(x => x._2.toString)
      .map(x => DHPUtils.convertTOOAFStat(x))
    val df: Dataset[OAFInfo] = spark.createDataset(mdstores)
    df.write.mode(SaveMode.Overwrite).save(statsDBPath)
  }

  /** Entry point executed by the Spark application:
    * the whole logic of the Spark job is defined here.
    */
  override def run(): Unit = {
    val path: String = argumentMap.get("path").orNull
    if (path == null) throw new MissingArgumentException("Missing argument path")
    log.info(s"base Path is $path")
    val statsDBPath = "/user/sandro.labruzzo/prod_for_beta_stats"
    generateInfoOaf(spark, path, statsDBPath)
    generateMDStoreStats(spark, statsDBPath, System.currentTimeMillis(), "/user/sandro.labruzzo/prod_for_beta_mesaurement")
    spark.close()
  }
}
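
/** Companion object providing the JVM entry point.
  *
  * A minimal invocation sketch, assuming AbstractScalaApplication parses
  * arguments of the form `--path <value>` into argumentMap (the jar name
  * below is hypothetical):
  *
  *   spark-submit --class eu.dnetlib.graph.raw.GenerateMDStoreStats \
  *     dhp-explorer.jar --path /sequence/files/of/the/mdstores
  */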
object GenerateMDStoreStats {

  val log: Logger = LoggerFactory.getLogger(GenerateMDStoreStats.getClass)

  def main(args: Array[String]): Unit = {
    new GenerateMDStoreStats(args, log).initialize().run()
  }
}