// GenerateMDStoreStats: Spark job that derives per-datasource measurements
// (total items, project relations, hosted-by counts) from MDStore records.
package eu.dnetlib.graph.raw

import com.sandro.app.AbstractScalaApplication
import com.sandro.app.fs.{OAFInfo, OafStat}
import eu.dnetlib.scholix.{DHPUtils, Measurement}
import org.apache.commons.cli.MissingArgumentException
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory}

/** Spark application that computes per-datasource measurements from MDStore
  * records: total item count, items with a project relation, and hosted-by
  * record count.
  *
  * @param args command-line arguments; must contain a "path" entry
  * @param log  logger supplied by the entry point
  */
class GenerateMDStoreStats(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args, log) {
  // NOTE(fix): the original wrote `extends AbstractScalaApplication( args: Array[String], log: Logger)`,
  // passing typed parameter *declarations* as constructor arguments — not valid Scala.

  // Default HDFS locations, used when the corresponding arguments are absent.
  private val DefaultStatsDBPath = "/user/sandro.labruzzo/prod_for_beta_stats"
  private val DefaultTargetPath  = "/user/sandro.labruzzo/prod_for_beta_mesaurement"

  /** Counts hosted-by entries that are not the "unknown repository" placeholder.
    *
    * @param o list of hosted-by names; may be null (Java interop) or empty
    * @return number of entries whose value is not "unknown repository"
    *         (case-insensitive); 0 for null/empty input
    */
  def hasHostedBy(o: List[String]): Int =
    if (o == null || o.isEmpty) 0
    else o.count(h => !h.equalsIgnoreCase("unknown repository"))

  /** Reads the stats dataset and writes three Measurement datasets to targetPath.
    *
    * The first write overwrites targetPath; the following two append to it, so
    * the output contains all three measurement families for the same timestamp.
    *
    * @param spark       active Spark session
    * @param statsDBPath input path of the OAFInfo dataset
    * @param ts          timestamp stamped on every generated Measurement
    * @param targetPath  output path for the Measurement dataset
    */
  def generateMDStoreStats(spark: SparkSession, statsDBPath: String, ts: Long, targetPath: String): Unit = {
    import spark.implicits._
    val df: Dataset[OAFInfo] = spark.read.load(statsDBPath).as[OAFInfo]

    // NOTE(fix): these are progress messages, not failures — use info, not error.
    log.info("Generating Total Item measurement")
    df.groupBy("datasourcePrefix")
      .agg(count("datasourcePrefix").as("Total"))
      .map(r =>
        Measurement(
          name = "Total Item",
          nsprefix = r.getAs[String]("datasourcePrefix"),
          timestamp = ts,
          value = r.getAs[Long]("Total")
        )
      )
      .write.mode(SaveMode.Overwrite).save(targetPath)

    log.info("Generating Total Item related to a project measurement")
    // count("projectid") counts only non-null project ids per datasource prefix.
    df.groupBy("datasourcePrefix")
      .agg(count("projectid").alias("ItemWithProject"))
      .map(r =>
        Measurement(
          name = "Project Relation count",
          nsprefix = r.getAs[String]("datasourcePrefix"),
          timestamp = ts,
          value = r.getAs[Long]("ItemWithProject")
        )
      )
      .write.mode(SaveMode.Append).save(targetPath)

    // Sum, per datasource prefix, of hosted-by entries excluding "unknown repository".
    df.map(o => (o.datasourcePrefix, hasHostedBy(o.hostedBy)))
      .groupBy("_1")
      .agg(sum("_2"))
      .map(r =>
        Measurement(
          name = "Hosted By Record count",
          nsprefix = r.getString(0),
          timestamp = ts,
          value = r.getAs[Long](1)
        )
      )
      .write.mode(SaveMode.Append).save(targetPath)
  }

  /** Parses the raw MDStore sequence files under basePath into OAFInfo records
    * and persists them as a parquet dataset at statsDBPath (overwriting).
    *
    * @param spark       active Spark session
    * @param basePath    input path of Hadoop sequence files (Text, Text)
    * @param statsDBPath output path of the OAFInfo dataset
    */
  def generateInfoOaf(spark: SparkSession, basePath: String, statsDBPath: String): Unit = {
    val sc = spark.sparkContext
    import spark.implicits._

    log.info(s"base Path is $basePath")
    // Values of the sequence file are the serialized records; keys are ignored.
    val mdstores: RDD[OAFInfo] = sc
      .sequenceFile(basePath, classOf[Text], classOf[Text])
      .map(_._2.toString)
      .map(DHPUtils.convertTOOAFStat)
    val df: Dataset[OAFInfo] = spark.createDataset(mdstores)
    df.write.mode(SaveMode.Overwrite).save(statsDBPath)
  }

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined.
    *
    * Required argument: "path" (input sequence files).
    * Optional arguments: "statsDBPath" and "targetPath" override the default
    * intermediate and output locations (backward compatible — defaults match
    * the previously hard-coded paths).
    *
    * @throws MissingArgumentException if "path" is not provided
    */
  override def run(): Unit = {
    val path: String = argumentMap.get("path").orNull
    if (path == null) throw new MissingArgumentException("Missing argument path")

    // NOTE(fix): generalized — paths are now overridable instead of hard-coded.
    val statsDBPath = argumentMap.get("statsDBPath").getOrElse(DefaultStatsDBPath)
    val targetPath  = argumentMap.get("targetPath").getOrElse(DefaultTargetPath)

    log.info(s"base Path is $path")
    generateInfoOaf(spark, path, statsDBPath)
    generateMDStoreStats(spark, statsDBPath, System.currentTimeMillis(), targetPath)
    spark.close()
  }
}
/** Companion entry point for [[GenerateMDStoreStats]]. */
object GenerateMDStoreStats {

  /** Logger shared with the application instance; named after the companion's class. */
  val log: Logger = LoggerFactory.getLogger(GenerateMDStoreStats.getClass)

  /** JVM entry point: constructs the application, initializes it, and runs the job.
    *
    * @param args command-line arguments forwarded to the application
    */
  def main(args: Array[String]): Unit = {
    val app = new GenerateMDStoreStats(args, log)
    app.initialize().run()
  }
}