DHP-Explorer/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefStatJob.scala

49 lines
1.3 KiB
Scala

package eu.dnetlib.doiboost.crossref
import com.sandro.app.AbstractScalaApplication
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods
import org.slf4j.{Logger, LoggerFactory}
/** Spark job that prints, for a text dump read from `path`, how many records
  * fall under each distinct pair produced by `CrossrefUtils.extractTypeSubtype`.
  *
  * @param args raw command-line arguments, forwarded to [[AbstractScalaApplication]]
  * @param log  logger forwarded to [[AbstractScalaApplication]]
  */
class CrossrefStatJob(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Reads `path` as text, maps every line through
    * `CrossrefUtils.extractTypeSubtype`, groups the results by their two
    * components (columns `_1` and `_2`) and prints the per-group totals.
    *
    * At most 200 groups are printed and cell values are not truncated.
    *
    * @param spark active session used to read the input and resolve encoders
    * @param path  location of the text input, one record per line
    */
  def extractTypologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._

    // NOTE(review): count("_1") counts only rows whose `_1` is non-null, so a
    // group whose first component is null would be reported with total 0.
    // Presumably extractTypeSubtype never yields a null first component --
    // confirm against CrossrefUtils.
    spark.read
      .text(path)
      .as[String]
      .map(s => CrossrefUtils.extractTypeSubtype(s))
      .groupBy("_1", "_2")
      .agg(count("_1").alias("total"))
      .show(200, false)
  }

  /** Entry point of the job's logic: resolves the mandatory `path` argument
    * and prints the type/subtype statistics for it.
    *
    * `argumentMap` and `spark` are not defined in this class; presumably they
    * are provided by [[AbstractScalaApplication]].
    *
    * @throws MissingArgumentException if no `path` argument was supplied
    */
  override def run(): Unit = {
    val path: String = argumentMap.get("path").orNull
    if (path == null) throw new MissingArgumentException("Missing argument path")
    extractTypologies(spark, path)
  }
}
/** Companion object holding the command-line entry point of [[CrossrefStatJob]]. */
object CrossrefStatJob {

  /** Logger handed to every job instance created by `main`. */
  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Builds the job from the command-line arguments, initializes it and runs it.
    *
    * @param args raw command-line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new CrossrefStatJob(args, log)
    // run() is invoked on whatever initialize() returns, exactly as a chained call would.
    val initialized = job.initialize()
    initialized.run()
  }
}