package eu.dnetlib.doiboost.crossref

import com.sandro.app.AbstractScalaApplication
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods
import org.slf4j.{Logger, LoggerFactory}

/** Spark job that prints how many input records fall under each value pair
  * produced by `CrossrefUtils.extractTypeSubtype`.
  *
  * @param args command-line arguments, forwarded to [[AbstractScalaApplication]]
  * @param log  logger, forwarded to [[AbstractScalaApplication]]
  */
class CrossrefStatJob(args: Array[String], log: Logger)
    extends AbstractScalaApplication(args: Array[String], log: Logger) {

  /** Reads the text input at `path`, maps every line through
    * `CrossrefUtils.extractTypeSubtype`, and prints the number of lines per
    * distinct (`_1`, `_2`) combination.
    *
    * NOTE(review): the grouping on columns `_1` and `_2` implies that
    * `extractTypeSubtype` returns a pair, presumably (type, subtype) — confirm
    * against `CrossrefUtils`.
    *
    * @param spark active Spark session used to read the input
    * @param path  location of the text input, one record per line
    */
  def extractTypologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._

    spark.read
      .text(path)
      .as[String]
      .map(s => CrossrefUtils.extractTypeSubtype(s))
      .groupBy("_1", "_2")
      .agg(count("_1").alias("total"))
      // Print up to 200 rows; `false` disables truncation of long cell values.
      .show(200, false)
  }

  /** Entry point of the job logic: resolves the mandatory `path` argument and
    * prints the typology statistics for the input found there.
    *
    * @throws MissingArgumentException if the `path` argument is absent or null
    */
  override def run(): Unit = {
    val path: String = argumentMap
      .get("path")
      // Treat an explicitly null value the same as a missing key.
      .flatMap(Option(_))
      .getOrElse(throw new MissingArgumentException("Missing argument path"))

    extractTypologies(spark, path)
  }

}

object CrossrefStatJob {

  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Builds the job from the command-line arguments, initializes it and runs it.
    *
    * @param args command-line arguments
    */
  def main(args: Array[String]): Unit = {
    new CrossrefStatJob(args = args, log = log).initialize().run()
  }

}