49 lines
1.3 KiB
Scala
49 lines
1.3 KiB
Scala
package eu.dnetlib.doiboost.crossref
|
|
|
|
import com.sandro.app.AbstractScalaApplication
|
|
import org.apache.commons.cli.MissingArgumentException
|
|
import org.apache.spark.sql.SparkSession
|
|
import org.apache.spark.sql.functions.count
|
|
import org.json4s
|
|
import org.json4s.DefaultFormats
|
|
import org.json4s.jackson.JsonMethods
|
|
import org.slf4j.{Logger, LoggerFactory}
|
|
|
|
/** Spark application that prints a frequency table of the values that
  * `CrossrefUtils.extractTypeSubtype` yields for each line of a text input.
  *
  * @param args command-line arguments, forwarded unchanged to [[AbstractScalaApplication]]
  * @param log  logger, forwarded unchanged to [[AbstractScalaApplication]]
  */
class CrossrefStatJob(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Reads `path` as text (one record per line), applies
    * `CrossrefUtils.extractTypeSubtype` to every line, counts the rows per
    * distinct (`_1`, `_2`) combination and prints the result to standard output.
    *
    * At most 200 rows are printed and cell contents are not truncated.
    *
    * @param spark active Spark session used to read the input
    * @param path  location of the text input to analyse
    */
  def extractTypologies(spark: SparkSession, path: String): Unit = {
    import spark.implicits._

    // A single read feeds the aggregation; no intermediate Dataset is kept,
    // because nothing else in this method consumes one.
    spark.read
      .text(path)
      .as[String]
      // NOTE(review): extractTypeSubtype presumably returns a pair, since the
      // grouping below addresses the tuple columns `_1` and `_2` — confirm.
      .map(line => CrossrefUtils.extractTypeSubtype(line))
      .groupBy("_1", "_2")
      .agg(count("_1").alias("total"))
      .show(200, false)
  }

  /** Entry point of the job: resolves the mandatory `path` argument and
    * prints the type/subtype statistics for it.
    *
    * @throws org.apache.commons.cli.MissingArgumentException
    *         if the `path` argument is absent (or mapped to `null`)
    */
  override def run(): Unit = {
    val path: String = argumentMap
      .get("path")
      // A key that is present but mapped to null counts as missing as well.
      .flatMap(Option(_))
      .getOrElse(throw new MissingArgumentException("Missing argument path"))

    extractTypologies(spark, path)
  }
}
|
|
|
|
|
|
|
|
/** Companion object holding the shared logger and the JVM entry point. */
object CrossrefStatJob {

  /** Logger handed to every job instance created by [[main]]. */
  val log: Logger = LoggerFactory.getLogger(getClass)

  /** Builds the job from the raw command-line arguments, initializes it and runs it.
    *
    * @param args command-line arguments, passed to the job as-is
    */
  def main(args: Array[String]): Unit = {
    val job = new CrossrefStatJob(args, log)
    // run() is invoked on whatever initialize() returns, not on `job` directly.
    job.initialize().run()
  }
}
|