package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import eu.dnetlib.dhp.schema.oaf.Relation

/**
 * Diagnostic Spark job: reads newline-delimited JSON [[Relation]] records and
 * logs (a) the total number of relations and (b) how many are NOT flagged as
 * deleted by inference.
 */
object CheckRelation {

  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /**
   * Counts the relations stored at `path` and logs the totals at WARN level.
   *
   * @param path  input path readable by `spark.read.text` (one JSON Relation per line)
   * @param spark active Spark session used to run the job
   */
  def countRelation(path: String, spark: SparkSession): Unit = {
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    // Cache the raw lines: two counts below are separate actions and would
    // otherwise re-read and re-scan the input file twice.
    val df = spark.read.text(path).as[String].cache()
    try {
      // Build one ObjectMapper per partition instead of capturing a single
      // driver-side instance in the closure (cheaper, avoids serializing it).
      val relations = df.mapPartitions { lines =>
        val mapper = new ObjectMapper()
        lines.map(line => mapper.readValue(line, classOf[Relation]))
      }

      // Guard against records with a missing DataInfo, which would NPE;
      // use idiomatic negation instead of `== false`.
      val notDeleted = relations
        .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
        .count()

      logger.warn(s"Total number of relations: ${df.count}")
      logger.warn(s"Total number of relations not deleted by Inference: $notDeleted")
    } finally {
      df.unpersist()
    }
  }

  /**
   * Entry point.
   *
   * @param args args(0) = input path, args(1) = Spark master URL
   */
  def main(args: Array[String]): Unit = {
    // Fail fast with a clear message instead of an ArrayIndexOutOfBoundsException.
    require(args.length >= 2, "usage: CheckRelation <inputPath> <sparkMaster>")
    val path = args(0)
    val master = args(1)

    val conf: SparkConf = new SparkConf()
    val spark = SparkSession
      .builder()
      .appName(CheckRelation.getClass.getSimpleName)
      .config(conf)
      .master(master)
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Always release the session, even if the job fails.
    try countRelation(path, spark)
    finally spark.stop()
  }
}