DHP-Explorer/src/main/java/eu/dnetlib/scholix/CheckRelation.scala

53 lines
1.2 KiB
Scala
Raw Normal View History

2022-10-18 15:08:06 +02:00
package eu.dnetlib.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.schema.oaf.Relation
/**
 * Small Spark utility that reads a text dump of serialized [[Relation]] records
 * and logs two figures: the total number of lines in the dump and the number of
 * relations whose `dataInfo.deletedbyinference` flag is explicitly `false`.
 */
object CheckRelation {

  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /**
   * Counts the relations stored under `path` and logs the results.
   *
   * Every line of the input is parsed with Jackson into a [[Relation]]. The
   * "not deleted by inference" figure only includes relations that carry a
   * `dataInfo` whose `deletedbyinference` flag equals `false`; relations with
   * a missing `dataInfo` (or a missing flag) are left out of that figure
   * instead of aborting the whole job with a `NullPointerException`.
   *
   * @param path  location of the text file(s) to read, one serialized relation per line
   * @param spark active Spark session used to read and process the data
   */
  def countRelation(path: String, spark: SparkSession): Unit = {
    // Relation instances are moved around the cluster with a Kryo-based encoder.
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val df = spark.read.text(path).as[String]
    val mapper = new ObjectMapper()

    val scholix_rel = df
      .map(s => mapper.readValue(s, classOf[Relation]))
      // Option(...) guards against relations serialized without a dataInfo section.
      .filter(r => Option(r.getDataInfo).exists(_.getDeletedbyinference == false))
      .count()

    // Both figures are logged at WARN level; main() sets the Spark log level
    // to WARN, presumably so these lines stay visible in the job output.
    logger.warn(s"Total number of relations: ${df.count}")
    logger.warn(s"Total number of relations not deleted by Inference: ${scholix_rel}")
  }

  /**
   * Entry point.
   *
   * @param args `args(0)` is the input path, `args(1)` is the Spark master URL
   * @throws IllegalArgumentException if fewer than two arguments are supplied
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2, "Usage: CheckRelation <path> <master>")
    val path = args(0)
    val master = args(1)

    val conf: SparkConf = new SparkConf()
    val spark = SparkSession.builder().config(conf).master(master).getOrCreate()
    try {
      spark.sparkContext.setLogLevel("WARN")
      countRelation(path, spark)
    } finally {
      // Always release the Spark context, even when the count fails.
      spark.stop()
    }
  }
}