53 lines
1.2 KiB
Scala
53 lines
1.2 KiB
Scala
|
package eu.dnetlib.scholix
|
||
|
|
||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||
|
import org.apache.spark.SparkConf
|
||
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||
|
import org.slf4j.{Logger, LoggerFactory}
|
||
|
import eu.dnetlib.dhp.schema.oaf.Relation
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/** Small Spark utility that reports how many relations a dump contains and how
  * many of them are not flagged as deleted by inference.
  *
  * The input is read as plain text; every line is parsed with Jackson into a
  * [[eu.dnetlib.dhp.schema.oaf.Relation]].
  */
object CheckRelation {

  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /** Counts the relations stored at `path` and logs two figures at WARN level:
    * the total number of input lines and the number of relations that are not
    * marked as deleted by inference.
    *
    * A relation without a `dataInfo` section carries no deletion flag, so it is
    * counted as "not deleted" instead of failing the job with a
    * NullPointerException.
    *
    * @param path  location of the text file(s) to read, one JSON-serialized relation per line
    * @param spark active Spark session used to read and process the input
    */
  def countRelation(path: String, spark: SparkSession): Unit = {
    // Kryo encoder: Relation is a Java bean for which no implicit encoder is derived.
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val lines = spark.read.text(path).as[String]
    val mapper = new ObjectMapper()

    val notDeleted = lines
      .map(json => mapper.readValue(json, classOf[Relation]))
      // Guard against a missing dataInfo before reading the flag.
      .filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
      .count()

    val total = lines.count()

    // WARN is used on purpose: main() lowers the Spark log level to WARN,
    // so these are the only lines guaranteed to show up in the output.
    logger.warn(s"Total number of relations: ${total}")
    logger.warn(s"Total number of relations not deleted by Inference: ${notDeleted}")
  }

  /** Command-line entry point.
    *
    * @param args `args(0)` is the path of the relations to check,
    *             `args(1)` is the Spark master URL
    * @throws IllegalArgumentException if fewer than two arguments are supplied
    */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2, "Usage: CheckRelation <relationsPath> <sparkMaster>")

    val path = args(0)
    val master = args(1)
    val conf: SparkConf = new SparkConf()
    val spark = SparkSession.builder().config(conf).master(master).getOrCreate()

    // Always release the session, even when the count fails.
    try {
      spark.sparkContext.setLogLevel("WARN")
      countRelation(path, spark)
    } finally {
      spark.stop()
    }
  }
}
|