package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.scholix.CheckRelation.logger
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}

/** Spark application that cross-checks the ids referenced by the Scholix
  * relation dataset against the ids of the publication entities, logging how
  * many relation endpoints of type publication actually exist.
  *
  * @param args raw command-line arguments, parsed by [[AbstractScalaApplication]]
  * @param log  logger supplied by the launcher (see companion object)
  */
class CheckSummaries(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  /** Decides whether a relation is relevant for the Scholix graph.
    *
    * Discards:
    *  - dedup/similarity relations (merges, isMergedIn, *AmongTopNSimilarDocuments);
    *  - relations with no collectedfrom provenance;
    *  - relations collected exclusively from OpenCitations.
    *
    * @param r the relation to inspect
    * @return true when the relation should be kept
    */
  def filterRelations(r: Relation): Boolean = {
    // Relation classes produced by deduplication/similarity, never exposed in Scholix.
    val relClassFilter = List(
      "merges",
      "isMergedIn",
      "HasAmongTopNSimilarDocuments",
      "IsAmongTopNSimilarDocuments"
    )
    if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
      false
    else {
      if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
        false // no provenance at all: drop
      else if (r.getCollectedfrom.size() > 1)
        true // multiple sources: at least one is not OpenCitations-only
      else if (r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase(r.getCollectedfrom.get(0).getValue))
        false // collected only from OpenCitations: drop
      else
        true
    }
  }

  /** Extracts a top-level string field from a JSON document.
    *
    * @param input the JSON document as a string
    * @param path  the name of the top-level field to extract (e.g. "source")
    * @return the field value; throws if the field is absent or not a string
    */
  def extractSourceTarget(input: String, path: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    (json \ path).extract[String]
  }

  /** Counts how many distinct publication-type ids ("50|" prefix) referenced by
    * the relation dataset are present among the publication entities, and logs
    * the total.
    *
    * @param basePath root path of the entity datasets
    * @param spark    the active Spark session
    */
  def countSummaries(basePath: String, spark: SparkSession): Unit = {
    // Kryo encoders: the OAF model classes are plain Java beans, not products.
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
    import spark.implicits._

    // TODO(review): relation path is hard-coded and ignores basePath — confirm
    // whether it should be s"$basePath/relation" instead of this fixed location.
    val relPath = s"/tmp/beta_provision/scholix/relation"

    val pubPath = s"$basePath/entities/publication"

    // Kept for reference: typology distribution over the summaries dataset.
    // val ds:Dataset[ScholixSummary] = spark.read.load(path).as[ScholixSummary]
    //
    // ds.map(s => s.getTypology.toString).groupBy("value").agg(count("value").alias("Total")).show(300, truncate = false)

    val df = spark.read.load(relPath).as[Relation]

    // All distinct endpoint ids of type "result" (prefix "50") referenced by relations.
    val totalIDS = df.flatMap(r => List(r.getSource, r.getTarget))
      .filter(s => s.startsWith("50"))
      .distinct()

    val pubId = spark.read.load(pubPath).as[OafEntity].map(o => o.getId).distinct()

    // Inner join on the implicit "value" column of Dataset[String]: keeps only
    // relation endpoints that exist as publication entities.
    val idPubsTotal = pubId.joinWith(totalIDS, pubId("value").equalTo(totalIDS("value")), "inner").count()

    log.warn(s"Total ids in input Relation of type publication $idPubsTotal")
  }

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined
    */
  override def run(): Unit = {
    val path = argumentMap("path")
    // Use the logger injected into this application, not CheckRelation's.
    log.warn(s"path properties is $path")
    if (path == null || path.isEmpty)
      throw new IllegalArgumentException("missing path arguments.properties -path when launch file, check if it is inside the arguments.properties")
    countSummaries(path, spark)
  }

}
|
|
|
|
/** Entry point for the CheckSummaries Spark application. */
object CheckSummaries {

  // Logger named after this class (was mistakenly built from CheckRelation,
  // which attributed every log line to the wrong application).
  val logger: Logger = LoggerFactory.getLogger(CheckSummaries.getClass.getName)

  /** Launches the application: initializes the Spark session, then runs the job. */
  def main(args: Array[String]): Unit = {
    new CheckSummaries(args, logger).initialize().run()
  }

}
|