package eu.dnetlib.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.scholix.CheckRelation.logger
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
/** Spark application that counts the relations usable by Scholexplorer.
  *
  * Reads newline-delimited JSON-serialized [[Relation]] records from a path
  * given via the "path" argument, filters out relations Scholexplorer must
  * not expose, and logs the resulting count.
  *
  * @param args command-line arguments forwarded to the application framework
  * @param log  logger used for progress/diagnostic output
  */
class CheckRelation(args: Array[String], log: Logger) extends AbstractScalaApplication(args, log) {

  // Relation classes that must never reach Scholexplorer: dedup artifacts
  // ("merges"/"isMergedIn") and similarity links. Hoisted to a val so the
  // list is not re-allocated for every record in the Spark filter.
  private val excludedRelClasses = List(
    "merges",
    "isMergedIn",
    "HasAmongTopNSimilarDocuments",
    "IsAmongTopNSimilarDocuments"
  )

  /** Decides whether a relation is usable by Scholexplorer.
    *
    * A relation is rejected when:
    *  - its relClass is one of the excluded classes (case-insensitive), or
    *  - it has no collectedfrom provenance at all, or
    *  - its only collectedfrom source is "OpenCitations".
    * Everything else (including multi-provenance relations) is accepted.
    *
    * @param r the relation to inspect
    * @return true when the relation should be counted, false otherwise
    */
  def filterRelations(r: Relation): Boolean = {
    if (excludedRelClasses.exists(_.equalsIgnoreCase(r.getRelClass)))
      false
    else if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
      false
    else if (
      r.getCollectedfrom.size() == 1 &&
      // literal-first comparison is null-safe if getValue is missing
      "OpenCitations".equalsIgnoreCase(r.getCollectedfrom.get(0).getValue)
    )
      false
    else
      true
  }

  /** Here all the spark applications runs this method
    * where the whole logic of the spark node is defined.
    *
    * Validates the mandatory "path" argument, then delegates to
    * [[countRelation]].
    *
    * @throws IllegalArgumentException when the "path" argument is absent
    */
  override def run(): Unit = {
    val path = argumentMap("path")
    log.warn(s"path properties is $path")
    if (path == null || path.isEmpty)
      throw new IllegalArgumentException(
        "missing path arguments.properties -path when launch file, check if it is inside the arguments.properties"
      )
    countRelation(path, spark)
  }

  /** Counts the relations relevant to Scholexplorer under the given path
    * and logs the total.
    *
    * Pipeline: read text lines -> deserialize each line as a [[Relation]]
    * -> drop relations deleted by inference -> keep only result-to-result
    * links (both identifiers prefixed "50") -> apply [[filterRelations]].
    *
    * @param path  input path of newline-delimited JSON relations
    * @param spark the active Spark session
    */
  def countRelation(path: String, spark: SparkSession): Unit = {
    // Kryo encoder: Relation is a Java bean, not a case class/Product.
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val df = spark.read.text(path).as[String]
    val mapper = new ObjectMapper()

    val total_rels_from_scholexplorer = df
      .map(s => mapper.readValue(s, classOf[Relation]))
      .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
      // null guards prevent an NPE on malformed records; "50" prefix marks
      // result entities, so this keeps only result-to-result links
      .filter(r =>
        r.getSource != null && r.getTarget != null &&
        r.getSource.startsWith("50") && r.getTarget.startsWith("50")
      )
      .filter(r => filterRelations(r))
      .count()

    log.warn(s"Relation used by Scholexplorer $total_rels_from_scholexplorer")
  }
}
|
2022-10-18 15:08:06 +02:00
|
|
|
|
|
|
|
|
2022-10-21 11:01:31 +02:00
|
|
|
/** Companion entry point for the [[CheckRelation]] Spark application. */
object CheckRelation {

  // Shared logger; the expression `CheckRelation.getClass.getName` is kept
  // verbatim so the logger name stays identical to the original.
  val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)

  /** JVM entry point: builds the application, initializes it, and runs it. */
  def main(args: Array[String]): Unit = {
    val application = new CheckRelation(args, logger)
    application.initialize().run()
  }
}
|