Changed the way Scholexplorer filter the relationships, I found that filter all relation coming from openCitation is wrong, because we loose a lot of relation than intersect OpenCitation, but they don't come only from there

This commit is contained in:
Sandro La Bruzzo 2022-10-24 12:53:47 +02:00
parent ec39b84898
commit 2b9a20a4a3
1 changed files with 34 additions and 43 deletions

View File

@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset {
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => filterRelations(subRelTypeFilter, relClassFilter, r))
.filter(r => filterRelations(r))
//filter OpenCitations relations
.filter(r =>
r.getDataInfo.getProvenanceaction != null &&
!"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
)
// .filter(r =>
// r.getDataInfo.getProvenanceaction != null &&
// !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
// )
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
}
private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = {
if (StringUtils.isNotBlank(subRelTypeFilter)) {
subRelTypeFilter.equalsIgnoreCase(r.getSubRelType)
} else {
!relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))
}
}
private def filterRelations(r: Relation): Boolean = {
/*
//TODO: finalise implementation
private def processResult[T<: Result](
implicit ct: ClassTag[T],
log: Logger,
spark: SparkSession,
sourcePath: String,
entityPath: String,
clazz: Class[T]
): Unit = {
val entityType = clazz.getSimpleName.toLowerCase
log.info(s"Converting $entityType")
val mapper = new ObjectMapper() with ScalaObjectMapper
mapper.registerModule(DefaultScalaModule)
val rdd = spark.sparkContext
.textFile(s"$sourcePath/$entityType")
.map(s => mapper.readValue(s, clazz))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference);
implicit val encoder: Encoder[T] = Encoders.kryo(clazz)
spark
.createDataset(rdd)
.as[T]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/$entityType")
}
/** *
* We filter relation generated by dedups
* and all the relation that have one single collectedFrom OpenCitation
*/
val relClassFilter = List(
ModelConstants.MERGES,
ModelConstants.IS_MERGED_IN,
ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
)
if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
false
else {
if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
false
else if (r.getCollectedfrom.size() > 1)
true
else if (
r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase(
r.getCollectedfrom.get(0).getValue
)
)
false
else
true
}
}
}