[stats wf] indicators across stats dbs & updates in the org ids #248

Closed
dimitris.pierrakos wants to merge 1742 commits from beta into beta2master_sept_2022
1 changed file with 34 additions and 43 deletions
Showing only changes of commit 2b9a20a4a3 - Show all commits

View File

@@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset {
.map(s => mapper.readValue(s, classOf[Relation])) .map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference) .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => filterRelations(subRelTypeFilter, relClassFilter, r)) .filter(r => filterRelations(r))
//filter OpenCitations relations //filter OpenCitations relations
.filter(r => // .filter(r =>
r.getDataInfo.getProvenanceaction != null && // r.getDataInfo.getProvenanceaction != null &&
!"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid) // !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
) // )
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} }
private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = { private def filterRelations(r: Relation): Boolean = {
if (StringUtils.isNotBlank(subRelTypeFilter)) {
subRelTypeFilter.equalsIgnoreCase(r.getSubRelType)
} else {
!relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))
}
}
/* /** *
//TODO: finalise implementation * We filter relation generated by dedups
private def processResult[T<: Result]( * and all the relation that have one single collectedFrom OpenCitation
implicit ct: ClassTag[T],
log: Logger,
spark: SparkSession,
sourcePath: String,
entityPath: String,
clazz: Class[T]
): Unit = {
val entityType = clazz.getSimpleName.toLowerCase
log.info(s"Converting $entityType")
val mapper = new ObjectMapper() with ScalaObjectMapper
mapper.registerModule(DefaultScalaModule)
val rdd = spark.sparkContext
.textFile(s"$sourcePath/$entityType")
.map(s => mapper.readValue(s, clazz))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference);
implicit val encoder: Encoder[T] = Encoders.kryo(clazz)
spark
.createDataset(rdd)
.as[T]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/$entityType")
}
*/ */
val relClassFilter = List(
ModelConstants.MERGES,
ModelConstants.IS_MERGED_IN,
ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
)
if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
false
else {
if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
false
else if (r.getCollectedfrom.size() > 1)
true
else if (
r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase(
r.getCollectedfrom.get(0).getValue
)
)
false
else
true
}
}
} }