[scholix] fixed OpenCitations dump procedure

This commit is contained in:
Claudio Atzori 2022-08-10 11:57:56 +02:00
parent d85ba3c1a9
commit 51ad93e545
1 changed file with 24 additions and 35 deletions

View File

@ -2,6 +2,7 @@ package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
@ -99,28 +100,11 @@ object SparkConvertRDDtoDataset {
log.info("Converting Relation") log.info("Converting Relation")
if (filterRelation != null && StringUtils.isNoneBlank(filterRelation)) {
val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations
.filter(r =>
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
"opencitations".equalsIgnoreCase(k.getValue)
)
)
.filter(r => r.getSubRelType != null && r.getSubRelType.equalsIgnoreCase(filterRelation))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} else {
val relationSemanticFilter = List( val relationSemanticFilter = List(
"merges", ModelConstants.MERGES,
"ismergedin", ModelConstants.IS_MERGED_IN,
"HasAmongTopNSimilarDocuments", ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
"IsAmongTopNSimilarDocuments" ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
) )
val rddRelation = spark.sparkContext val rddRelation = spark.sparkContext
@ -130,13 +114,18 @@ object SparkConvertRDDtoDataset {
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50")) .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations //filter OpenCitations relations
.filter(r => .filter(r =>
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => r.getDataInfo.getProvenanceaction != null &&
"opencitations".equalsIgnoreCase(k.getValue) !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
) )
) .filter(r => filterRelations(filterRelation, relationSemanticFilter, r))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} }
/** Predicate used as the last filtering step when converting relations.
  *
  * Two mutually exclusive modes are supported:
  *  - when `filterRelation` is not blank, a relation is kept only if its
  *    subRelType is non-null and equals `filterRelation`, ignoring case;
  *  - otherwise a relation is kept unless its relClass equals, ignoring case,
  *    one of the entries of `relationSemanticFilter`.
  *
  * @param filterRelation         subRelType to select, or null/blank to fall back to the exclusion list
  * @param relationSemanticFilter relClass values to exclude when no subRelType is requested
  * @param r                      the relation under evaluation
  * @return true when the relation must be kept
  */
private def filterRelations(filterRelation: String, relationSemanticFilter: List[String], r: Relation): Boolean = {
  val subRelTypeRequested = filterRelation != null && StringUtils.isNoneBlank(filterRelation)
  if (!subRelTypeRequested) {
    // No explicit subRelType: keep everything that is not in the exclusion list.
    relationSemanticFilter.forall(excluded => !excluded.equalsIgnoreCase(r.getRelClass))
  } else {
    // Explicit subRelType: a missing subRelType never matches.
    Option(r.getSubRelType).exists(_.equalsIgnoreCase(filterRelation))
  }
}
} }