Modification of Microsoft Academic Graph Mapping #435

Merged
claudio.atzori merged 1 commit from mag_only_doi into beta 2024-05-21 13:48:42 +02:00
3 changed files with 22 additions and 34 deletions

View File

@ -79,23 +79,6 @@ object MagUtility extends Serializable {
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME) private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
private val MAGDataInfo: DataInfo = { private val MAGDataInfo: DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust("0.9")
di.setProvenanceaction(
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
di
}
private val MAGDataInfoInvisible: DataInfo = {
val di = new DataInfo val di = new DataInfo
di.setDeletedbyinference(false) di.setDeletedbyinference(false)
di.setInferred(false) di.setInferred(false)
@ -111,8 +94,7 @@ object MagUtility extends Serializable {
) )
di di
} }
val datatypedict = Map(
val datatypedict = Map(
"bool" -> BooleanType, "bool" -> BooleanType,
"int" -> IntegerType, "int" -> IntegerType,
"uint" -> IntegerType, "uint" -> IntegerType,
@ -453,7 +435,6 @@ object MagUtility extends Serializable {
case "repository" => case "repository" =>
result = new Publication() result = new Publication()
result.setDataInfo(MAGDataInfoInvisible)
qualifier( qualifier(
"0038", "0038",
"Other literature type", "Other literature type",
@ -488,7 +469,6 @@ object MagUtility extends Serializable {
} }
if (result != null) { if (result != null) {
if (result.getDataInfo == null)
result.setDataInfo(MAGDataInfo) result.setDataInfo(MAGDataInfo)
val i = new Instance val i = new Instance
i.setInstancetype(tp) i.setInstancetype(tp)
@ -512,7 +492,7 @@ object MagUtility extends Serializable {
return null return null
result.setCollectedfrom(List(MAGCollectedFrom).asJava) result.setCollectedfrom(List(MAGCollectedFrom).asJava)
val pidList = List( var pidList = List(
structuredProperty( structuredProperty(
paper.paperId.get.toString, paper.paperId.get.toString,
qualifier( qualifier(
@ -525,7 +505,7 @@ object MagUtility extends Serializable {
) )
) )
result.setPid(pidList.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
@ -618,10 +598,9 @@ object MagUtility extends Serializable {
} }
val instance = result.getInstance().get(0) val instance = result.getInstance().get(0)
instance.setPid(pidList.asJava)
if (paper.doi.orNull != null) if (paper.doi.orNull != null) {
instance.setAlternateIdentifier( pidList = pidList ::: List(
List(
structuredProperty( structuredProperty(
paper.doi.get, paper.doi.get,
qualifier( qualifier(
@ -632,8 +611,10 @@ object MagUtility extends Serializable {
), ),
null null
) )
).asJava
) )
}
instance.setPid(pidList.asJava)
result.setPid(pidList.asJava)
instance.setUrl(paper.urls.get.asJava) instance.setUrl(paper.urls.get.asJava)
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
instance.setCollectedfrom(MAGCollectedFrom) instance.setCollectedfrom(MAGCollectedFrom)

View File

@ -35,9 +35,12 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = { def convertMAG(spark: SparkSession, magBasePath: String, mdStorePath: String): Unit = {
import spark.implicits._ import spark.implicits._
spark.read spark.read
.load(s"$magBasePath/mag_denormalized") .load(s"$magBasePath/mag_denormalized")
.as[MAGPaper] .as[MAGPaper]
.filter(col("doi").isNotNull)
.map(s => MagUtility.convertMAGtoOAF(s)) .map(s => MagUtility.convertMAGtoOAF(s))
.filter(s => s != null) .filter(s => s != null)
.write .write

View File

@ -3,13 +3,17 @@ package eu.dnetlib.dhp.collection.mag
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result} import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
class MAGMappingTest { class MAGMappingTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
def mappingTest(): Unit = { def mappingTest(): Unit = {
val spark = SparkSession val spark = SparkSession
@ -18,12 +22,12 @@ class MAGMappingTest {
.master("local[*]") .master("local[*]")
.getOrCreate() .getOrCreate()
val s = new SparkMagOrganizationAS(null, null, null) val s = new SparkMAGtoOAF(null, null, null)
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
} }
@Test @Test
def mappingMagType(): Unit = { def mappingMagType(): Unit = {