1
0
Fork 0

completed mapping from paper to OAF, not tested

This commit is contained in:
Sandro La Bruzzo 2024-04-04 21:06:04 +02:00
parent 0794e0667b
commit 5142f462b5
3 changed files with 224 additions and 138 deletions

View File

@ -2,7 +2,16 @@ package eu.dnetlib.dhp.collection.mag
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.{ Dataset => OafDataset, Author, DataInfo, Instance, Journal, Publication, Qualifier, Result} import eu.dnetlib.dhp.schema.oaf.{
Dataset => OafDataset,
Author,
DataInfo,
Instance,
Journal,
Publication,
Qualifier,
Result
}
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
import eu.dnetlib.dhp.utils import eu.dnetlib.dhp.utils
@ -26,7 +35,6 @@ case class MAGPaper(
date: Option[String], date: Option[String],
onlineDate: Option[String], onlineDate: Option[String],
publisher: Option[String], publisher: Option[String],
// Journal or Conference information (one will be populated)
journalId: Option[Long], journalId: Option[Long],
journalName: Option[String], journalName: Option[String],
journalIssn: Option[String], journalIssn: Option[String],
@ -50,7 +58,6 @@ case class MAGPaper(
docSubTypes: Option[String], docSubTypes: Option[String],
createdDate: Option[String], createdDate: Option[String],
abstractText: Option[String], abstractText: Option[String],
// List of authors
authors: Option[List[MAGAuthor]], authors: Option[List[MAGAuthor]],
urls: Option[List[String]] urls: Option[List[String]]
) )
@ -335,10 +342,7 @@ object MagUtility extends Serializable {
} }
def getInstanceType(magType: Option[String], source: Option[String]): Result = { def createResultFromType(magType: Option[String], source: Option[String]): Result = {
var result: Result = null var result: Result = null
val di = new DataInfo val di = new DataInfo
di.setDeletedbyinference(false) di.setDeletedbyinference(false)
@ -357,12 +361,14 @@ object MagUtility extends Serializable {
result = new Publication result = new Publication
result.setDataInfo(di) result.setDataInfo(di)
val i = new Instance val i = new Instance
i.setInstancetype(qualifier( i.setInstancetype(
qualifier(
"0038", "0038",
"Other literature type", "Other literature type",
ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE ModelConstants.DNET_PUBLICATION_RESOURCE
)) )
)
result.setInstance(List(i).asJava) result.setInstance(List(i).asJava)
return result return result
@ -386,7 +392,7 @@ object MagUtility extends Serializable {
result = new Publication result = new Publication
qualifier("0043", "Journal", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE) qualifier("0043", "Journal", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)
case "patent" => case "patent" =>
if (source.nonEmpty) { if (source != null) {
val s = source.get.toLowerCase val s = source.get.toLowerCase
if (s.contains("patent") || s.contains("brevet")) { if (s.contains("patent") || s.contains("brevet")) {
result = new Publication result = new Publication
@ -463,10 +469,19 @@ object MagUtility extends Serializable {
} }
/** Collapses the three "missing" shapes of an optional MAG field (`null`, `None`, `Some(null)`)
  * into `None`, so callers never need `.get` or `.orNull != null` checks.
  */
private def presentValue[T](value: Option[T]): Option[T] =
  Option(value).flatMap(inner => inner.flatMap(v => Option(v)))

/** Wraps `value` in a structured property qualified as a MAG identifier. */
private def magIdPid(value: String) =
  structuredProperty(
    value,
    qualifier(
      PidType.mag_id.toString,
      PidType.mag_id.toString,
      ModelConstants.DNET_PID_TYPES,
      ModelConstants.DNET_PID_TYPES
    ),
    null
  )

/** Wraps `value` in a structured property qualified as a DOI. */
// NOTE(review): the qualifier arguments of the DOI property are elided in the visible diff;
// PidType.doi with DNET_PID_TYPES is assumed here -- confirm against the full file.
private def doiPid(value: String) =
  structuredProperty(
    value,
    qualifier(
      PidType.doi.toString,
      PidType.doi.toString,
      ModelConstants.DNET_PID_TYPES,
      ModelConstants.DNET_PID_TYPES
    ),
    null
  )

/** Copies the bibliographic fields of `paper` onto an already typed `result`.
  *
  * @param result the Result created by `createResultFromType`; mutated in place
  * @param paper  the MAG record being converted
  * @param magId  the MAG paper identifier, already rendered as a string
  * @param urls   the non-empty list of URLs of the paper
  */
private def fillResultFromPaper(result: Result, paper: MAGPaper, magId: String, urls: List[String]): Unit = {
  // The MAG pid carries the MAG paper id (it used to be filled with the DOI value).
  val pidList = List(magIdPid(magId))
  result.setPid(pidList.asJava)
  result.setOriginalId(pidList.map(_.getValue).asJava)
  result.setId(s"50|mag_________::${DHPUtils.md5(magId)}")

  presentValue(paper.paperTitle).foreach { title =>
    result.setTitle(List(structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, null)).asJava)
  }

  // Prefer the full date; otherwise fall back to January 1st of a year later than 1700.
  presentValue(paper.date)
    .orElse(presentValue(paper.year).filter(_ > 1700).map(year => s"$year-01-01"))
    .foreach(date => result.setDateofacceptance(field(date, null)))

  presentValue(paper.onlineDate).foreach { onlineDate =>
    result.setRelevantdate(
      List(
        structuredProperty(
          onlineDate,
          qualifier(
            "published-online",
            "published-online",
            ModelConstants.DNET_DATACITE_DATE,
            ModelConstants.DNET_DATACITE_DATE
          ),
          null
        )
      ).asJava
    )
  }

  presentValue(paper.publisher).foreach(publisher => result.setPublisher(field(publisher, null)))

  // Journal metadata is attached only when both the journal id and the journal name are known.
  for {
    _           <- presentValue(paper.journalId)
    journalName <- presentValue(paper.journalName)
  } {
    val j = new Journal
    j.setName(journalName)
    j.setSp(presentValue(paper.firstPage).orNull)
    j.setEp(presentValue(paper.lastPage).orNull)
    j.setIssnPrinted(presentValue(paper.journalIssn).orNull)
    j.setVol(presentValue(paper.volume).orNull)
    j.setIss(presentValue(paper.issue).orNull)
    j.setConferenceplace(presentValue(paper.conferenceLocation).orNull)
    result match {
      case publication: Publication => publication.setJournal(j)
      // createResultFromType may also yield a non-Publication result (e.g. OafDataset);
      // without this branch such a record would fail with a MatchError.
      case _ =>
    }
  }

  presentValue(paper.abstractText).foreach { text =>
    result.setDescription(List(field(text, null)).asJava)
  }

  val magAuthors = presentValue(paper.authors).getOrElse(List.empty)
  if (magAuthors.nonEmpty) {
    val authors = magAuthors.flatMap { a =>
      presentValue(a.AuthorName).map { fullname =>
        val author = new Author
        author.setFullname(fullname)
        // TODO(review): when GridId was present the author's MAG id used to be appended a second
        // time; that duplicate is dropped. Map GridId explicitly if it is meant to be exported.
        author.setPid(presentValue(a.AuthorId).map(id => magIdPid(id.toString)).toList.asJava)
        author
      }
    }
    result.setAuthor(authors.asJava)
  }

  // NOTE(review): relies on createResultFromType always attaching one instance -- confirm
  // for the branches that are not visible here.
  val instance = result.getInstance().get(0)
  instance.setPid(pidList.asJava)
  presentValue(paper.doi).foreach { doi =>
    instance.setAlternateIdentifier(List(doiPid(doi)).asJava)
  }
  instance.setUrl(urls.asJava)
  instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
  instance.setAccessright(
    accessRight(
      ModelConstants.UNKNOWN,
      ModelConstants.NOT_AVAILABLE,
      ModelConstants.DNET_ACCESS_MODES,
      ModelConstants.DNET_ACCESS_MODES
    )
  )
}

/** Converts a MAG paper into an OAF [[Result]].
  *
  * @param paper the MAG record to convert
  * @return the populated Result, or `null` when the paper has no identifier, has no URL,
  *         or `createResultFromType` yields no result for its type
  */
def convertMAGtoOAF(paper: MAGPaper): Result = {
  // createResultFromType guards its `source` argument only with a null check before calling
  // `.get`, so an absent venue is handed over as null rather than None.
  val venue: Option[String] =
    if (presentValue(paper.originalVenue).isDefined) paper.originalVenue else null

  // FILTER all the MAG papers with no URL (and those lacking the id needed to build the OAF id)
  (presentValue(paper.paperId), presentValue(paper.urls).filter(_.nonEmpty)) match {
    case (Some(paperId), Some(urls)) =>
      val result = createResultFromType(paper.docType, venue)
      if (result != null)
        fillResultFromPaper(result, paper, paperId.toString, urls)
      result
    case _ => null
  }
}
def convertInvertedIndexString(json_input: String): String = { def convertInvertedIndexString(json_input: String): String = {

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.collection.mag package eu.dnetlib.dhp.collection.mag
import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.apache.spark.sql.SparkSession import eu.dnetlib.dhp.schema.oaf.Result
import org.apache.spark.sql.{Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger) class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
@ -20,9 +21,20 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
/** Reads the MAG papers stored under `$workingPath/mag`, converts each one to an OAF Result
  * and writes the results as JSON, overwriting whatever is already at `mdStorePath`.
  *
  * @param spark       the active Spark session
  * @param workingPath base path containing the `mag` dataset
  * @param mdStorePath destination path of the JSON output
  */
def convertMAG(spark: SparkSession, workingPath: String, mdStorePath: String): Unit = {
  import spark.implicits._
  implicit val resultEncoder: org.apache.spark.sql.Encoder[Result] = Encoders.bean(classOf[Result])

  spark.read
    .load(s"$workingPath/mag")
    .as[MAGPaper]
    // convertMAGtoOAF returns null for papers it filters out; a bean-encoded Dataset cannot
    // hold null top-level objects, so those papers are dropped before encoding.
    .flatMap(paper => Option(MagUtility.convertMAGtoOAF(paper)).toList)
    .write
    .mode(SaveMode.Overwrite)
    .json(mdStorePath)
}
} }

View File

@ -14,6 +14,7 @@ class MAGMappingTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@Test
def mappingTest(): Unit = { def mappingTest(): Unit = {
val spark = SparkSession val spark = SparkSession
@ -22,13 +23,7 @@ class MAGMappingTest {
.master("local[*]") .master("local[*]")
.getOrCreate() .getOrCreate()
import spark.implicits._ new SparkMAGtoOAF(null,null,null).convertMAG(spark,"/Users/sandro/Downloads", "/Users/sandro/Downloads/mag_oaf")
val magDS = spark.read.load("/home/sandro/Downloads/mag").as[MAGPaper].where(col("journalId").isNotNull)
val paper = magDS.first()
print(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(MagUtility.convertMAGtoOAF(paper)))
} }
@ -36,47 +31,29 @@ class MAGMappingTest {
@Test
def mappingMagType(): Unit = {

  // Builds the Result for a given MAG docType. The venue is passed as null unless a case
  // supplies one, exactly as in the direct calls this helper stands in for.
  def resultFor(docType: String, venue: Option[String] = null): Result =
    MagUtility.createResultFromType(Some(docType), venue)

  // Missing docType
  checkResult[Publication](MagUtility.createResultFromType(null, null), invisible = false, "Other literature type")

  // docType alone decides the instance type
  checkResult[Publication](resultFor("BookChapter"), invisible = false, "Part of book or chapter of book")
  checkResult[Publication](resultFor("Book"), invisible = false, "Book")
  checkResult[Publication](resultFor("Repository"), invisible = true, "Other literature type")
  checkResult[Publication](resultFor("Thesis"), invisible = false, "Thesis")
  checkResult[Publication](resultFor("Conference"), invisible = false, "Article")
  checkResult[Publication](resultFor("Journal"), invisible = false, "Journal")
  checkResult[Dataset](resultFor("Dataset"), invisible = false, "Dataset")

  // Patents are classified from the venue name
  checkResult[Publication](resultFor("Patent", Some("Patent Department of the Navy")), invisible = false, "Patent")
  checkResult[Publication](resultFor("Patent", Some("Brevet Department of the Navy")), invisible = false, "Patent")
  checkResult[Publication](resultFor("Patent", Some("Journal of the Navy")), invisible = false, "Journal")
  checkResult[Publication](resultFor("Patent", Some("Proceedings of the Navy")), invisible = false, "Article")
  checkResult[Dataset](resultFor("Dataset"), invisible = false, "Dataset")

  // A patent with no venue, or with a venue matching none of the keywords above, yields no result
  assertNull(resultFor("Patent"))
  assertNull(resultFor("Patent", Some("Some name ")))
}
def checkResult[T](r:Result, invisible:Boolean, typeName:String): Unit = { def checkResult[T](r:Result, invisible:Boolean, typeName:String): Unit = {
assertNotNull(r) assertNotNull(r)
assertTrue(r.isInstanceOf[T]) assertTrue(r.isInstanceOf[T])
assertNotNull(r.getDataInfo) assertNotNull(r.getDataInfo)