Merge branch 'doidoost_dismiss' of https://code-repo.d4science.org/D-Net/dnet-hadoop into doidoost_dismiss

This commit is contained in:
Miriam Baglioni 2024-04-04 09:16:18 +02:00
commit 0794e0667b
5 changed files with 309 additions and 89 deletions

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.collection.crossref package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
@ -15,6 +15,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
} }
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang.StringUtils import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.Row
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST._ import org.json4s.JsonAST._
@ -650,6 +651,33 @@ case object Crossref2Oaf {
r r
} }
def generateAffliation(input: Row): List[String] = {
val doi = input.getString(0)
val rorId = input.getString(1)
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
val r: Relation = new Relation
DoiCleaningRule.clean(doi)
r.setSource(pubId)
r.setTarget(affId)
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
r.setRelClass(ModelConstants.HAS_AUTHOR_INSTITUTION)
r.setSubRelType(ModelConstants.AFFILIATION)
r.setDataInfo(generateDataInfo())
r.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava)
val r1: Relation = new Relation
r1.setTarget(pubId)
r1.setSource(affId)
r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
r1.setRelClass(ModelConstants.IS_AUTHOR_INSTITUTION_OF)
r1.setSubRelType(ModelConstants.AFFILIATION)
r1.setDataInfo(generateDataInfo())
r1.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava)
List(mapper.writeValueAsString(r), mapper.writeValueAsString(r1))
}
def convert(input: String, vocabularies: VocabularyGroup, mode: TransformationType): List[Oaf] = { def convert(input: String, vocabularies: VocabularyGroup, mode: TransformationType): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)

View File

@ -7,7 +7,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.functions.{col, lower} import org.apache.spark.sql.functions.{col, explode, lower}
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
@ -104,6 +104,22 @@ class SparkMapDumpIntoOAF(propertyPath: String, args: Array[String], log: Logger
.mode(SaveMode.Append) .mode(SaveMode.Append)
.option("compression", "gzip") .option("compression", "gzip")
.text(s"$targetPath") .text(s"$targetPath")
// Generate affiliation relations:
spark.read
.json(sourcePath)
.select(col("DOI"), explode(col("author.affiliation")).alias("affiliations"))
.select(col("DOI"), explode(col("affiliations.id")).alias("aids"))
.where("aids is not null")
.select(col("DOI"), explode(col("aids")).alias("aff"))
.select(col("DOI"), col("aff.id").alias("id"), col("aff.id-type").alias("idType"))
.where(col("idType").like("ROR"))
.flatMap(r => Crossref2Oaf.generateAffliation(r))
.write
.mode(SaveMode.Append)
.option("compression", "gzip")
.text(s"$targetPath")
} }
} }

View File

@ -1,9 +1,12 @@
package eu.dnetlib.dhp.collection.mag package eu.dnetlib.dhp.collection.mag
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, Journal, Publication} import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, PidType} import eu.dnetlib.dhp.schema.oaf.{ Dataset => OafDataset, Author, DataInfo, Instance, Journal, Publication, Qualifier, Result}
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
import eu.dnetlib.dhp.utils
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.json4s import org.json4s
@ -304,7 +307,8 @@ object MagUtility extends Serializable {
def getSchema(streamName: String): StructType = { def getSchema(streamName: String): StructType = {
var schema = new StructType() var schema = new StructType()
val d: Seq[String] = stream(streamName)._2 val d: Seq[String] = stream(streamName)._2
d.foreach { case t => d.foreach {
case t =>
val currentType = t.split(":") val currentType = t.split(":")
val fieldName: String = currentType.head val fieldName: String = currentType.head
var fieldType: String = currentType.last var fieldType: String = currentType.last
@ -331,12 +335,152 @@ object MagUtility extends Serializable {
} }
def getInstanceType(magType: Option[String], source: Option[String]): Result = {
var result:Result = null
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust("0.9")
di.setProvenanceaction(
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
if (magType== null) {
result = new Publication
result.setDataInfo(di)
val i =new Instance
i.setInstancetype(qualifier(
"0038",
"Other literature type",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
))
result.setInstance(List(i).asJava)
return result
}
val currentType: String = magType.get
val tp = currentType.toLowerCase match {
case "book" =>
result = new Publication
qualifier("0002", "Book", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)
case "bookchapter" =>
result = new Publication
qualifier(
"00013",
"Part of book or chapter of book",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
case "journal" =>
result = new Publication
qualifier("0043", "Journal", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)
case "patent" =>
if (source.nonEmpty) {
val s = source.get.toLowerCase
if (s.contains("patent") || s.contains("brevet")) {
result = new Publication
qualifier(
"0019",
"Patent",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
} else if (s.contains("journal of")) {
result = new Publication
qualifier(
"0043",
"Journal",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
} else if (s.contains("proceedings") || s.contains("conference") || s.contains("workshop") || s.contains(
"symposium"
)) {
result = new Publication
qualifier(
"0001",
"Article",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
} else null
} else null
case "repository" =>
result = new Publication()
di.setInvisible(true)
qualifier(
"0038",
"Other literature type",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
case "thesis" =>
result = new Publication
qualifier(
"0044",
"Thesis",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
case "dataset" =>
result = new OafDataset
qualifier(
"0021",
"Dataset",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
case "conference" =>
result = new Publication
qualifier(
"0001",
"Article",
ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE
)
}
if (result != null) {
result.setDataInfo(di)
val i =new Instance
i.setInstancetype(tp)
result.setInstance(List(i).asJava)
}
result
}
def convertMAGtoOAF(paper: MAGPaper): Publication = { def convertMAGtoOAF(paper: MAGPaper): Publication = {
val pub = new Publication
val magPid = structuredProperty(
paper.doi.get,
qualifier(
PidType.mag_id.toString,
PidType.mag_id.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES
),
null
)
if (paper.doi.isDefined) { if (paper.doi.isDefined) {
val pub = new Publication
pub.setPid( pub.setPid(
List( List(
magPid,
structuredProperty( structuredProperty(
paper.doi.get, paper.doi.get,
qualifier( qualifier(
@ -349,13 +493,17 @@ object MagUtility extends Serializable {
) )
).asJava ).asJava
) )
pub.setOriginalId(List(paper.paperId.get.toString, paper.doi.get).asJava) pub.setOriginalId(List(paper.paperId.get.toString, paper.doi.get).asJava)
} else {
pub.setPid(
List(
magPid
).asJava
)
pub.setOriginalId(List(paper.paperId.get.toString).asJava)
}
//IMPORTANT pub.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
//The old method result.setId(generateIdentifier(result, doi))
//will be replaced using IdentifierFactory
pub.setId(IdentifierFactory.createDOIBoostIdentifier(pub))
val mainTitles = structuredProperty(paper.originalTitle.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) val mainTitles = structuredProperty(paper.originalTitle.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
@ -374,22 +522,10 @@ object MagUtility extends Serializable {
.map(a => { .map(a => {
val author = new Author val author = new Author
author.setFullname(a.AuthorName.get) author.setFullname(a.AuthorName.get)
if (a.AffiliationName.isDefined)
author.setAffiliation(List(field(a.AffiliationName.get, null)).asJava)
author.setPid(
List(
structuredProperty(
s"https://academic.microsoft.com/#/detail/${a.AuthorId.get}",
qualifier("url", "url", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES),
null
)
).asJava
)
author author
}) })
.asJava .asJava
) )
} }
if (paper.date.isDefined) if (paper.date.isDefined)
@ -415,9 +551,6 @@ object MagUtility extends Serializable {
} }
pub pub
} else {
null
}
} }

View File

@ -2,13 +2,11 @@ package eu.dnetlib.dhp.collection.crossref
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import org.apache.spark.sql.SparkSession import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.{BeforeEach, Test}
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.mockito.junit.jupiter.MockitoExtension import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension])) @ExtendWith(Array(classOf[MockitoExtension]))
class CrossrefMappingTest extends AbstractVocabularyTest { class CrossrefMappingTest extends AbstractVocabularyTest {
@ -21,21 +19,6 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
super.setUpVocabulary() super.setUpVocabulary()
} }
def testMapping(): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("TransformCrossref").getOrCreate()
val s = new SparkMapDumpIntoOAF(null, null, null)
import spark.implicits._
s.transformCrossref(
spark,
sourcePath = "/home/sandro/Downloads/crossref",
targetPath = "/home/sandro/Downloads/crossref_transformed",
unpaywallPath = null,
vocabularies = vocabularies
)
print(spark.read.text("/home/sandro/Downloads/crossref_transformed").count)
}
} }

View File

@ -1,15 +1,19 @@
package eu.dnetlib.dhp.collection.mag package eu.dnetlib.dhp.collection.mag
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.junit.jupiter.api.Assertions._
class MAGMappingTest { class MAGMappingTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@Test
def mappingTest(): Unit = { def mappingTest(): Unit = {
val spark = SparkSession val spark = SparkSession
@ -28,4 +32,60 @@ class MAGMappingTest {
} }
@Test
def mappingMagType(): Unit = {
/*
+-----------+--------+
| docType| count|
+-----------+--------+
| null|79939635|
|BookChapter| 2431452|
| Dataset| 123923|
| Repository| 5044165|
| Thesis| 5525681|
| Conference| 5196866|
| Journal|89452763|
| Book| 4460017|
| Patent|64631955|
+-----------+--------+
"instancetype":{
"classid":"0001",
"classname":"Article",
"schemeid":"dnet:publication_resource",
"schemename":"dnet:publication_resource"},"instanceTypeMapping":[{"originalType":"journal-article","typeCode":null,"typeLabel":null,"vocabularyName":"openaire::coar_resource_types_3_1"}
*/
checkResult[Publication](MagUtility.getInstanceType(null, null), invisible = false,"Other literature type")
checkResult[Publication](MagUtility.getInstanceType(Some("BookChapter"), null), invisible = false,"Part of book or chapter of book")
checkResult[Publication](MagUtility.getInstanceType(Some("Book"), null), invisible = false,"Book")
checkResult[Publication](MagUtility.getInstanceType(Some("Repository"), null), invisible = true,"Other literature type")
checkResult[Publication](MagUtility.getInstanceType(Some("Thesis"), null), invisible = false,"Thesis")
checkResult[Publication](MagUtility.getInstanceType(Some("Conference"), null), invisible = false,"Article")
checkResult[Publication](MagUtility.getInstanceType(Some("Journal"), null), invisible = false,"Journal")
checkResult[Dataset](MagUtility.getInstanceType(Some("Dataset"), null), invisible = false,"Dataset")
checkResult[Publication](MagUtility.getInstanceType(Some("Patent"), Some("Patent Department of the Navy")), invisible = false,"Patent")
checkResult[Dataset](MagUtility.getInstanceType(Some("Dataset"), null), invisible = false,"Dataset")
}
def checkResult[T](r:Result, invisible:Boolean, typeName:String): Unit = {
assertNotNull(r)
assertTrue(r.isInstanceOf[T])
assertNotNull(r.getDataInfo)
assertEquals( invisible ,r.getDataInfo.getInvisible)
assertNotNull(r.getInstance())
assertTrue(r.getInstance().size()>0)
assertNotNull(r.getInstance().get(0).getInstancetype)
assertEquals(typeName, r.getInstance().get(0).getInstancetype.getClassname)
}
} }