fixed wrong doi in pubmed

This commit is contained in:
Sandro La Bruzzo 2021-08-24 15:20:04 +02:00
parent 00a28c0080
commit 45898c71ac
5 changed files with 558 additions and 14 deletions

View File

@ -80,7 +80,7 @@ object SparkResolveRelation {
} }
private def extractPidsFromRecord(input:String):(String,List[(String,String)]) = { def extractPidsFromRecord(input:String):(String,List[(String,String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
val id:String = (json \ "id").extract[String] val id:String = (json \ "id").extract[String]
@ -90,7 +90,15 @@ object SparkResolveRelation {
JField("qualifier", JObject(qualifier)) <- pids JField("qualifier", JObject(qualifier)) <- pids
JField("classname", JString(pidType)) <- qualifier JField("classname", JString(pidType)) <- qualifier
} yield (pidValue, pidType) } yield (pidValue, pidType)
(id,result)
val alternateIds: List[(String,String)] = for {
JObject(pids) <- json \\ "alternateIdentifier"
JField("value", JString(pidValue)) <- pids
JField("qualifier", JObject(qualifier)) <- pids
JField("classname", JString(pidType)) <- qualifier
} yield (pidValue, pidType)
(id,result:::alternateIds)
} }
private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = { private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = {

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed package eu.dnetlib.dhp.sx.graph.bio.pubmed
import java.util.regex.Pattern
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
@ -15,6 +15,20 @@ object PubMedToOaf {
"doi" -> "https://dx.doi.org/" "doi" -> "https://dx.doi.org/"
) )
def cleanDoi(doi:String):String = {
val regex = "10.\\d{4,9}\\/[-._;()\\/:A-Z0-9]+$"
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
val matcher = pattern.matcher(doi)
if (matcher.find) {
return matcher.group(0)
}
null
}
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match { result_typologies.getClassid match {
@ -60,8 +74,12 @@ object PubMedToOaf {
var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null) if (pidList == null)
return null return null
var alternateIdentifier :StructuredProperty = null
if (article.getDoi != null) { if (article.getDoi != null) {
pidList = pidList ::: List(OafMapperUtils.structuredProperty(article.getDoi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid!= null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
} }
// If the article contains the typology Journal Article then we apply this type // If the article contains the typology Journal Article then we apply this type
@ -84,9 +102,9 @@ object PubMedToOaf {
return result return result
result.setDataInfo(dataInfo) result.setDataInfo(dataInfo)
i.setPid(pidList.asJava) i.setPid(pidList.asJava)
if (alternateIdentifier!= null)
i.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(i).asJava) result.setInstance(List(i).asJava)
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut) i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut)
val urlLists: List[String] = pidList val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))

View File

@ -1,9 +1,12 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed package eu.dnetlib.dhp.sx.graph.bio.pubmed
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -64,6 +67,9 @@ class BioScholixTest extends AbstractVocabularyTest{
assertEquals(10, r.size) assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p))) assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
println(mapper.writeValueAsString(r.head)) println(mapper.writeValueAsString(r.head))
} }
@ -179,13 +185,6 @@ class BioScholixTest extends AbstractVocabularyTest{
val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
assertTrue(result.nonEmpty) assertTrue(result.nonEmpty)
} }
} }

View File

@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, Ser
import eu.dnetlib.dhp.schema.oaf.{Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.SparkResolveRelation
import eu.dnetlib.dhp.sx.graph.bio.pubmed.AbstractVocabularyTest import eu.dnetlib.dhp.sx.graph.bio.pubmed.AbstractVocabularyTest
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
@ -30,6 +31,16 @@ class ScholixGraphTest extends AbstractVocabularyTest{
} }
@Test
def testExtractPids():Unit = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString
val res =SparkResolveRelation.extractPidsFromRecord(input)
assertNotNull(res)
assertTrue(res._2.size == 2)
}
@Test @Test
def testOAFToSummary():Unit= { def testOAFToSummary():Unit= {
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString

View File

@ -0,0 +1,508 @@
{
"collectedfrom": [
{
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central",
"dataInfo": null
}
],
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
},
"lastupdatetimestamp": null,
"id": "50|pmid________::cd23b96c02d937c971c1b56d6aa0bf4f",
"originalId": [
"10025635"
],
"pid": [
{
"value": "10025635",
"qualifier": {
"classid": "pmid",
"classname": "pmid",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofcollection": null,
"dateoftransformation": null,
"extraInfo": null,
"oaiprovenance": null,
"measures": null,
"author": [
{
"fullname": "D J, Marcellin-Little",
"name": "D J",
"surname": "Marcellin-Little",
"rank": 1,
"pid": null,
"affiliation": null
},
{
"fullname": "B A, DeYoung",
"name": "B A",
"surname": "DeYoung",
"rank": 2,
"pid": null,
"affiliation": null
},
{
"fullname": "D H, Doyens",
"name": "D H",
"surname": "Doyens",
"rank": 3,
"pid": null,
"affiliation": null
},
{
"fullname": "D J, DeYoung",
"name": "D J",
"surname": "DeYoung",
"rank": 4,
"pid": null,
"affiliation": null
}
],
"resulttype": {
"classid": "dataset",
"classname": "dataset",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"language": null,
"country": null,
"subject": [
{
"value": "Animals",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Arthroplasty, Replacement, Hip",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Dogs",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Follow-Up Studies",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Hip Joint",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Hip Prosthesis",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Osseointegration",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Prospective Studies",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Radiography",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Survival Analysis",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Treatment Outcome",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"title": [
{
"value": "Canine uncemented porous-coated anatomic total hip arthroplasty: results of a long-term prospective evaluation of 50 consecutive cases.",
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"relevantdate": null,
"description": [
{
"value": "To evaluate the long-term clinical and radiographic results of a canine uncemented porous-coated anatomic (PCA) total hip arthroplasty (THA).Prospective study of consecutive clinical patients using survival analysis.Forty-one dogs that underwent PCA THA; nine had bilateral PCA THA (50 prostheses).Gait observation, orthopedic examination, and radiographic assessment were conducted before THA, 6 months after THA, and yearly thereafter. A zonal analysis system was used to document osseous changes in the femur and the acetabulum. Acetabular cup and femoral stem subsidence and migration, femoral canal fill, and implant orientation were measured. Survival analysis of the procedure was conducted.Long-term follow-up was available for 37 dogs (46 prostheses). The median follow-up was 63 months. Limb function was normal for 37 limbs and abnormal for 9 limbs because of dislocation (n = 3), lumbosacral disease (n = 2), degenerative myelopathy (n = 1), autoimmune disease (n = 1), brain tumor (n = 1), or osteosarcoma of the femur (n = 1). All prosthetic stems and cups were fixed by bone ingrowth fixation. Osteolysis was not observed. Bone infarction occurred in five femoral canals (four dogs). The 6-year survival rate for the procedure was 87% (95% confidence interval, 72%-96%).Long-term fixation of the uncemented PCA acetabular cup and stem is successful in dogs, and long-term clinical function is excellent.",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1999-02-20",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"publisher": null,
"embargoenddate": null,
"source": null,
"fulltext": null,
"format": null,
"contributor": null,
"resourcetype": null,
"coverage": null,
"bestaccessright": null,
"context": null,
"externalReference": null,
"instance": [
{
"license": null,
"accessright": null,
"instancetype": {
"classid": "0037",
"classname": "Clinical Trial",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"hostedby": null,
"url": [
"https://pubmed.ncbi.nlm.nih.gov/10025635"
],
"distributionlocation": null,
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central",
"dataInfo": null
},
"pid": [
{
"value": "10025635",
"qualifier": {
"classid": "pmid",
"classname": "pmid",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"alternateIdentifier": [
{
"value": "10.1053/jvet.1999.0010",
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1999-02-20",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"processingchargeamount": null,
"processingchargecurrency": null,
"refereed": null
}
],
"storagedate": null,
"device": null,
"size": null,
"version": null,
"lastmetadataupdate": null,
"metadataversionnumber": null,
"geolocation": null
}