fixed wrong doi in pubmed

This commit is contained in:
Sandro La Bruzzo 2021-08-24 15:20:04 +02:00
parent 00a28c0080
commit 45898c71ac
5 changed files with 558 additions and 14 deletions

View File

@ -80,7 +80,7 @@ object SparkResolveRelation {
}
private def extractPidsFromRecord(input:String):(String,List[(String,String)]) = {
def extractPidsFromRecord(input:String):(String,List[(String,String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val id:String = (json \ "id").extract[String]
@ -90,7 +90,15 @@ object SparkResolveRelation {
JField("qualifier", JObject(qualifier)) <- pids
JField("classname", JString(pidType)) <- qualifier
} yield (pidValue, pidType)
(id,result)
val alternateIds: List[(String,String)] = for {
JObject(pids) <- json \\ "alternateIdentifier"
JField("value", JString(pidValue)) <- pids
JField("qualifier", JObject(qualifier)) <- pids
JField("classname", JString(pidType)) <- qualifier
} yield (pidValue, pidType)
(id,result:::alternateIds)
}
private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = {

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed
import java.util.regex.Pattern
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import scala.collection.JavaConverters._
@ -15,6 +15,20 @@ object PubMedToOaf {
"doi" -> "https://dx.doi.org/"
)
/**
 * Extracts the DOI located at the end of the given string.
 *
 * The accepted shape is the literal prefix "10.", a registrant code of
 * 4 to 9 digits, a slash and a suffix made of letters, digits and the
 * characters - . _ ; ( ) / : up to the end of the input. Matching is
 * case-insensitive and any leading text (for instance a resolver URL)
 * is discarded.
 *
 * @param doi the raw identifier, possibly null
 * @return the matched DOI, or null when the input is null or does not
 *         end with a DOI (callers test the result against null)
 */
def cleanDoi(doi:String):String = {
  if (doi == null)
    null
  else {
    // The dot after "10" is escaped: unescaped it matches any character,
    // which would accept malformed prefixes such as "10X1234/abc".
    val regex = "10\\.\\d{4,9}\\/[-._;()\\/:A-Z0-9]+$"
    val matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(doi)
    if (matcher.find) matcher.group(0) else null
  }
}
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match {
@ -60,8 +74,12 @@ object PubMedToOaf {
var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null)
return null
var alternateIdentifier :StructuredProperty = null
if (article.getDoi != null) {
pidList = pidList ::: List(OafMapperUtils.structuredProperty(article.getDoi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid!= null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
}
// If the article contains the typology Journal Article then we apply this type
@ -84,9 +102,9 @@ object PubMedToOaf {
return result
result.setDataInfo(dataInfo)
i.setPid(pidList.asJava)
if (alternateIdentifier!= null)
i.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(i).asJava)
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut)
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))

View File

@ -1,9 +1,12 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
@ -64,6 +67,9 @@ class BioScholixTest extends AbstractVocabularyTest{
assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
println(mapper.writeValueAsString(r.head))
}
@ -179,13 +185,6 @@ class BioScholixTest extends AbstractVocabularyTest{
val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
assertTrue(result.nonEmpty)
}
}

View File

@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, Ser
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.SparkResolveRelation
import eu.dnetlib.dhp.sx.graph.bio.pubmed.AbstractVocabularyTest
import org.json4s
import org.json4s.DefaultFormats
@ -30,6 +31,16 @@ class ScholixGraphTest extends AbstractVocabularyTest{
}
/**
 * Checks that SparkResolveRelation.extractPidsFromRecord returns a non-null
 * pair for the result.json fixture and that the extracted identifier list
 * holds exactly two entries.
 */
@Test
def testExtractPids():Unit = {
  val source = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json"))
  // Read the whole fixture, then release the underlying stream even if reading fails.
  val input = try source.mkString finally source.close()
  val res = SparkResolveRelation.extractPidsFromRecord(input)
  assertNotNull(res)
  // NOTE(review): presumably the two entries are the pmid and the doi
  // alternate identifier of the fixture — confirm against the fixture.
  assertTrue(res._2.size == 2)
}
@Test
def testOAFToSummary():Unit= {
val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString

View File

@ -0,0 +1,508 @@
{
"collectedfrom": [
{
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central",
"dataInfo": null
}
],
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
},
"lastupdatetimestamp": null,
"id": "50|pmid________::cd23b96c02d937c971c1b56d6aa0bf4f",
"originalId": [
"10025635"
],
"pid": [
{
"value": "10025635",
"qualifier": {
"classid": "pmid",
"classname": "pmid",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofcollection": null,
"dateoftransformation": null,
"extraInfo": null,
"oaiprovenance": null,
"measures": null,
"author": [
{
"fullname": "D J, Marcellin-Little",
"name": "D J",
"surname": "Marcellin-Little",
"rank": 1,
"pid": null,
"affiliation": null
},
{
"fullname": "B A, DeYoung",
"name": "B A",
"surname": "DeYoung",
"rank": 2,
"pid": null,
"affiliation": null
},
{
"fullname": "D H, Doyens",
"name": "D H",
"surname": "Doyens",
"rank": 3,
"pid": null,
"affiliation": null
},
{
"fullname": "D J, DeYoung",
"name": "D J",
"surname": "DeYoung",
"rank": 4,
"pid": null,
"affiliation": null
}
],
"resulttype": {
"classid": "dataset",
"classname": "dataset",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"language": null,
"country": null,
"subject": [
{
"value": "Animals",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Arthroplasty, Replacement, Hip",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Dogs",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Follow-Up Studies",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Hip Joint",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Hip Prosthesis",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Osseointegration",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Prospective Studies",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Radiography",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Survival Analysis",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
{
"value": "Treatment Outcome",
"qualifier": {
"classid": "keywords",
"classname": "keywords",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"title": [
{
"value": "Canine uncemented porous-coated anatomic total hip arthroplasty: results of a long-term prospective evaluation of 50 consecutive cases.",
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"relevantdate": null,
"description": [
{
"value": "To evaluate the long-term clinical and radiographic results of a canine uncemented porous-coated anatomic (PCA) total hip arthroplasty (THA).Prospective study of consecutive clinical patients using survival analysis.Forty-one dogs that underwent PCA THA; nine had bilateral PCA THA (50 prostheses).Gait observation, orthopedic examination, and radiographic assessment were conducted before THA, 6 months after THA, and yearly thereafter. A zonal analysis system was used to document osseous changes in the femur and the acetabulum. Acetabular cup and femoral stem subsidence and migration, femoral canal fill, and implant orientation were measured. Survival analysis of the procedure was conducted.Long-term follow-up was available for 37 dogs (46 prostheses). The median follow-up was 63 months. Limb function was normal for 37 limbs and abnormal for 9 limbs because of dislocation (n = 3), lumbosacral disease (n = 2), degenerative myelopathy (n = 1), autoimmune disease (n = 1), brain tumor (n = 1), or osteosarcoma of the femur (n = 1). All prosthetic stems and cups were fixed by bone ingrowth fixation. Osteolysis was not observed. Bone infarction occurred in five femoral canals (four dogs). The 6-year survival rate for the procedure was 87% (95% confidence interval, 72%-96%).Long-term fixation of the uncemented PCA acetabular cup and stem is successful in dogs, and long-term clinical function is excellent.",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1999-02-20",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"publisher": null,
"embargoenddate": null,
"source": null,
"fulltext": null,
"format": null,
"contributor": null,
"resourcetype": null,
"coverage": null,
"bestaccessright": null,
"context": null,
"externalReference": null,
"instance": [
{
"license": null,
"accessright": null,
"instancetype": {
"classid": "0037",
"classname": "Clinical Trial",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"hostedby": null,
"url": [
"https://pubmed.ncbi.nlm.nih.gov/10025635"
],
"distributionlocation": null,
"collectedfrom": {
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
"value": "Europe PubMed Central",
"dataInfo": null
},
"pid": [
{
"value": "10025635",
"qualifier": {
"classid": "pmid",
"classname": "pmid",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"alternateIdentifier": [
{
"value": "10.1053/jvet.1999.0010",
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
}
],
"dateofacceptance": {
"value": "1999-02-20",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"trust": "0.9",
"inferenceprovenance": null,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "sysimport:actionset",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
}
},
"processingchargeamount": null,
"processingchargecurrency": null,
"refereed": null
}
],
"storagedate": null,
"device": null,
"size": null,
"version": null,
"lastmetadataupdate": null,
"metadataversionnumber": null,
"geolocation": null
}