diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala index b2fddec203..1b13b81c77 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala @@ -80,7 +80,7 @@ object SparkResolveRelation { } - private def extractPidsFromRecord(input:String):(String,List[(String,String)]) = { + def extractPidsFromRecord(input:String):(String,List[(String,String)]) = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) val id:String = (json \ "id").extract[String] @@ -90,7 +90,15 @@ object SparkResolveRelation { JField("qualifier", JObject(qualifier)) <- pids JField("classname", JString(pidType)) <- qualifier } yield (pidValue, pidType) - (id,result) + + val alternateIds: List[(String,String)] = for { + JObject(pids) <- json \\ "alternateIdentifier" + JField("value", JString(pidValue)) <- pids + JField("qualifier", JObject(qualifier)) <- pids + JField("classname", JString(pidType)) <- qualifier + } yield (pidValue, pidType) + + (id,result:::alternateIds) } private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala index 9a49deebca..377bd902da 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.sx.graph.bio.pubmed - +import java.util.regex.Pattern import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants -import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} +import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ import scala.collection.JavaConverters._ @@ -15,6 +15,20 @@ object PubMedToOaf { "doi" -> "https://dx.doi.org/" ) + def cleanDoi(doi:String):String = { + + val regex = "10.\\d{4,9}\\/[-._;()\\/:A-Z0-9]+$" + + + val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE) + val matcher = pattern.matcher(doi) + + if (matcher.find) { + return matcher.group(0) + } + null + } + def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) result_typologies.getClassid match { @@ -60,8 +74,12 @@ object PubMedToOaf { var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) if (pidList == null) return null + + var alternateIdentifier :StructuredProperty = null if (article.getDoi != null) { - pidList = pidList ::: List(OafMapperUtils.structuredProperty(article.getDoi, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) + val normalizedPid = cleanDoi(article.getDoi) + if (normalizedPid!= null) + alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) } // If the article contains the typology Journal Article then we apply this type @@ -84,9 +102,9 @@ object PubMedToOaf { return result result.setDataInfo(dataInfo) i.setPid(pidList.asJava) + if (alternateIdentifier!= null) + i.setAlternateIdentifier(List(alternateIdentifier).asJava) result.setInstance(List(i).asJava) - - i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut) val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala index 894c0a795e..8e063db7c0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala @@ -1,9 +1,12 @@ package eu.dnetlib.dhp.sx.graph.bio.pubmed import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF +import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse @@ -64,6 +67,9 @@ class BioScholixTest extends AbstractVocabularyTest{ assertEquals(10, r.size) assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p))) println(mapper.writeValueAsString(r.head)) + + + } @@ -179,13 +185,6 @@ class BioScholixTest extends AbstractVocabularyTest{ val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) assertTrue(result.nonEmpty) - - - - - - - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala index aaac2b7ee9..5b7fbe1cf5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala @@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, Ser import eu.dnetlib.dhp.schema.oaf.{Relation, Result} import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary +import eu.dnetlib.dhp.sx.graph.SparkResolveRelation import eu.dnetlib.dhp.sx.graph.bio.pubmed.AbstractVocabularyTest import org.json4s import org.json4s.DefaultFormats @@ -30,6 +31,16 @@ class ScholixGraphTest extends AbstractVocabularyTest{ } + @Test + def testExtractPids():Unit = { + + val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/scholix/result.json")).mkString + val res =SparkResolveRelation.extractPidsFromRecord(input) + assertNotNull(res) + assertTrue(res._2.size == 2) + + } + @Test def testOAFToSummary():Unit= { val inputRelations = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary")).mkString diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/scholix/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/scholix/result.json new file mode 100644 index 0000000000..f14d5fedff --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/scholix/result.json @@ -0,0 +1,508 @@ +{ + "collectedfrom": [ + { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central", + "dataInfo": null + } + ], + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + }, + "lastupdatetimestamp": null, + "id": "50|pmid________::cd23b96c02d937c971c1b56d6aa0bf4f", + "originalId": [ + "10025635" + ], + "pid": [ + { + "value": "10025635", + "qualifier": { + "classid": "pmid", + "classname": "pmid", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "dateofcollection": null, + "dateoftransformation": null, + "extraInfo": null, + "oaiprovenance": null, + "measures": null, + "author": [ + { + "fullname": "D J, Marcellin-Little", + "name": "D J", + "surname": "Marcellin-Little", + "rank": 1, + "pid": null, + "affiliation": null + }, + { + "fullname": "B A, DeYoung", + "name": "B A", + "surname": "DeYoung", + "rank": 2, + "pid": null, + "affiliation": null + }, + { + "fullname": "D H, Doyens", + "name": "D H", + "surname": "Doyens", + "rank": 3, + "pid": null, + "affiliation": null + }, + { + "fullname": "D J, DeYoung", + "name": "D J", + "surname": "DeYoung", + "rank": 4, + "pid": null, + "affiliation": null + } + ], + "resulttype": { + "classid": "dataset", + "classname": "dataset", + "schemeid": "dnet:result_typologies", + "schemename": "dnet:result_typologies" + }, + "language": null, + "country": null, + "subject": [ + { + "value": "Animals", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Arthroplasty, Replacement, Hip", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Dogs", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Follow-Up Studies", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Hip Joint", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Hip Prosthesis", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Osseointegration", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Prospective Studies", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Radiography", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Survival Analysis", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + { + "value": "Treatment Outcome", + "qualifier": { + "classid": "keywords", + "classname": "keywords", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "title": [ + { + "value": "Canine uncemented porous-coated anatomic total hip arthroplasty: results of a long-term prospective evaluation of 50 consecutive cases.", + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "relevantdate": null, + "description": [ + { + "value": "To evaluate the long-term clinical and radiographic results of a canine uncemented porous-coated anatomic (PCA) total hip arthroplasty (THA).Prospective study of consecutive clinical patients using survival analysis.Forty-one dogs that underwent PCA THA; nine had bilateral PCA THA (50 prostheses).Gait observation, orthopedic examination, and radiographic assessment were conducted before THA, 6 months after THA, and yearly thereafter. A zonal analysis system was used to document osseous changes in the femur and the acetabulum. Acetabular cup and femoral stem subsidence and migration, femoral canal fill, and implant orientation were measured. Survival analysis of the procedure was conducted.Long-term follow-up was available for 37 dogs (46 prostheses). The median follow-up was 63 months. Limb function was normal for 37 limbs and abnormal for 9 limbs because of dislocation (n = 3), lumbosacral disease (n = 2), degenerative myelopathy (n = 1), autoimmune disease (n = 1), brain tumor (n = 1), or osteosarcoma of the femur (n = 1). All prosthetic stems and cups were fixed by bone ingrowth fixation. Osteolysis was not observed. Bone infarction occurred in five femoral canals (four dogs). The 6-year survival rate for the procedure was 87% (95% confidence interval, 72%-96%).Long-term fixation of the uncemented PCA acetabular cup and stem is successful in dogs, and long-term clinical function is excellent.", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "dateofacceptance": { + "value": "1999-02-20", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + "publisher": null, + "embargoenddate": null, + "source": null, + "fulltext": null, + "format": null, + "contributor": null, + "resourcetype": null, + "coverage": null, + "bestaccessright": null, + "context": null, + "externalReference": null, + "instance": [ + { + "license": null, + "accessright": null, + "instancetype": { + "classid": "0037", + "classname": "Clinical Trial", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + }, + "hostedby": null, + "url": [ + "https://pubmed.ncbi.nlm.nih.gov/10025635" + ], + "distributionlocation": null, + "collectedfrom": { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central", + "dataInfo": null + }, + "pid": [ + { + "value": "10025635", + "qualifier": { + "classid": "pmid", + "classname": "pmid", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "alternateIdentifier": [ + { + "value": "10.1053/jvet.1999.0010", + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + } + ], + "dateofacceptance": { + "value": "1999-02-20", + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:actionset", + "classname": "sysimport:actionset", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + } + } + }, + "processingchargeamount": null, + "processingchargecurrency": null, + "refereed": null + } + ], + "storagedate": null, + "device": null, + "size": null, + "version": null, + "lastmetadataupdate": null, + "metadataversionnumber": null, + "geolocation": null +}