From f11dfc51f7056d076f8bf3c7b7f2fcb1ce54c1c1 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Tue, 22 Mar 2022 16:39:21 +0100 Subject: [PATCH] fix resolved url format, added alternate identifier from original pid --- .../bioschema/BioschemaModelConstants.scala | 20 +++-- .../BioschemaToOAFTransformation.scala | 80 ++++++++++--------- 2 files changed, 55 insertions(+), 45 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala index d3d4d508a..ce56305f5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaModelConstants.scala @@ -12,12 +12,17 @@ import java.util.regex.Pattern The following class are utility class used for the mapping from bioschema json datacite to OAF Schema */ + case class RelatedIdentifierType( relationType: String, relatedIdentifier: String, relatedIdentifierType: String ) {} +case class AlternateIdentifierType( + alternateIdentifier: String +) {} + case class IdentifierType( identifier: String, identifierType: String @@ -41,10 +46,10 @@ case class CreatorType( case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {} case class SubjectType( - schemeURI: Option[String], - value: Option[String], - subjectScheme: Option[String] - ) {} + schemeURI: Option[String], + value: Option[String], + subjectScheme: Option[String] +) {} case class DescriptionType(descriptionType: Option[String], description: Option[String]) {} @@ -70,9 +75,10 @@ object BioschemaModelConstants { val PED_PREFIX: String = "ped_________" - val resolvedURL: Map[String, String] = Map( - "uniprot" -> "https://www.uniprot.org/uniprot/", - "pubmed" -> "https://pubmed.ncbi.nlm.nih.gov/" + val resolvedURLPattern: Map[String, String] = Map( + "https://identifiers.org/pubmed:" -> "pubmed", + "http://purl.uniprot.org/uniprot/" -> "uniprot", + "https://identifiers.org/uniprot:" -> "uniprot" ) val collectedFromMap: Map[String, KeyValue] = { diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala index 72a0d5ed1..a8aad8e07 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/bioschema/BioschemaToOAFTransformation.scala @@ -11,9 +11,7 @@ import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse -import java.text.SimpleDateFormat import java.time.LocalDate -import java.util.{Date, Locale} import scala.collection.JavaConverters._ object BioschemaToOAFTransformation { @@ -200,32 +198,42 @@ object BioschemaToOAFTransformation { if (result.getId == null) return List() - val alternativeIdentifierUrls: List[String] = for { + val alternativeIdentifierUrls: List[AlternateIdentifierType] = for { JObject(alternateIdentifiers) <- json \\ "alternateIdentifiers" JField("alternateIdentifier", JString(alternateIdentifier)) <- alternateIdentifiers - foundResolvedURLId = resolvedURL - .map(k => { - if (alternateIdentifier.contains(s"${k._1}:")) - k._1 - else - null - }) - .find(s => s != null) - alternativeIdentifierUrl = StringUtils.substringAfter(alternateIdentifier, s"${foundResolvedURLId.get}:") - } yield alternativeIdentifierUrl + } yield AlternateIdentifierType(alternateIdentifier) - alternativeIdentifierUrls.map(id => { - var alternateIdentifier: StructuredProperty = null - alternateIdentifier = OafMapperUtils.structuredProperty( - id, - "uniprot", - "uniprot", - ModelConstants.DNET_PID_TYPES, - ModelConstants.DNET_PID_TYPES, - dataInfo - ) - instance.setAlternateIdentifier(List(alternateIdentifier).asJava) - }) + val alternativeIdentifierUrl: AlternateIdentifierType = alternativeIdentifierUrls.asJava.get(0) + + val alternativeIdentifiers = resolvedURLPattern + .map(pattern => { + if (alternativeIdentifierUrl.alternateIdentifier.startsWith(s"${pattern._1}")) { + val relatedId = StringUtils.substringAfter(alternativeIdentifierUrl.alternateIdentifier, s"${pattern._1}") + OafMapperUtils.structuredProperty( + relatedId, + pattern._2, + pattern._2, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + dataInfo + ) + } else + null + }) + .find(s => s != null) + .get + + val defaultAlternatedIdentifer: StructuredProperty = OafMapperUtils.structuredProperty( + pid, + datasourceKey, + datasourceKey, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + dataInfo + ) + var finalAlternativeIdentifiers: List[StructuredProperty] = List() + finalAlternativeIdentifiers = List(alternativeIdentifiers) ::: List(defaultAlternatedIdentifer) + instance.setAlternateIdentifier(finalAlternativeIdentifiers.asJava) if (exportLinks) { val rels: List[RelatedIdentifierType] = for { @@ -272,21 +280,17 @@ object BioschemaToOAFTransformation { rel.setProperties(List(dateProps).asJava) - val foundResolvedURLId = resolvedURL - .map(k => { - if (r.relatedIdentifier.contains(s"${k._1}:")) - k._1 - else + resolvedURLPattern + .map(p => { + if (r.relatedIdentifier.startsWith(s"${p._1}")) { + val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${p._1}") + rel.setTarget( + DHPUtils.generateUnresolvedIdentifier(relatedId, p._2) + ) + } else null }) - .find(s => s != null); - if (foundResolvedURLId.nonEmpty) { - val relatedId = StringUtils.substringAfter(r.relatedIdentifier, s"${foundResolvedURLId.get}:") - rel.setTarget(s"${resolvedURL(foundResolvedURLId.get)}${relatedId}") - } else - rel.setTarget( - DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) - ) + .find(s => s != null) rel.setSource(id) rel.setCollectedfrom(List(collectedFromMap(datasourceKey)).asJava) rel.getCollectedfrom.asScala.map(c => c.getValue).toList