From 1bbf408a084a78e3411e9593d6b6ffe3e199c6e8 Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Tue, 4 Apr 2023 09:06:47 +0200
Subject: [PATCH] implemented new data model including all the OpenAIRE typologies

---
 .../dhp/sx/graph/scholix/ScholixUtils.scala   | 51 +++++++++++++++----
 .../sx/graph/SparkConvertObjectToJson.scala   |  2 +-
 .../sx/graph/SparkCreateSummaryObject.scala   |  1 -
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
index a995016a8..afc4f78b9 100644
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala
@@ -1,14 +1,23 @@
 package eu.dnetlib.dhp.sx.graph.scholix
 
-import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
+import eu.dnetlib.dhp.schema.oaf.{
+  Dataset,
+  OtherResearchProduct,
+  Publication,
+  Relation,
+  Result,
+  Software,
+  StructuredProperty
+}
 import eu.dnetlib.dhp.schema.sx.scholix._
-import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
+import eu.dnetlib.dhp.schema.sx.summary.{AuthorPid, CollectedFromType, SchemeValue, ScholixSummary, Typology}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Encoder, Encoders}
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
+
 import scala.collection.JavaConverters._
 import scala.io.Source
 
@@ -232,7 +241,16 @@ object ScholixUtils extends Serializable {
 
     if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
       val l: List[ScholixEntityId] =
-        summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
+        summaryObject.getAuthor.asScala
+          .map(a => {
+            if (a.getORCID != null)
+              new ScholixEntityId(
+                a.getFullname,
+                List(new ScholixIdentifier(a.getORCID, "ORCID", s"https://orcid.org/${a.getORCID}")).asJava
+              )
+            else new ScholixEntityId(a.getFullname, null)
+          })
+          .toList
       if (l.nonEmpty)
         r.setCreator(l.asJava)
     }
@@ -377,11 +395,13 @@
     if (persistentIdentifiers.isEmpty)
       return null
     s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
-
+    r match {
+      case _: Publication          => s.setTypology(Typology.publication)
+      case _: Dataset              => s.setTypology(Typology.dataset)
+      case _: Software             => s.setTypology(Typology.software)
+      case _: OtherResearchProduct => s.setTypology(Typology.otherresearchproduct)
+      case _                       =>
+    }
     s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
 
     if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
@@ -393,7 +413,20 @@
     }
 
     if (r.getAuthor != null && !r.getAuthor.isEmpty) {
-      val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
+      val authors: List[AuthorPid] = r.getAuthor.asScala
+        .map(a => {
+          var ORCID: String = null;
+          if (a.getPid != null) {
+            val result = a.getPid.asScala.find(p =>
+              p.getQualifier != null && p.getQualifier.getClassid != null && p.getQualifier.getClassid.toLowerCase
+                .contains("orcid")
+            )
+            if (result.isDefined)
+              ORCID = result.get.getValue
+          }
+          new AuthorPid(a.getFullname, ORCID)
+        })
+        .toList
       if (authors.nonEmpty)
         s.setAuthor(authors.asJava)
     }
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
index 6695ebd3c..04e242ee5 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkConvertObjectToJson.scala
@@ -58,7 +58,7 @@ object SparkConvertObjectToJson {
       case "scholix" =>
         log.info("Serialize Scholix")
        val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
-//        val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
+        val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
         if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) {
           val mp = toInt(maxPidNumberFilter).get
           d
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
index 6d489e8cb..565d41720 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateSummaryObject.scala
@@ -34,7 +34,6 @@ object SparkCreateSummaryObject {
     log.info(s"targetPath -> $targetPath")
 
     implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
-    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
 
     implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
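
Note for reviewers: the two behavioural changes above are the pattern-match typology dispatch and the ORCID lookup over author PIDs. The following minimal Scala sketch reproduces both in isolation, without the dhp-schemas dependency. ScholixUtilsSketch, Pid, and the Typology enumeration are simplified stand-ins invented for illustration; they are NOT the real eu.dnetlib API.

// Self-contained sketch of the two behaviours introduced by this patch.
// The types below are simplified stand-ins for the eu.dnetlib schema
// classes (Result, Typology, AuthorPid, StructuredProperty).
object ScholixUtilsSketch {

  trait Result
  final class Publication          extends Result
  final class Dataset              extends Result
  final class Software             extends Result
  final class OtherResearchProduct extends Result

  object Typology extends Enumeration {
    val publication, dataset, software, otherresearchproduct = Value
  }

  // Typology dispatch: one case per OpenAIRE result type, with a
  // catch-all that leaves the typology unset (the patch simply skips
  // setTypology in that branch).
  def resolveTypology(r: Result): Option[Typology.Value] = r match {
    case _: Publication          => Some(Typology.publication)
    case _: Dataset              => Some(Typology.dataset)
    case _: Software             => Some(Typology.software)
    case _: OtherResearchProduct => Some(Typology.otherresearchproduct)
    case _                       => None
  }

  // A PID with a qualifier class id, standing in for StructuredProperty.
  final case class Pid(classid: String, value: String)

  // ORCID lookup: scan an author's PIDs for a qualifier whose class id
  // contains "orcid" (case-insensitive), as the patched summary mapper does.
  def findOrcid(pids: Seq[Pid]): Option[String] =
    pids
      .find(p => p.classid != null && p.classid.toLowerCase.contains("orcid"))
      .map(_.value)

  def main(args: Array[String]): Unit = {
    println(resolveTypology(new Software))                               // Some(software)
    println(findOrcid(Seq(Pid("orcid_pending", "0000-0002-1825-0097")))) // Some(0000-0002-1825-0097)
    println(findOrcid(Seq(Pid("doi", "10.1000/example"))))               // None
  }
}

Using contains("orcid") rather than strict equality means class-id variants such as orcid_pending also match, which appears to be the intent of the lowercase/contains check in the patched mapper. The sample PID values above are illustrative only.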