Implemented new data model including all the OpenAIRE typologies

This commit is contained in:
Sandro La Bruzzo 2023-04-04 09:06:47 +02:00
parent 9c1df15071
commit 1bbf408a08
3 changed files with 43 additions and 11 deletions

View File

@ -1,14 +1,23 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.oaf.{
Dataset,
OtherResearchProduct,
Publication,
Relation,
Result,
Software,
StructuredProperty
}
import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.schema.sx.summary.{AuthorPid, CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
@ -232,7 +241,16 @@ object ScholixUtils extends Serializable {
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
summaryObject.getAuthor.asScala
.map(a => {
if (a.getORCID != null)
new ScholixEntityId(
a.getFullname,
List(new ScholixIdentifier(a.getORCID, "ORCID", s"https://orcid.org/${a.getORCID}")).asJava
)
else new ScholixEntityId(a.getFullname, null)
})
.toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
@ -377,11 +395,13 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
r match {
case _: Publication => s.setTypology(Typology.publication)
case _: Dataset => s.setTypology(Typology.dataset)
case _: Software => s.setTypology(Typology.software)
case _: OtherResearchProduct => s.setTypology(Typology.otherresearchproduct)
case _ =>
}
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
@ -393,7 +413,20 @@ object ScholixUtils extends Serializable {
}
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
val authors: List[AuthorPid] = r.getAuthor.asScala
.map(a => {
var ORCID: String = null;
if (a.getPid != null) {
val result = a.getPid.asScala.find(p =>
p.getQualifier != null && p.getQualifier.getClassid != null && p.getQualifier.getClassid.toLowerCase
.contains("orcid")
)
if (result.isDefined)
ORCID = result.get.getValue
}
new AuthorPid(a.getFullname, ORCID)
})
.toList
if (authors.nonEmpty)
s.setAuthor(authors.asJava)
}

View File

@ -58,7 +58,7 @@ object SparkConvertObjectToJson {
case "scholix" =>
log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
// val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) {
val mp = toInt(maxPidNumberFilter).get
d

View File

@ -34,7 +34,6 @@ object SparkCreateSummaryObject {
log.info(s"targetPath -> $targetPath")
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]