Compare commits

...

2 Commits

Author SHA1 Message Date
Sandro La Bruzzo 2984e8e59f updated index mapping to include orcid 2023-04-13 10:51:10 +02:00
Sandro La Bruzzo 1bbf408a08 implemented new datamodel including all the openAire typologies 2023-04-04 09:06:47 +02:00
5 changed files with 52 additions and 16 deletions

View File

@@ -1,14 +1,23 @@
 package eu.dnetlib.dhp.sx.graph.scholix
-import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
+import eu.dnetlib.dhp.schema.oaf.{
+  Dataset,
+  OtherResearchProduct,
+  Publication,
+  Relation,
+  Result,
+  Software,
+  StructuredProperty
+}
 import eu.dnetlib.dhp.schema.sx.scholix._
-import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
+import eu.dnetlib.dhp.schema.sx.summary.{AuthorPid, CollectedFromType, SchemeValue, ScholixSummary, Typology}
 import eu.dnetlib.dhp.utils.DHPUtils
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Encoder, Encoders}
 import org.json4s
 import org.json4s.DefaultFormats
 import org.json4s.jackson.JsonMethods.parse
 import scala.collection.JavaConverters._
 import scala.io.Source
@@ -232,7 +241,16 @@ object ScholixUtils extends Serializable {
     if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
       val l: List[ScholixEntityId] =
-        summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
+        summaryObject.getAuthor.asScala
+          .map(a => {
+            if (a.getORCID != null)
+              new ScholixEntityId(
+                a.getFullname,
+                List(new ScholixIdentifier(a.getORCID, "ORCID", s"https://orcid.org/${a.getORCID}")).asJava
+              )
+            else new ScholixEntityId(a.getFullname, null)
+          })
+          .toList
       if (l.nonEmpty)
         r.setCreator(l.asJava)
     }
@@ -377,11 +395,13 @@ object ScholixUtils extends Serializable {
     if (persistentIdentifiers.isEmpty)
       return null
     s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
+    r match {
+      case _: Publication => s.setTypology(Typology.publication)
+      case _: Dataset => s.setTypology(Typology.dataset)
+      case _: Software => s.setTypology(Typology.software)
+      case _: OtherResearchProduct => s.setTypology(Typology.otherresearchproduct)
+      case _ =>
+    }
 s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
 if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
@@ -393,7 +413,20 @@ object ScholixUtils extends Serializable {
       }
     if (r.getAuthor != null && !r.getAuthor.isEmpty) {
-      val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
+      val authors: List[AuthorPid] = r.getAuthor.asScala
+        .map(a => {
+          var ORCID: String = null;
+          if (a.getPid != null) {
+            val result = a.getPid.asScala.find(p =>
+              p.getQualifier != null && p.getQualifier.getClassid != null && p.getQualifier.getClassid.toLowerCase
+                .contains("orcid")
+            )
+            if (result.isDefined)
+              ORCID = result.get.getValue
+          }
+          new AuthorPid(a.getFullname, ORCID)
+        })
+        .toList
       if (authors.nonEmpty)
         s.setAuthor(authors.asJava)
     }
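
The PID scan above is inlined in the map; a minimal helper that factors it out could look like the following sketch, which is hypothetical and not part of either commit, assuming only the accessors the diff already uses (getPid, getQualifier, getClassid, getValue):

import eu.dnetlib.dhp.schema.oaf.Author
import scala.collection.JavaConverters._

// Hypothetical refactor: return the value of the first author PID whose
// qualifier classid mentions "orcid", mirroring the inline scan above.
def extractOrcid(a: Author): Option[String] =
  Option(a.getPid)
    .map(_.asScala.toList)
    .getOrElse(Nil)
    .find(p =>
      p.getQualifier != null &&
        p.getQualifier.getClassid != null &&
        p.getQualifier.getClassid.toLowerCase.contains("orcid")
    )
    .map(_.getValue)

With such a helper, the map body would shrink to new AuthorPid(a.getFullname, extractOrcid(a).orNull).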

View File

@@ -58,7 +58,7 @@ object SparkConvertObjectToJson {
       case "scholix" =>
         log.info("Serialize Scholix")
         val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
-        // val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
+        val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
         if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) {
           val mp = toInt(maxPidNumberFilter).get
           d
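
Re-enabling u suggests the update Scholix records are meant to be merged with the base dataset before serialization; a minimal sketch of that step, where the plain union (and the absence of deduplication) is an assumption, not shown in this diff:

import org.apache.spark.sql.Dataset
import eu.dnetlib.dhp.schema.sx.scholix.Scholix

// Sketch only: combine base and update Scholix records into one dataset.
def mergeScholix(d: Dataset[Scholix], u: Dataset[Scholix]): Dataset[Scholix] =
  d.union(u)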

View File

@@ -34,7 +34,6 @@ object SparkCreateSummaryObject {
     log.info(s"targetPath -> $targetPath")
     implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
     implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
-    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
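
Only the removal is visible here; presumably the ScholixSummary encoder moves off kryo under the new datamodel. A sketch of one plausible replacement, which is an assumption and not the commit's code:

import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import org.apache.spark.sql.{Encoder, Encoders}

// Assumption: ScholixSummary is a Java bean in the new datamodel, so a
// bean encoder (readable columns) could replace the opaque kryo encoder.
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.bean(classOf[ScholixSummary])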

View File

@@ -50,7 +50,7 @@
         --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
     </spark-opts>
     <arg>--master</arg><arg>yarn</arg>
-    <arg>--sourcePath</arg><arg>${sourcePath}/summaries_json</arg>
+    <arg>--sourcePath</arg><arg>${sourcePath}/index/summaries_json</arg>
     <arg>--index</arg><arg>${index}_object</arg>
     <arg>--idPath</arg><arg>id</arg>
     <arg>--cluster</arg><arg>${esCluster}</arg>
@@ -76,7 +76,7 @@
         --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
     </spark-opts>
     <arg>--master</arg><arg>yarn</arg>
-    <arg>--sourcePath</arg><arg>${sourcePath}/scholix_json</arg>
+    <arg>--sourcePath</arg><arg>${sourcePath}/index/scholix_json</arg>
     <arg>--index</arg><arg>${index}_scholix</arg>
     <arg>--idPath</arg><arg>identifier</arg>
    <arg>--cluster</arg><arg>${esCluster}</arg>
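
Both indexing actions now read from an index/ subfolder, so whichever job materializes summaries_json and scholix_json must write under that prefix. A sketch of the matching writer call, where the path layout is the assumption and the JSON strings are produced upstream:

import org.apache.spark.sql.{Dataset, SaveMode}

// Sketch only: persist serialized JSON under the "/index/" layout that the
// workflow arguments above expect (name would be "scholix_json" or
// "summaries_json").
def writeForIndexing(json: Dataset[String], sourcePath: String, name: String): Unit =
  json.write
    .mode(SaveMode.Overwrite)
    .option("compression", "gzip")
    .text(s"$sourcePath/index/$name")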

View File

@@ -11,9 +11,13 @@
         }
       },
       "author": {
-        "type": "text",
-        "fields": {
-          "keyword": {
+        "type": "nested",
+        "properties": {
+          "fullname": {
             "type": "keyword",
             "ignore_above": 256
-          }
+          },
+          "orcid": {
+            "type": "keyword",
+            "ignore_above": 256
+          }
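
With the nested type, each author object keeps its fullname and orcid paired, so matching a specific ORCID requires an Elasticsearch nested query rather than a flat term query. A sketch of such a request body, held in a Scala string; the author.orcid path follows the mapping above, while the ORCID value is only an example:

// Example query body (sketch): find documents having an author with a
// given ORCID; "nested" is required because "author" is a nested field.
val byOrcid: String =
  """{
    |  "query": {
    |    "nested": {
    |      "path": "author",
    |      "query": { "term": { "author.orcid": "0000-0002-1825-0097" } }
    |    }
    |  }
    |}""".stripMargin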