package eu.dnetlib.dhp.enrich.orcid

import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, Encoder, Encoders, Row, SparkSession}
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}

class EnrichOrcidTest {

  val log: Logger = LoggerFactory.getLogger(getClass)

  @Test
  def test(): Unit = {

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    // spark.sparkContext.setLogLevel("ERROR")

    // new SparkEnrichGraphWithOrcidAuthors(null, null, null)
    //   .enrichResult(
    //     spark,
    //     "/Users/sandro/orcid_test/publication",
    //     "",
    //     "/tmp/graph/",
    //     Encoders.bean(classOf[Publication])
    //   )

    val schema = Encoders.bean(classOf[Publication]).schema
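
    // Possible use of the bean-derived schema (a sketch, not part of the
    // original run): read the local publication dump with explicit OAF types
    // instead of relying on schema inference; the path reuses the local test
    // path referenced elsewhere in this file.
    // val publications = spark.read.schema(schema).json("/Users/sandro/orcid_test/publication")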

    // Sketch (left commented in the original): a UDF that reduces each author
    // Row to an AuthorPid(fullname, pids), keeping only authors that carry at
    // least one pid.
    // val simplifyAuthor = udf((r: Seq[Row]) => {
    //   r
    //     .map(k =>
    //       AuthorPid(
    //         k.getAs[String]("fullname"),
    //         k.getAs[Seq[Row]]("pid")
    //           .map(p => Pid(p.getAs[Row]("qualifier").getAs[String]("classid"), p.getAs[String]("value")))
    //           .toList
    //       )
    //     )
    //     .filter(l => l.pids.nonEmpty)
    //     .toList
    // })
    //
    // Sketch (left commented in the original): a UDF that flags authors whose
    // ORCID-typed pids contain more than one distinct value, i.e. conflicting
    // ORCID identifiers attached to the same author.
    // val wrong_orcid_intersection = udf((a: Seq[Row]) => {
    //   a.map(author => {
    //     val pids_with_orcid: Seq[Row] = author
    //       .getAs[Seq[Row]]("pids")
    //       .filter(p =>
    //         p.getAs[String]("pidScheme") != null && p.getAs[String]("pidScheme").toLowerCase.contains("orcid")
    //       )
    //     if (pids_with_orcid.exists(p => p.getAs[String]("pidScheme").equals("ORCID"))) {
    //       if (pids_with_orcid.map(p => p.getAs[String]("pidValue").toLowerCase).distinct.size > 1) {
    //         AuthorPid(
    //           author.getAs[String]("fullName"),
    //           pids_with_orcid.map(p => Pid(p.getAs[String]("pidScheme"), p.getAs[String]("pidValue"))).toList
    //         )
    //       } else
    //         null
    //     } else
    //       null
    //   }).filter(author => author != null)
    // })
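
    // The two UDFs above reference AuthorPid and Pid helpers that are not
    // defined in this file. A minimal sketch of what they could look like,
    // with hypothetical field names inferred from the usages above:
    case class Pid(pidScheme: String, pidValue: String)
    case class AuthorPid(fullName: String, pids: List[Pid])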

    import spark.implicits._

    // val enriched = spark.read
    //   .schema(schema)
    //   .json("/Users/sandro/orcid_test/publication_enriched")
    //   .select(col("id"), explode(col("author")).as("authors"))
    //   .withColumn("ap", col("authors.pid.qualifier.classid"))
    //   .withColumn("dp", col("authors.pid.datainfo.provenanceAction.classid"))
    //   .show()
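
    // Possible sanity check after enrichment (a sketch, assuming the same
    // enriched dump as above): count author pids per scheme to see how many
    // ORCID identifiers were attached.
    // spark.read
    //   .schema(schema)
    //   .json("/Users/sandro/orcid_test/publication_enriched")
    //   .select(explode(col("author")).as("a"))
    //   .select(explode(col("a.pid")).as("p"))
    //   .groupBy(col("p.qualifier.classid"))
    //   .count()
    //   .show()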
  }

}