2021-06-30 11:09:30 +02:00
|
|
|
package eu.dnetlib.dhp.sx.graph.pangaea
|
2021-04-27 11:30:37 +02:00
|
|
|
|
|
|
|
import org.apache.spark.sql.expressions.Aggregator
|
|
|
|
import org.apache.spark.sql.{Encoder, Encoders}
|
|
|
|
import org.json4s
|
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.jackson.JsonMethods.parse
|
2021-12-06 13:57:41 +01:00
|
|
|
|
2021-04-28 10:21:03 +02:00
|
|
|
import java.util.regex.Pattern
|
|
|
|
import scala.language.postfixOps
|
|
|
|
import scala.xml.{Elem, Node, XML}
|
2021-04-27 11:30:37 +02:00
|
|
|
|
2022-01-11 16:57:48 +01:00
|
|
|
/** Immutable model of one Pangaea dataset record, populated from its metadata
  * XML by `PangaeaUtils.parseXml`.
  *
  * @param identifier record identifier (text of the `identifier` element)
  * @param title      dataset titles (one entry per `title` element)
  * @param objectType object/resource types (one entry per `type` element)
  * @param creator    creator names
  * @param publisher  publisher names
  * @param dataCenter hosting data-center names
  * @param subject    subject keywords
  * @param language   record language (text of the `language` element)
  * @param rights     rights/licence statement
  * @param parent     parent record identifier (text of `parentIdentifier`)
  * @param relation   related identifiers, reduced to DOIs by `PangaeaUtils.findDOIInRelation`
  * @param linkage    (type attribute, link text) pairs taken from `linkage` elements
  */
case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
)
|
2021-04-27 11:30:37 +02:00
|
|
|
|
|
|
|
object PangaeaUtils {

  /** DOI matcher (e.g. "10.1594/PANGAEA.867623").
    * Compiled once here instead of on every `findDOIInRelation` call.
    */
  private val doiPattern: Pattern =
    Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")

  /** Deserializes one JSON-wrapped record: reads the `xml` string field and
    * parses it into a [[PangaeaDataModel]].
    *
    * @param input JSON document containing an `xml` field with the metadata XML
    * @return the parsed data model
    */
  def toDataset(input: String): PangaeaDataModel = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
    val xml = (json \ "xml").extract[String]
    parseXml(xml)
  }

  /** Extracts the first DOI found in each relation string; entries without a
    * DOI are dropped.
    *
    * @param input raw relation strings
    * @return the matched DOIs, in input order
    */
  def findDOIInRelation(input: List[String]): List[String] =
    // flatMap over Option instead of mapping to null and filtering it out.
    input.flatMap { relation =>
      val matcher = doiPattern.matcher(relation)
      if (matcher.find()) Some(matcher.group(0)) else None
    }

  /** Returns the text of the named attribute on `node`, if present. */
  def attributeOpt(attribute: String, node: Node): Option[String] =
    node.attribute(attribute) flatMap (_.headOption) map (_.text)

  /** Collects (type-attribute, element text) pairs from the `linkage` children
    * of `node`, keeping only elements that carry a `type` attribute.
    */
  def extractLinkage(node: Elem): List[(String, String)] = {
    (node \ "linkage")
      .map(n => (attributeOpt("type", n), n.text))
      .filter(t => t._1.isDefined)
      .map(t => (t._1.get, t._2))(collection.breakOut)
  }

  /** Parses a Pangaea metadata XML document into a [[PangaeaDataModel]].
    *
    * Multi-valued elements are collected as lists; relation values are reduced
    * to their DOIs via `findDOIInRelation`.
    *
    * @param input the metadata XML as a string
    * @return the populated data model
    */
  def parseXml(input: String): PangaeaDataModel = {
    val xml = XML.loadString(input)

    val identifier = (xml \ "identifier").text
    val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
    val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
    val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
    val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
    val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
    val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
    val language = (xml \ "language").text
    val rights = (xml \ "rights").text
    val parentIdentifier = (xml \ "parentIdentifier").text
    val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
    // Keep only the DOIs embedded in the relation strings.
    val relationFiltered = findDOIInRelation(relation)
    val linkage: List[(String, String)] = extractLinkage(xml)

    PangaeaDataModel(
      identifier,
      title,
      pType,
      creators,
      publisher,
      dataCenter,
      subject,
      language,
      rights,
      parentIdentifier,
      relationFiltered,
      linkage
    )
  }

  /** Builds a Spark [[Aggregator]] over keyed records that, for each key, keeps
    * a record with a non-empty title when one exists, otherwise whichever
    * non-null record it sees.
    *
    * `zero` is null by design: it marks "no record seen yet" and is replaced by
    * the first reduced value.
    */
  def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
    new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {

      override def zero: PangaeaDataModel = null

      override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel = {
        // Flattened equivalent of the nested if/else: prefer the buffer once it
        // has a usable (non-empty) title, otherwise take the incoming record.
        if (b == null) a._2
        else if (a == null) b
        else if (b.title != null && b.title.nonEmpty) b
        else a._2
      }

      override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel = {
        // Same preference rule when combining two partial buffers.
        if (b1 == null) b2
        else if (b2 == null) b1
        else if (b1.title != null && b1.title.nonEmpty) b1
        else b2
      }

      override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction

      override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

      override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    }

}
|