dnet-hadoop/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/pangaea/PangaeaUtils.scala

package eu.dnetlib.dhp.sx.graph.pangaea

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse

import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}
/** Intermediate model for a single PANGAEA metadata record.
  * `linkage` holds (type, URL) pairs extracted from the record's <linkage> elements.
  */
case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
)
object PangaeaUtils {

  /** Extracts the XML payload embedded in a JSON input line and parses it. */
  def toDataset(input: String): PangaeaDataModel = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
    val xml = (json \ "xml").extract[String]
    parseXml(xml)
  }
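  // Illustrative input line for toDataset (shape assumed from the "xml" lookup above):
  //   {"xml": "<record><identifier>...</identifier>...</record>"}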
  /** Keeps only the relation strings containing a DOI, reduced to the first DOI found in each. */
  def findDOIInRelation(input: List[String]): List[String] = {
    val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
    input
      .map { i =>
        val matcher = pattern.matcher(i)
        if (matcher.find())
          matcher.group(0)
        else
          null
      }
      .filter(_ != null)
  }
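  // Illustrative example (input value assumed):
  //   findDOIInRelation(List("see https://doi.org/10.1594/PANGAEA.867908"))
  //   returns List("10.1594/PANGAEA.867908")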
  /** Returns the text of the given attribute, if present on the node. */
  def attributeOpt(attribute: String, node: Node): Option[String] =
    node.attribute(attribute) flatMap (_.headOption) map (_.text)

  /** Collects (type, text) pairs from <linkage> elements, dropping those without a type attribute. */
  def extractLinkage(node: Elem): List[(String, String)] = {
    (node \ "linkage")
      .map(n => (attributeOpt("type", n), n.text))
      .filter(t => t._1.isDefined)
      .map(t => (t._1.get, t._2))(collection.breakOut)
  }
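  // Illustrative example (element content assumed):
  //   <linkage type="text/html">https://doi.pangaea.de/10.1594/PANGAEA.867908</linkage>
  //   yields ("text/html", "https://doi.pangaea.de/10.1594/PANGAEA.867908")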
  /** Parses a PANGAEA metadata XML record into a PangaeaDataModel. */
  def parseXml(input: String): PangaeaDataModel = {
    val xml = XML.loadString(input)
    val identifier = (xml \ "identifier").text
    val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
    val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
    val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
    val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
    val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
    val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
    val language = (xml \ "language").text
    val rights = (xml \ "rights").text
    val parentIdentifier = (xml \ "parentIdentifier").text
    val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
    // Keep only the relations that carry a DOI.
    val relationFiltered = findDOIInRelation(relation)
    val linkage: List[(String, String)] = extractLinkage(xml)
    PangaeaDataModel(
      identifier,
      title,
      pType,
      creators,
      publisher,
      dataCenter,
      subject,
      language,
      rights,
      parentIdentifier,
      relationFiltered,
      linkage
    )
  }
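  // Illustrative record shape accepted by parseXml (root element name is not constrained
  // by the code; the metadata elements are read as direct children of the root):
  //   <record>
  //     <identifier>...</identifier><title>...</title><type>...</type>
  //     <creator>...</creator><linkage type="...">...</linkage><relation>...</relation>
  //   </record>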
  /** Typed Spark aggregator that deduplicates records sharing the same key,
    * preferring whichever record already has a non-empty title.
    */
  def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
    new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {

      override def zero: PangaeaDataModel = null

      // Guard the incoming element before the buffer, so a null pair is never
      // dereferenced via a._2 while the buffer is still at its zero value.
      override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel =
        if (a == null) b
        else if (b == null) a._2
        else if (b.title != null && b.title.nonEmpty) b
        else a._2

      override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel =
        if (b1 == null) b2
        else if (b2 == null) b1
        else if (b1.title != null && b1.title.nonEmpty) b1
        else b2

      override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction

      override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

      override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    }
}
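
/*
 * Minimal usage sketch (not part of the original file): one way to wire the
 * aggregator above into a Spark job that deduplicates parsed PANGAEA records
 * by identifier. The paths "inputPath" and "workingPath/dataset" and the
 * local master are placeholders.
 */
object PangaeaUsageSketch {

  def main(args: Array[String]): Unit = {
    val spark = org.apache.spark.sql.SparkSession
      .builder()
      .appName("PangaeaUsageSketch")
      .master("local[*]")
      .getOrCreate()

    implicit val pdmEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    implicit val pairEncoder: Encoder[(String, PangaeaDataModel)] =
      Encoders.tuple(Encoders.STRING, pdmEncoder)

    // One JSON record per line, each carrying its XML payload under "xml".
    spark.read
      .textFile("inputPath")
      .map(PangaeaUtils.toDataset)
      // Key each record by identifier, then keep one record per key,
      // preferring the one with a non-empty title.
      .map(d => (d.identifier, d))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(PangaeaUtils.getDatasetAggregator().toColumn)
      .map(_._2)
      .write
      .mode("overwrite")
      .save("workingPath/dataset")
  }
}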