111 lines
3.9 KiB
Scala
111 lines
3.9 KiB
Scala
package eu.dnetlib.dhp.sx.graph.pangaea
|
|
|
|
import org.apache.spark.sql.expressions.Aggregator
|
|
import org.apache.spark.sql.{Encoder, Encoders}
|
|
import org.json4s
|
|
import org.json4s.DefaultFormats
|
|
import org.json4s.jackson.JsonMethods.parse
|
|
import java.util.regex.Pattern
|
|
import scala.language.postfixOps
|
|
import scala.xml.{Elem, Node, XML}
|
|
|
|
/**
  * Flat, immutable representation of one Pangaea metadata record.
  *
  * @param identifier record identifier
  * @param title      titles of the record
  * @param objectType type values of the record
  * @param creator    creator names
  * @param publisher  publisher names
  * @param dataCenter data center names
  * @param subject    subject terms
  * @param language   language of the record
  * @param rights     rights statement
  * @param parent     identifier of the parent record
  * @param relation   related identifiers
  * @param linkage    pairs of strings describing the record's links
  *                   (presumably link type and link target -- confirm against the producer)
  */
case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
)
|
|
|
|
/** Helpers that turn harvested Pangaea records into [[PangaeaDataModel]] instances. */
object PangaeaUtils {

  /** DOI matcher, compiled once; a compiled `Pattern` is immutable and can be shared. */
  private val DoiPattern: Pattern =
    Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")

  /**
    * Builds a [[PangaeaDataModel]] from a JSON document.
    *
    * The JSON is parsed, its `xml` field is read as a string and that string is
    * handed to [[parseXml]].
    *
    * @param input JSON text containing an `xml` string field
    * @return the model parsed from the embedded XML
    */
  def toDataset(input: String): PangaeaDataModel = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    val json: json4s.JValue = parse(input)
    val xml = (json \ "xml").extract[String]
    parseXml(xml)
  }

  /**
    * Extracts DOIs from a list of free-text relation values.
    *
    * For each element only the first DOI match is kept; elements without a match
    * (and null elements) are dropped.
    *
    * @param input relation strings to scan
    * @return the DOIs found, in input order
    */
  def findDOIInRelation(input: List[String]): List[String] =
    input.flatMap { relation =>
      Option(relation).flatMap { text =>
        // A Matcher is stateful, so a fresh one is created per string.
        val matcher = DoiPattern.matcher(text)
        if (matcher.find()) Some(matcher.group(0)) else None
      }
    }

  /**
    * Reads an attribute of an XML node.
    *
    * @param attribute attribute name
    * @param node      node to inspect
    * @return the text of the first attribute value, or `None` when the attribute is missing
    */
  def attributeOpt(attribute: String, node: Node): Option[String] =
    node.attribute(attribute).flatMap(_.headOption).map(_.text)

  /**
    * Collects the `linkage` children of `node` as `(type attribute, element text)` pairs.
    *
    * `linkage` elements without a `type` attribute are skipped.
    *
    * @param node element whose direct `linkage` children are read
    * @return one pair per typed `linkage` child, in document order
    */
  def extractLinkage(node: Elem): List[(String, String)] =
    (node \ "linkage").toList.flatMap { link =>
      attributeOpt("type", link).map(linkType => (linkType, link.text))
    }

  /** Text content of every direct child of `parent` named `label`, in document order. */
  private def childTexts(parent: Node, label: String): List[String] =
    (parent \ label).toList.map(_.text)

  /**
    * Parses a Pangaea XML record into a [[PangaeaDataModel]].
    *
    * Every field is read from the direct children of the root element. Single-valued
    * fields are the concatenated text of the matching children (empty string when
    * there are none). The `relation` field keeps only the DOIs found by
    * [[findDOIInRelation]].
    *
    * @param input XML document as a string
    * @return the populated model
    */
  def parseXml(input: String): PangaeaDataModel = {
    // NOTE(review): XML.loadString relies on the default parser configuration; if records
    // can come from untrusted sources, confirm that external entities are not resolved.
    val xml = XML.loadString(input)

    PangaeaDataModel(
      identifier = (xml \ "identifier").text,
      title = childTexts(xml, "title"),
      objectType = childTexts(xml, "type"),
      creator = childTexts(xml, "creator"),
      publisher = childTexts(xml, "publisher"),
      dataCenter = childTexts(xml, "dataCenter"),
      subject = childTexts(xml, "subject"),
      language = (xml \ "language").text,
      rights = (xml \ "rights").text,
      parent = (xml \ "parentIdentifier").text,
      relation = findDOIInRelation(childTexts(xml, "relation")),
      linkage = extractLinkage(xml)
    )
  }

  /**
    * Picks which of two records to keep: a null side always loses; otherwise `current`
    * wins when it has at least one title, else `candidate` is returned.
    */
  private def preferTitled(current: PangaeaDataModel, candidate: PangaeaDataModel): PangaeaDataModel =
    if (current == null) candidate
    else if (candidate == null) current
    else if (current.title != null && current.title.nonEmpty) current
    else candidate

  /**
    * Creates an aggregator that collapses keyed records into a single record.
    *
    * The buffer starts as `null`; the record kept is the first one seen that has a
    * non-empty title, falling back to the latest non-null record otherwise.
    *
    * @return a new aggregator using Kryo encoders for buffer and output
    */
  def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
    new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {

      // null marks "nothing aggregated yet"; preferTitled treats it as the losing side.
      override def zero: PangaeaDataModel = null

      override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel =
        // Guard the row itself first: a null row must not be dereferenced, and a null
        // payload must not replace a record that was already collected.
        if (a == null) b else preferTitled(b, a._2)

      override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel =
        preferTitled(b1, b2)

      override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction

      override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

      override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    }

}