forked from D-Net/dnet-hadoop

added better filter for openCitations

This commit is contained in:
parent 77bc9863e9
commit ca8d26bcb4
@@ -7,6 +7,7 @@ import org.apache.commons.io.IOUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.{Logger, LoggerFactory}
+import scala.collection.JavaConverters._

 object SparkConvertRDDtoDataset {

@@ -94,21 +95,29 @@ object SparkConvertRDDtoDataset {
     log.info("Converting Relation")

     val relationSemanticFilter = List(
-      "cites",
-      "iscitedby",
+//      "cites",
+//      "iscitedby",
       "merges",
       "ismergedin",
       "HasAmongTopNSimilarDocuments",
       "IsAmongTopNSimilarDocuments"
     )

     val rddRelation = spark.sparkContext
       .textFile(s"$sourcePath/relation")
       .map(s => mapper.readValue(s, classOf[Relation]))
       .filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
       .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
+      //filter OpenCitations relations
+      .filter(r => r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k => "opencitations".equalsIgnoreCase(k.getValue)))
       .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
     spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")

   }
 }
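For illustration only: a minimal, self-contained sketch of the relation-filtering logic this hunk introduces. Rel and Provenance are hypothetical stand-ins for the dnet-hadoop Relation and KeyValue model classes; the predicate mirrors the changed filters above — citation semantics ("cites"/"iscitedby") are no longer excluded by relClass, but relations whose collectedfrom provenance is OpenCitations (or missing) are dropped, and the remaining semantic exclusions still apply.

// Sketch only, not part of the commit. Provenance and Rel are invented
// stand-ins for the dnet-hadoop KeyValue and Relation model classes.
object OpenCitationsFilterSketch {

  case class Provenance(value: String)
  case class Rel(relClass: String, collectedfrom: List[Provenance])

  // relClass values that remain excluded after this commit
  val relationSemanticFilter: List[String] =
    List("merges", "ismergedin", "HasAmongTopNSimilarDocuments", "IsAmongTopNSimilarDocuments")

  // true when the relation should be kept in the converted dataset
  def keep(r: Rel): Boolean =
    r.collectedfrom != null && r.collectedfrom.nonEmpty &&
      !r.collectedfrom.exists(k => "opencitations".equalsIgnoreCase(k.value)) &&
      !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.relClass))

  def main(args: Array[String]): Unit = {
    println(keep(Rel("cites", List(Provenance("OpenCitations")))))  // false: OpenCitations provenance
    println(keep(Rel("cites", List(Provenance("Crossref")))))       // true: "cites" is no longer excluded
    println(keep(Rel("merges", List(Provenance("Crossref")))))      // false: "merges" is still filtered out
  }
}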
@@ -1,136 +0,0 @@
package eu.dnetlib.dhp.sx.graph.pangaea

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse

import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}

case class PangaeaDataModel(
  identifier: String,
  title: List[String],
  objectType: List[String],
  creator: List[String],
  publisher: List[String],
  dataCenter: List[String],
  subject: List[String],
  language: String,
  rights: String,
  parent: String,
  relation: List[String],
  linkage: List[(String, String)]
) {}

object PangaeaUtils {

  def toDataset(input: String): PangaeaDataModel = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
    val xml = (json \ "xml").extract[String]
    parseXml(xml)
  }

  def findDOIInRelation(input: List[String]): List[String] = {
    val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
    input
      .map(i => {
        val matcher = pattern.matcher(i)
        if (matcher.find())
          matcher.group(0)
        else
          null
      })
      .filter(i => i != null)
  }

  def attributeOpt(attribute: String, node: Node): Option[String] =
    node.attribute(attribute) flatMap (_.headOption) map (_.text)

  def extractLinkage(node: Elem): List[(String, String)] = {
    (node \ "linkage")
      .map(n => (attributeOpt("type", n), n.text))
      .filter(t => t._1.isDefined)
      .map(t => (t._1.get, t._2))(collection.breakOut)
  }

  def parseXml(input: String): PangaeaDataModel = {
    val xml = XML.loadString(input)

    val identifier = (xml \ "identifier").text
    val title: List[String] = (xml \ "title").map(n => n.text)(collection.breakOut)
    val pType: List[String] = (xml \ "type").map(n => n.text)(collection.breakOut)
    val creators: List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut)
    val publisher: List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut)
    val dataCenter: List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut)
    val subject: List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut)
    val language = (xml \ "language").text
    val rights = (xml \ "rights").text
    val parentIdentifier = (xml \ "parentIdentifier").text
    val relation: List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut)
    val relationFiltered = findDOIInRelation(relation)
    val linkage: List[(String, String)] = extractLinkage(xml)

    PangaeaDataModel(
      identifier,
      title,
      pType,
      creators,
      publisher,
      dataCenter,
      subject,
      language,
      rights,
      parentIdentifier,
      relationFiltered,
      linkage
    )
  }

  def getDatasetAggregator(): Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] =
    new Aggregator[(String, PangaeaDataModel), PangaeaDataModel, PangaeaDataModel] {

      override def zero: PangaeaDataModel = null

      override def reduce(b: PangaeaDataModel, a: (String, PangaeaDataModel)): PangaeaDataModel = {
        if (b == null)
          a._2
        else {
          if (a == null)
            b
          else {
            if (b.title != null && b.title.nonEmpty)
              b
            else
              a._2

          }
        }
      }

      override def merge(b1: PangaeaDataModel, b2: PangaeaDataModel): PangaeaDataModel = {
        if (b1 == null)
          b2
        else {
          if (b2 == null)
            b1
          else {
            if (b1.title != null && b1.title.nonEmpty)
              b1
            else
              b2

          }
        }
      }
      override def finish(reduction: PangaeaDataModel): PangaeaDataModel = reduction

      override def bufferEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

      override def outputEncoder: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
    }

}
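As background on the removed PangaeaUtils code, a small self-contained sketch of the DOI extraction that findDOIInRelation performed. The regex is copied from the removed method; DoiExtractionSketch, findDOI, and the input strings are invented for this illustration, and the helper returns Options instead of filtering nulls.

import java.util.regex.Pattern

// Standalone sketch of the DOI extraction done by the removed findDOIInRelation;
// names and inputs here are made up, the regex is taken from the code above.
object DoiExtractionSketch {
  private val pattern =
    Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")

  // keep the first DOI found in each string, drop strings without one
  def findDOI(input: List[String]): List[String] =
    input.flatMap { i =>
      val m = pattern.matcher(i)
      if (m.find()) Some(m.group(0)) else None
    }

  def main(args: Array[String]): Unit = {
    val rels = List("Related to doi:10.1594/PANGAEA.867919", "no identifier here")
    println(findDOI(rels)) // List(10.1594/PANGAEA.867919)
  }
}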
@@ -1,58 +0,0 @@
package eu.dnetlib.dhp.sx.graph.pangaea

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._

import scala.io.Source

object SparkGeneratePanagaeaDataset {

  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      Source
        .fromInputStream(
          getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/pangaea/pangaea_to_dataset.json")
        )
        .mkString
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkGeneratePanagaeaDataset.getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
    logger.info("Converting sequential file into Dataset")
    val sc: SparkContext = spark.sparkContext

    val workingPath: String = parser.get("workingPath")

    implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]

    val inputRDD: RDD[PangaeaDataModel] =
      sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))

    spark
      .createDataset(inputRDD)
      .as[PangaeaDataModel]
      .map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(PangaeaUtils.getDatasetAggregator().toColumn)
      .map(s => s._2)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/dataset")

  }

}
@@ -1,29 +0,0 @@
package eu.dnetlib.dhp.sx.pangaea

import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils
import org.junit.jupiter.api.Test

import java.util.TimeZone
import java.text.SimpleDateFormat
import java.util.Date
import scala.io.Source
class PangaeaTransformTest {



  @Test
  def test_dateStamp(): Unit = {



    val d = new Date()

    val s: String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS") format d}Z"
    println(s)


    val xml = Source.fromInputStream(getClass.getResourceAsStream("input.xml")).mkString
    println(PangaeaUtils.parseXml(xml))
  }

}