Fixed the DownloadCSV parameters spec; the workflow that patches the hostedby now replaces the graph content (publication, datasource) in place rather than creating a copy
This commit is contained in:
parent
c3ad4ab701
commit
7ee2757fcd
|
@ -27,8 +27,8 @@ object SparkApplyHostedByMapToDatasource {
|
|||
d
|
||||
})(Encoders.bean((classOf[Datasource])))
|
||||
}
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
|
@ -41,18 +41,15 @@ object SparkApplyHostedByMapToDatasource {
|
|||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val graphPath = parser.get("graphPath")
|
||||
|
||||
val outputPath = parser.get("outputPath")
|
||||
val preparedInfoPath = parser.get("preparedInfoPath")
|
||||
|
||||
|
||||
implicit val formats = DefaultFormats
|
||||
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Datasource] = Encoders.bean(classOf[Datasource])
|
||||
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
|
||||
|
@ -62,6 +59,12 @@ object SparkApplyHostedByMapToDatasource {
|
|||
.map(ei => mapper.readValue(ei, classOf[EntityInfo])))
|
||||
|
||||
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
|
||||
|
||||
spark.read.textFile(outputPath)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.text(graphPath + "/datasource")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -75,8 +75,11 @@ object SparkApplyHostedByMapToResult {
|
|||
|
||||
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
|
||||
|
||||
|
||||
|
||||
spark.read.textFile(outputPath)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.text(graphPath + "/publication")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -11,6 +11,12 @@
|
|||
"paramDescription": "the path where to find the pre-processed data for unibi gold list and doj articles",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName":"of",
|
||||
"paramLongName":"outputFile",
|
||||
"paramDescription": "the output json file produced by the CSV download procedure",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "hnn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
|
|
Loading…
Reference in New Issue