forked from D-Net/dnet-hadoop
added params skip update to direct transform in OAF, this should be set to true in production
This commit is contained in:
parent
511da98d0c
commit
7387416e90
|
@ -32,7 +32,7 @@ object SparkCreateBaselineDataFrame {
|
|||
val start = l.indexOf("<a href=\"")
|
||||
|
||||
if (start >= 0 && end > start)
|
||||
l.substring(start + 9, (end - start))
|
||||
l.substring(start + 9, end - start)
|
||||
else
|
||||
""
|
||||
}.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
|
||||
|
@ -158,6 +158,9 @@ object SparkCreateBaselineDataFrame {
|
|||
val hdfsServerUri = parser.get("hdfsServerUri")
|
||||
log.info("hdfsServerUri: {}", targetPath)
|
||||
|
||||
val skipUpdate = parser.get("skipUpdate")
|
||||
log.info("skipUpdate: {}", skipUpdate)
|
||||
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
|
@ -176,18 +179,17 @@ object SparkCreateBaselineDataFrame {
|
|||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
|
||||
}))
|
||||
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
}))
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
}
|
||||
|
||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||
exported_dataset
|
||||
|
|
|
@ -3,5 +3,6 @@
|
|||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
|
||||
]
|
|
@ -12,6 +12,11 @@
|
|||
<name>targetPath</name>
|
||||
<description>The target path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>skipUpdate</name>
|
||||
<value>false</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDataset"/>
|
||||
|
@ -42,6 +47,7 @@
|
|||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue