Minor improvements in the aggregation of the PubMed baseline
This commit is contained in:
parent
b8bc237079
commit
c06ef1d68f
|
@ -60,8 +60,9 @@ object SparkCreateBaselineDataFrame {
|
|||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
println(s"Downloading ${url}")
|
||||
val response = client.execute(r)
|
||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
println(s"got response with status: ${response.getStatusLine.getStatusCode}")
|
||||
response.getEntity.getContent
|
||||
|
||||
}
|
||||
|
@ -125,7 +126,7 @@ object SparkCreateBaselineDataFrame {
|
|||
val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
|
||||
val i = downloadBaselinePart(u._2)
|
||||
IOUtils.copy(i, fsDataOutputStream)
|
||||
println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
|
||||
println(s"Saved file ${u._2} in path $baselinePath/${u._1}")
|
||||
fsDataOutputStream.close()
|
||||
}
|
||||
|
||||
|
@ -217,6 +218,8 @@ object SparkCreateBaselineDataFrame {
|
|||
.save(s"$workingPath/baseline_dataset")
|
||||
}
|
||||
|
||||
log.info(s"saved dataset: '$workingPath/baseline_dataset'")
|
||||
|
||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||
CollectionUtils.saveDataset(
|
||||
exported_dataset
|
||||
|
|
|
@ -309,6 +309,13 @@ object PubMedToOaf {
|
|||
} else
|
||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||
|
||||
if (article.getDoi != null) {
|
||||
val normalizedPid = cleanDoi(article.getDoi)
|
||||
if (normalizedPid != null) {
|
||||
result.getOriginalId.add(IdentifierFactory.idFromPid("50", PidType.doi.toString, normalizedPid, true))
|
||||
}
|
||||
}
|
||||
|
||||
result.setId(article.getPmid)
|
||||
|
||||
// END RESULT MAPPING
|
||||
|
|
Loading…
Reference in New Issue