minor improvements in the aggregation of PubMed baseline

Claudio Atzori 2024-08-07 14:22:30 +02:00
parent b8bc237079
commit c06ef1d68f
2 changed files with 12 additions and 2 deletions


@@ -60,8 +60,9 @@ object SparkCreateBaselineDataFrame {
.setSocketTimeout(timeout * 1000)
.build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
println(s"Downloading ${url}")
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
println(s"got response with status: ${response.getStatusLine.getStatusCode}")
response.getEntity.getContent
}
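
For context, a minimal sketch of the download helper these lines belong to, assuming the method name downloadBaselinePart used later in the diff, a String url parameter and a timeout expressed in seconds; only the HttpClient configuration and the two println calls are taken from the commit itself.

import java.io.InputStream
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder

// Sketch only: the signature and the default timeout are assumptions, not part of the commit.
def downloadBaselinePart(url: String, timeout: Int = 60): InputStream = {
  val r = new HttpGet(url)
  val config = RequestConfig
    .custom()
    .setSocketTimeout(timeout * 1000)
    .build()
  val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
  println(s"Downloading $url")
  val response = client.execute(r)
  println(s"got response with status: ${response.getStatusLine.getStatusCode}")
  // the caller consumes the PubMed baseline file from this stream
  response.getEntity.getContent
}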
@@ -125,7 +126,7 @@ object SparkCreateBaselineDataFrame {
val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
val i = downloadBaselinePart(u._2)
IOUtils.copy(i, fsDataOutputStream)
println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
println(s"Saved file ${u._2} in path $baselinePath/${u._1}")
fsDataOutputStream.close()
}
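
The hunk above copies one downloaded baseline archive into HDFS. A rough, self-contained sketch of that step under stated assumptions: u is a (fileName, url) tuple and the FileSystem comes from the default Hadoop configuration; only the create/copy/println/close sequence mirrors the diff.

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}

// Sketch only: the tuple shape and the FileSystem lookup are assumptions.
def saveBaselinePart(baselinePath: String, u: (String, String)): Unit = {
  val fs = FileSystem.get(new Configuration())
  val hdfsWritePath = new Path(s"$baselinePath/${u._1}")
  val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
  val i = downloadBaselinePart(u._2) // helper from the previous hunk
  IOUtils.copy(i, fsDataOutputStream)
  println(s"Saved file ${u._2} in path $baselinePath/${u._1}")
  i.close()
  fsDataOutputStream.close()
}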
@@ -217,6 +218,8 @@ object SparkCreateBaselineDataFrame {
.save(s"$workingPath/baseline_dataset")
}
log.info(s"saved dataset: '$workingPath/baseline_dataset'")
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
CollectionUtils.saveDataset(
exported_dataset

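The last hunk in this file reloads the just-written baseline_dataset before handing it to CollectionUtils.saveDataset; the remainder of that call is cut off in the diff and is not reconstructed here. As a hedged note, the .as[PMArticle] conversion needs an implicit Encoder in scope; the Kryo encoder below is an assumption about the surrounding code, not part of the commit.

import org.apache.spark.sql.{Encoder, Encoders}

// Assumption: a Kryo encoder for PMArticle is what makes .as[PMArticle] compile here.
implicit val pmArticleEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]

Reading the dataset back from disk, rather than chaining on the in-memory Dataset, plausibly decouples the export step from the parsing and saving step, though the commit message does not state this explicitly.
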

@@ -309,6 +309,13 @@ object PubMedToOaf {
} else
result.setOriginalId(pidList.map(s => s.getValue).asJava)
if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid != null) {
result.getOriginalId.add(IdentifierFactory.idFromPid("50", PidType.doi.toString, normalizedPid, true))
}
}
result.setId(article.getPmid)
// END RESULT MAPPING
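
The second file registers a normalised DOI as an additional original identifier via IdentifierFactory.idFromPid; the "50" prefix, PidType.doi and the trailing boolean are taken verbatim from the diff. The cleanDoi helper itself is not shown in this commit; the sketch below is purely hypothetical and only illustrates the kind of normalisation such a helper typically performs.

// Hypothetical sketch of a DOI-normalisation helper in the spirit of cleanDoi:
// trims and lower-cases the value, strips common resolver prefixes, and returns
// null when the remainder does not look like a DOI (so the caller can skip it).
def cleanDoi(doi: String): String = {
  if (doi == null) null
  else {
    val normalized = doi.trim.toLowerCase
      .replaceAll("^https?://(dx\\.)?doi\\.org/", "")
      .replaceAll("^doi:", "")
    if (normalized.startsWith("10.")) normalized else null
  }
}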