From c06ef1d68f6b2c1746c51a2e886a607d85812967 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 7 Aug 2024 14:22:30 +0200 Subject: [PATCH] minor improvements in the aggregation of PubMed baseline --- .../dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala | 7 +++++-- .../scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 639918151..cb34e1eae 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -60,8 +60,9 @@ object SparkCreateBaselineDataFrame { .setSocketTimeout(timeout * 1000) .build() val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() + println(s"Downloading ${url}") val response = client.execute(r) - println(s"get response with status${response.getStatusLine.getStatusCode}") + println(s"got response with status: ${response.getStatusLine.getStatusCode}") response.getEntity.getContent } @@ -125,7 +126,7 @@ object SparkCreateBaselineDataFrame { val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true) val i = downloadBaselinePart(u._2) IOUtils.copy(i, fsDataOutputStream) - println(s"Downloaded ${u._2} into $baselinePath/${u._1}") + println(s"Saved file ${u._2} in path $baselinePath/${u._1}") fsDataOutputStream.close() } @@ -217,6 +218,8 @@ object SparkCreateBaselineDataFrame { .save(s"$workingPath/baseline_dataset") } + log.info(s"saved dataset: '$workingPath/baseline_dataset'") + val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] CollectionUtils.saveDataset( exported_dataset diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index d59d73bd0..d6726ef84 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -309,6 +309,13 @@ object PubMedToOaf { } else result.setOriginalId(pidList.map(s => s.getValue).asJava) + if (article.getDoi != null) { + val normalizedPid = cleanDoi(article.getDoi) + if (normalizedPid != null) { + result.getOriginalId.add(IdentifierFactory.idFromPid("50", PidType.doi.toString, normalizedPid, true)) + } + } + result.setId(article.getPmid) // END RESULT MAPPING