From c7f6669f1a8fc9596c76b242f041249c4c778b2c Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 25 Jul 2024 15:20:18 +0200 Subject: [PATCH] [webcrawl] the blacklist is now in json and no more in csv after the normalization process --- .../actionmanager/webcrawl/CreateActionSetFromWebEntries.java | 3 +-- .../dhp/actionmanager/webcrawl/blackListRemove/not_irish.csv | 2 -- .../dhp/actionmanager/webcrawl/blackListRemove/not_irish.json | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.csv create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.json diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index 7607cfc76c..b5aed6ea2c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -148,8 +148,7 @@ public class CreateActionSetFromWebEntries implements Serializable { return spark .read() - .option("header", true) - .csv(inputPath) + .json(inputPath) .select("OpenAlexId"); } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.csv b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.csv deleted file mode 100644 index 009925839d..0000000000 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.csv +++ /dev/null @@ -1,2 +0,0 @@ -DOI / PMID,OpenAlexId,Comments, -https://doi.org/10.1098/rstl.1684.0023,https://openalex.org/W2124362779,, \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.json new file mode 100644 index 0000000000..2c470c5550 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/not_irish.json @@ -0,0 +1 @@ +{"doi":"https://doi.org/10.1098/rstl.1684.0023","OpenAlexId":"https://openalex.org/W2124362779"} \ No newline at end of file