[webcrawl] the blacklist is now in JSON and no longer in CSV after the normalization process

This commit is contained in:
Miriam Baglioni 2024-07-25 15:16:42 +02:00
parent fc60661ac5
commit 7cff281d3e
1 changed file with 3 additions and 4 deletions

View File

@ -105,7 +105,7 @@ public class RemoveRelationFromActionSet
Dataset<String> blackList = readBlackList(spark, blackListInputPath) Dataset<String> blackList = readBlackList(spark, blackListInputPath)
.map( .map(
(MapFunction<Row, String>) r -> IdentifierFactory (MapFunction<Row, String>) r -> IdentifierFactory
.idFromPid("50", "doi", ((String) r.getAs("DOI / PMID")).substring(16), true), .idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
Encoders.STRING()); Encoders.STRING());
// read the old actionset and get the relations in the payload // read the old actionset and get the relations in the payload
@ -151,9 +151,8 @@ public class RemoveRelationFromActionSet
return spark return spark
.read() .read()
.option("header", true) .json(inputPath)
.csv(inputPath) .select("doi");
.select("DOI / PMID");
} }
} }