[webcrawl] the blacklist is now in JSON and no longer in CSV after the normalization process
This commit is contained in:
parent
fc60661ac5
commit
7cff281d3e
|
@ -105,7 +105,7 @@ public class RemoveRelationFromActionSet
|
|||
Dataset<String> blackList = readBlackList(spark, blackListInputPath)
|
||||
.map(
|
||||
(MapFunction<Row, String>) r -> IdentifierFactory
|
||||
.idFromPid("50", "doi", ((String) r.getAs("DOI / PMID")).substring(16), true),
|
||||
.idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
|
||||
Encoders.STRING());
|
||||
|
||||
// read the old actionset and get the relations in the payload
|
||||
|
@ -151,9 +151,8 @@ public class RemoveRelationFromActionSet
|
|||
|
||||
return spark
|
||||
.read()
|
||||
.option("header", true)
|
||||
.csv(inputPath)
|
||||
.select("DOI / PMID");
|
||||
.json(inputPath)
|
||||
.select("doi");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue