[webcrawl] the blacklist is now in JSON and no longer in CSV after the normalization process
This commit is contained in:
parent
fc60661ac5
commit
7cff281d3e
|
@ -105,7 +105,7 @@ public class RemoveRelationFromActionSet
|
||||||
Dataset<String> blackList = readBlackList(spark, blackListInputPath)
|
Dataset<String> blackList = readBlackList(spark, blackListInputPath)
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Row, String>) r -> IdentifierFactory
|
(MapFunction<Row, String>) r -> IdentifierFactory
|
||||||
.idFromPid("50", "doi", ((String) r.getAs("DOI / PMID")).substring(16), true),
|
.idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
|
||||||
Encoders.STRING());
|
Encoders.STRING());
|
||||||
|
|
||||||
// read the old actionset and get the relations in the payload
|
// read the old actionset and get the relations in the payload
|
||||||
|
@ -151,9 +151,8 @@ public class RemoveRelationFromActionSet
|
||||||
|
|
||||||
return spark
|
return spark
|
||||||
.read()
|
.read()
|
||||||
.option("header", true)
|
.json(inputPath)
|
||||||
.csv(inputPath)
|
.select("doi");
|
||||||
.select("DOI / PMID");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue