1
0
Fork 0

[BipFinder] Fixed issue with wrongly escaped characters in DOI

This commit is contained in:
Miriam Baglioni 2022-10-03 12:42:52 +02:00
parent 89f7007080
commit 188f25eefa
3 changed files with 10 additions and 4 deletions

View File

@ -11,6 +11,7 @@ import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.fasterxml.jackson.core.JsonParser;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -40,7 +41,6 @@ import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareBipFinder implements Serializable { public class PrepareBipFinder implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class); private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -82,9 +82,11 @@ public class PrepareBipFinder implements Serializable {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
ObjectMapper mapper = new ObjectMapper()
.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true);
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(inputPath) .textFile(inputPath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); .map(item -> mapper.readValue(item, BipDeserialize.class));
spark spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {

View File

@ -88,7 +88,7 @@ public class PrepareTest {
.textFile(workingDir.toString() + "/work/bip") .textFile(workingDir.toString() + "/work/bip")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class)); .map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(86, tmp.count()); Assertions.assertEquals(87, tmp.count());
String doi1 = "unresolved::10.0000/096020199389707::doi"; String doi1 = "unresolved::10.0000/096020199389707::doi";
@ -151,6 +151,9 @@ public class PrepareTest {
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count());
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size()); Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size());
tmp.filter(r -> r.getId().startsWith("unresolved::10.2111/1551-5028(2004)057"))
.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
} }
@Test @Test

View File

@ -83,4 +83,5 @@
{"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} {"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.2111/1551-5028(2004)057\[0539:sdsocg\]2.0.co;2": [{"id":"influence", "unit":[{"key":"score","value":"6.3290875E-9"},{"key":"class","value":"C"}]}, {"id":"popularity", "unit":[{"key":"score","value":"6.576763E-9"},{"key":"class","value":"C"}]}, {"id":"influence_alt", "unit":[{"key":"score","value":"11"},{"key":"class","value":"C"}]}, {"id":"popularity_alt", "unit":[{"key":"score","value":"1.0142108"},{"key":"class","value":"C"}]}, {"id":"impulse", "unit":[{"key":"score","value":"1"},{"key":"class","value":"C"}]}]}