From 3cd5590f3b64d0b847864e5df23460fbca67aaea Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 28 Feb 2024 15:14:18 +0100 Subject: [PATCH] When converting json to XML, remove characters that are not allowed in the XML 1.0 specs, as they will cause xpath failures even if escaped --- .../dhp/collection/plugin/utils/JsonUtils.java | 17 +++++++++++++++++ .../collection/plugin/utils/JsonUtilsTest.java | 7 +++++++ 2 files changed, 24 insertions(+) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java index 962c55cfe..1bdc8b138 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java @@ -49,6 +49,23 @@ public class JsonUtils { res.put(i, cleanValue(array.opt(i))); } return res; + } else if (object instanceof String) { + String value = (String) object; + + // XML 1.0 Allowed characters + // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + + return value + .codePoints() + .filter( + cp -> cp == 0x9 || cp == 0xA || cp == 0xD || (cp >= 0x20 && cp <= 0xD7FF) + || (cp >= 0xE000 && cp <= 0xFFFD) + || (cp >= 0x10000 && cp <= 0x10FFFF)) + .collect( + StringBuilder::new, + StringBuilder::appendCodePoint, + StringBuilder::append) + .toString(); } return object; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java index 5f340d6d8..59d24fdc2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java @@ -38,4 +38,11 @@ class JsonUtilsTest { wrapped("12"), JsonUtils.convertToXML("{\"parent\": [{\"id\": 1}, {\"id\": 2}]}")); } + + @Test + void removeControlCharacters() { + assertEquals( + wrapped("Test"), + JsonUtils.convertToXML("{\"100\" : {\"200v\" : \"\\u0000\\u000cTest\"}}")); + } }