From 773e856550e7bebc74b35b2ef3dc0a67869c0938 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Sat, 24 Feb 2024 16:54:30 +0100 Subject: [PATCH] Revised procedure when converting json data into xml: - json object keys are renamed to be conformant to xml tag elements, special characters are substituted or removed - json string values are no longer post-processed as they are already escaped by the org.json.XML.toString method --- .../collection/plugin/rest/RestIterator.java | 8 +- .../collection/plugin/utils/JsonUtils.java | 165 +++++++++++------- .../plugin/rest/RestIteratorTest.java | 3 +- .../plugin/utils/JsonUtilsTest.java | 41 +++++ 4 files changed, 150 insertions(+), 67 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index e4bad2f8d6..1107bcf46e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -52,8 +52,6 @@ public class RestIterator implements Iterator { private final String BASIC = "basic"; - private final JsonUtils jsonUtils; - private final String baseUrl; private final String resumptionType; private final String resumptionParam; @@ -106,7 +104,6 @@ public class RestIterator implements Iterator { final String resultOutputFormat) { this.clientParams = clientParams; - this.jsonUtils = new JsonUtils(); this.baseUrl = baseUrl; this.resumptionType = resumptionType; this.resumptionParam = resumptionParam; @@ -126,6 +123,7 @@ public class RestIterator implements Iterator { } catch (Exception e) { throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); } + initQueue(); } @@ -190,7 +188,7 @@ public class RestIterator implements Iterator { String resultJson; String resultXml = ""; String nextQuery = ""; - String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; + String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + ">"; Node resultNode = null; NodeList nodeList = null; String qUrlArgument = ""; @@ -231,7 +229,7 @@ public class RestIterator implements Iterator { resultStream = theHttpInputStream; if ("json".equals(resultOutputFormat)) { resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8); - resultXml = jsonUtils.convertToXML(resultJson); + resultXml = JsonUtils.convertToXML(resultJson); resultStream = IOUtils.toInputStream(resultXml, UTF_8); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java index 15401e2230..962c55cfee 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java @@ -3,82 +3,125 @@ package eu.dnetlib.dhp.collection.plugin.utils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.json.JSONArray; +import org.json.JSONObject; public class JsonUtils { + public static final String XML_WRAP_TAG = "recordWrap"; + private static final String XML_HEADER = ""; + private static final String INVALID_XMLTAG_CHARS = "!\"#$%&'()*+,/;<=>?@[\\]^`{|}~,"; private static final Log log = LogFactory.getLog(JsonUtils.class); - public static final String wrapName = "recordWrap"; - /** - * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to '' + * cleanup in JSON-KeyName * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names * and work-around for the JSON to XML converting of org.json.XML-package. * - * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], - * - * @param jsonInput - * @return convertedJsonKeynameOutput + * @param input + * @return converted json object */ - public String syntaxConvertJsonKeyNames(String jsonInput) { - - log.trace("before convertJsonKeyNames: " + jsonInput); - // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) - // replace ' 's in JSON Namens with '_' - while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); + public static JSONObject cleanJsonObject(final JSONObject input) { + if (null == input) { + return null; } - // replace forward-slash (sign '/' ) in JSON Names with '_' - while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); + JSONObject result = new JSONObject(); + + for (String key : input.keySet()) { + Object value = input.opt(key); + if (value != null) { + result.put(cleanKey(key), cleanValue(value)); + } } - // replace '(' in JSON Names with '' - while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); - } - - // replace ')' in JSON Names with '' - while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); - } - - // add prefix of startNumbers in JSON Keynames with 'n_' - while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); - } - // add prefix of only numbers in JSON Keynames with 'm_' - while (jsonInput.matches(".*\"([0-9]+)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); - } - - // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' - while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); - } - - // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. - // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { - // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); - // } - - // replace '=' in JSON Keynames with '-' - while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { - jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); - } - - log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); - return jsonInput; + return result; } - public String convertToXML(final String jsonRecord) { - String resultXml = ""; - org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord)); - resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element - log.trace("before inputStream: " + resultXml); - resultXml = XmlCleaner.cleanAllEntities(resultXml); - log.trace("after cleaning: " + resultXml); - return resultXml; + private static Object cleanValue(Object object) { + if (object instanceof JSONObject) { + return cleanJsonObject((JSONObject) object); + } else if (object instanceof JSONArray) { + JSONArray array = (JSONArray) object; + JSONArray res = new JSONArray(); + + for (int i = array.length() - 1; i >= 0; i--) { + res.put(i, cleanValue(array.opt(i))); + } + return res; + } + + return object; + } + + private static String cleanKey(String key) { + if (key == null || key.isEmpty()) { + return key; + } + + // xml tag cannot begin with "-", ".", or a numeric digit. + switch (key.charAt(0)) { + case '-': + case '.': + key = "_" + key.substring(1); + break; + } + + if (Character.isDigit(key.charAt(0))) { + if (key.matches("^[0-9]+$")) { + // add prefix of only numbers in JSON Keynames with 'm_' + key = "m_" + key; + } else { + // add prefix of startNumbers in JSON Keynames with 'n_' + key = "n_" + key; + } + } + + StringBuilder res = new StringBuilder(key.length()); + for (int i = 0; i < key.length(); i++) { + char c = key.charAt(i); + + // sequence of whitespaces are rendered as a single '_' + if (Character.isWhitespace(c)) { + while (i + 1 < key.length() && Character.isWhitespace(key.charAt(i + 1))) { + i++; + } + res.append('_'); + } + // remove invalid chars for xml tags with the expception of '=' and '/' + else if (INVALID_XMLTAG_CHARS.indexOf(c) >= 0) { + switch (c) { + case '=': + res.append('-'); + break; + case '/': + res.append('_'); + break; + default: + break; + } + // nothing + } + // all other chars are kept + else { + res.append(c); + } + } + + return res.toString(); + } + + static public String convertToXML(final String jsonRecord) { + if (log.isTraceEnabled()) { + log.trace("input json: " + jsonRecord); + } + + JSONObject jsonObject = cleanJsonObject(new org.json.JSONObject(jsonRecord)); + String res = XML_HEADER + org.json.XML.toString(jsonObject, XML_WRAP_TAG); // wrap xml in single root element + + if (log.isTraceEnabled()) { + log.trace("outout xml: " + res); + } + return res; } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java index 906f69dc9e..e2d6ad3e7f 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -9,6 +9,7 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; /** @@ -37,7 +38,7 @@ public class RestIteratorTest { @Disabled @Test - public void test() { + public void test() throws CollectorException { HttpClientParams clientParams = new HttpClientParams(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java new file mode 100644 index 0000000000..5f340d6d85 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtilsTest.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.collection.plugin.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +class JsonUtilsTest { + + static private String wrapped(String xml) { + return "" + xml + ""; + } + + @Test + void keyStartWithDigit() { + assertEquals( + wrapped("null"), + JsonUtils.convertToXML("{\"100\" : {\"200v\" : null}}")); + } + + @Test + void keyStartWithSpecialchars() { + assertEquals( + wrapped("<_parent><_nest1><_nest2>null"), + JsonUtils.convertToXML("{\" parent\" : {\"-nest1\" : {\".nest2\" : null}}}")); + } + + @Test + void encodeArray() { + assertEquals( + wrapped("<_parent.child>1<_parent.child>2"), + JsonUtils.convertToXML("{\" parent.child\":[1, 2]}")); + } + + @Test + void arrayOfObjects() { + assertEquals( + wrapped("12"), + JsonUtils.convertToXML("{\"parent\": [{\"id\": 1}, {\"id\": 2}]}")); + } +}