forked from D-Net/dnet-hadoop
Revised the procedure for converting JSON data into XML:
- JSON object keys are renamed to conform to XML tag naming rules: special characters are substituted or removed
- JSON string values are no longer post-processed, as they are already escaped by the org.json.XML.toString method
This commit is contained in:
parent
3268570b2c
commit
773e856550
|
@ -52,8 +52,6 @@ public class RestIterator implements Iterator<String> {
|
||||||
|
|
||||||
private final String BASIC = "basic";
|
private final String BASIC = "basic";
|
||||||
|
|
||||||
private final JsonUtils jsonUtils;
|
|
||||||
|
|
||||||
private final String baseUrl;
|
private final String baseUrl;
|
||||||
private final String resumptionType;
|
private final String resumptionType;
|
||||||
private final String resumptionParam;
|
private final String resumptionParam;
|
||||||
|
@ -106,7 +104,6 @@ public class RestIterator implements Iterator<String> {
|
||||||
final String resultOutputFormat) {
|
final String resultOutputFormat) {
|
||||||
|
|
||||||
this.clientParams = clientParams;
|
this.clientParams = clientParams;
|
||||||
this.jsonUtils = new JsonUtils();
|
|
||||||
this.baseUrl = baseUrl;
|
this.baseUrl = baseUrl;
|
||||||
this.resumptionType = resumptionType;
|
this.resumptionType = resumptionType;
|
||||||
this.resumptionParam = resumptionParam;
|
this.resumptionParam = resumptionParam;
|
||||||
|
@ -126,6 +123,7 @@ public class RestIterator implements Iterator<String> {
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
initQueue();
|
initQueue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -190,7 +188,7 @@ public class RestIterator implements Iterator<String> {
|
||||||
String resultJson;
|
String resultJson;
|
||||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||||
String nextQuery = "";
|
String nextQuery = "";
|
||||||
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">";
|
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||||
Node resultNode = null;
|
Node resultNode = null;
|
||||||
NodeList nodeList = null;
|
NodeList nodeList = null;
|
||||||
String qUrlArgument = "";
|
String qUrlArgument = "";
|
||||||
|
@ -231,7 +229,7 @@ public class RestIterator implements Iterator<String> {
|
||||||
resultStream = theHttpInputStream;
|
resultStream = theHttpInputStream;
|
||||||
if ("json".equals(resultOutputFormat)) {
|
if ("json".equals(resultOutputFormat)) {
|
||||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
||||||
resultXml = jsonUtils.convertToXML(resultJson);
|
resultXml = JsonUtils.convertToXML(resultJson);
|
||||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,82 +3,125 @@ package eu.dnetlib.dhp.collection.plugin.utils;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
|
||||||
public class JsonUtils {
|
public class JsonUtils {
|
||||||
|
public static final String XML_WRAP_TAG = "recordWrap";
|
||||||
|
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||||
|
private static final String INVALID_XMLTAG_CHARS = "!\"#$%&'()*+,/;<=>?@[\\]^`{|}~,";
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(JsonUtils.class);
|
private static final Log log = LogFactory.getLog(JsonUtils.class);
|
||||||
|
|
||||||
public static final String wrapName = "recordWrap";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
|
* clean up the JSON key names:
|
||||||
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
|
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
|
||||||
* and work-around for the JSON to XML converting of org.json.XML-package.
|
* and work-around for the JSON to XML converting of org.json.XML-package.
|
||||||
*
|
*
|
||||||
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
|
* @param input
|
||||||
*
|
* @return converted json object
|
||||||
* @param jsonInput
|
|
||||||
* @return convertedJsonKeynameOutput
|
|
||||||
*/
|
*/
|
||||||
public String syntaxConvertJsonKeyNames(String jsonInput) {
|
public static JSONObject cleanJsonObject(final JSONObject input) {
|
||||||
|
if (null == input) {
|
||||||
log.trace("before convertJsonKeyNames: " + jsonInput);
|
return null;
|
||||||
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
|
|
||||||
// replace ' 's in JSON Namens with '_'
|
|
||||||
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// replace forward-slash (sign '/' ) in JSON Names with '_'
|
JSONObject result = new JSONObject();
|
||||||
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
|
for (String key : input.keySet()) {
|
||||||
|
Object value = input.opt(key);
|
||||||
|
if (value != null) {
|
||||||
|
result.put(cleanKey(key), cleanValue(value));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// replace '(' in JSON Names with ''
|
return result;
|
||||||
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
// replace ')' in JSON Names with ''
|
|
||||||
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
// add prefix of startNumbers in JSON Keynames with 'n_'
|
|
||||||
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
|
|
||||||
}
|
|
||||||
// add prefix of only numbers in JSON Keynames with 'm_'
|
|
||||||
while (jsonInput.matches(".*\"([0-9]+)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
|
|
||||||
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
|
|
||||||
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
|
|
||||||
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
|
|
||||||
// }
|
|
||||||
|
|
||||||
// replace '=' in JSON Keynames with '-'
|
|
||||||
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
|
|
||||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
|
|
||||||
}
|
|
||||||
|
|
||||||
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
|
|
||||||
return jsonInput;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String convertToXML(final String jsonRecord) {
|
private static Object cleanValue(Object object) {
|
||||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
if (object instanceof JSONObject) {
|
||||||
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
|
return cleanJsonObject((JSONObject) object);
|
||||||
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
|
} else if (object instanceof JSONArray) {
|
||||||
log.trace("before inputStream: " + resultXml);
|
JSONArray array = (JSONArray) object;
|
||||||
resultXml = XmlCleaner.cleanAllEntities(resultXml);
|
JSONArray res = new JSONArray();
|
||||||
log.trace("after cleaning: " + resultXml);
|
|
||||||
return resultXml;
|
for (int i = array.length() - 1; i >= 0; i--) {
|
||||||
|
res.put(i, cleanValue(array.opt(i)));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
return object;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String cleanKey(String key) {
|
||||||
|
if (key == null || key.isEmpty()) {
|
||||||
|
return key;
|
||||||
|
}
|
||||||
|
|
||||||
|
// xml tag cannot begin with "-", ".", or a numeric digit.
|
||||||
|
switch (key.charAt(0)) {
|
||||||
|
case '-':
|
||||||
|
case '.':
|
||||||
|
key = "_" + key.substring(1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Character.isDigit(key.charAt(0))) {
|
||||||
|
if (key.matches("^[0-9]+$")) {
|
||||||
|
// keys made only of digits are prefixed with 'm_'
|
||||||
|
key = "m_" + key;
|
||||||
|
} else {
|
||||||
|
// other keys starting with a digit are prefixed with 'n_'
|
||||||
|
key = "n_" + key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder res = new StringBuilder(key.length());
|
||||||
|
for (int i = 0; i < key.length(); i++) {
|
||||||
|
char c = key.charAt(i);
|
||||||
|
|
||||||
|
// a sequence of whitespace characters is rendered as a single '_'
|
||||||
|
if (Character.isWhitespace(c)) {
|
||||||
|
while (i + 1 < key.length() && Character.isWhitespace(key.charAt(i + 1))) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
res.append('_');
|
||||||
|
}
|
||||||
|
// remove invalid chars for xml tags, with the exception of '=' and '/', which are replaced by '-' and '_'
|
||||||
|
else if (INVALID_XMLTAG_CHARS.indexOf(c) >= 0) {
|
||||||
|
switch (c) {
|
||||||
|
case '=':
|
||||||
|
res.append('-');
|
||||||
|
break;
|
||||||
|
case '/':
|
||||||
|
res.append('_');
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// nothing
|
||||||
|
}
|
||||||
|
// all other chars are kept
|
||||||
|
else {
|
||||||
|
res.append(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return res.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
static public String convertToXML(final String jsonRecord) {
|
||||||
|
if (log.isTraceEnabled()) {
|
||||||
|
log.trace("input json: " + jsonRecord);
|
||||||
|
}
|
||||||
|
|
||||||
|
JSONObject jsonObject = cleanJsonObject(new org.json.JSONObject(jsonRecord));
|
||||||
|
String res = XML_HEADER + org.json.XML.toString(jsonObject, XML_WRAP_TAG); // wrap xml in single root element
|
||||||
|
|
||||||
|
if (log.isTraceEnabled()) {
|
||||||
|
log.trace("outout xml: " + res);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,6 +9,7 @@ import org.junit.jupiter.api.Test;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -37,7 +38,7 @@ public class RestIteratorTest {
|
||||||
|
|
||||||
@Disabled
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() throws CollectorException {
|
||||||
|
|
||||||
HttpClientParams clientParams = new HttpClientParams();
|
HttpClientParams clientParams = new HttpClientParams();
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
class JsonUtilsTest {
|
||||||
|
|
||||||
|
static private String wrapped(String xml) {
|
||||||
|
return "<?xml version=\"1.0\" encoding=\"UTF-8\"?><recordWrap>" + xml + "</recordWrap>";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void keyStartWithDigit() {
|
||||||
|
assertEquals(
|
||||||
|
wrapped("<m_100><n_200v>null</n_200v></m_100>"),
|
||||||
|
JsonUtils.convertToXML("{\"100\" : {\"200v\" : null}}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void keyStartWithSpecialchars() {
|
||||||
|
assertEquals(
|
||||||
|
wrapped("<_parent><_nest1><_nest2>null</_nest2></_nest1></_parent>"),
|
||||||
|
JsonUtils.convertToXML("{\" parent\" : {\"-nest1\" : {\".nest2\" : null}}}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void encodeArray() {
|
||||||
|
assertEquals(
|
||||||
|
wrapped("<_parent.child>1</_parent.child><_parent.child>2</_parent.child>"),
|
||||||
|
JsonUtils.convertToXML("{\" parent.child\":[1, 2]}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void arrayOfObjects() {
|
||||||
|
assertEquals(
|
||||||
|
wrapped("<parent><id>1</id></parent><parent><id>2</id></parent>"),
|
||||||
|
JsonUtils.convertToXML("{\"parent\": [{\"id\": 1}, {\"id\": 2}]}"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue