[stats wf] indicators across stats dbs & updates in the org ids #248

Closed
dimitris.pierrakos wants to merge 1742 commits from beta into beta2master_sept_2022
4 changed files with 150 additions and 67 deletions
Showing only changes of commit 56dd05f85c - Show all commits

View File

@ -52,8 +52,6 @@ public class RestIterator implements Iterator<String> {
private final String BASIC = "basic"; private final String BASIC = "basic";
private final JsonUtils jsonUtils;
private final String baseUrl; private final String baseUrl;
private final String resumptionType; private final String resumptionType;
private final String resumptionParam; private final String resumptionParam;
@ -106,7 +104,6 @@ public class RestIterator implements Iterator<String> {
final String resultOutputFormat) { final String resultOutputFormat) {
this.clientParams = clientParams; this.clientParams = clientParams;
this.jsonUtils = new JsonUtils();
this.baseUrl = baseUrl; this.baseUrl = baseUrl;
this.resumptionType = resumptionType; this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam; this.resumptionParam = resumptionParam;
@ -126,6 +123,7 @@ public class RestIterator implements Iterator<String> {
} catch (Exception e) { } catch (Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
} }
initQueue(); initQueue();
} }
@ -190,7 +188,7 @@ public class RestIterator implements Iterator<String> {
String resultJson; String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = ""; String nextQuery = "";
String emptyXml = resultXml + "<" + JsonUtils.wrapName + "></" + JsonUtils.wrapName + ">"; String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null; Node resultNode = null;
NodeList nodeList = null; NodeList nodeList = null;
String qUrlArgument = ""; String qUrlArgument = "";
@ -231,7 +229,7 @@ public class RestIterator implements Iterator<String> {
resultStream = theHttpInputStream; resultStream = theHttpInputStream;
if ("json".equals(resultOutputFormat)) { if ("json".equals(resultOutputFormat)) {
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8); resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = jsonUtils.convertToXML(resultJson); resultXml = JsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8); resultStream = IOUtils.toInputStream(resultXml, UTF_8);
} }

View File

@ -3,82 +3,125 @@ package eu.dnetlib.dhp.collection.plugin.utils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
import org.json.JSONObject;
public class JsonUtils { public class JsonUtils {
public static final String XML_WRAP_TAG = "recordWrap";
private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
private static final String INVALID_XMLTAG_CHARS = "!\"#$%&'()*+,/;<=>?@[\\]^`{|}~,";
private static final Log log = LogFactory.getLog(JsonUtils.class); private static final Log log = LogFactory.getLog(JsonUtils.class);
public static final String wrapName = "recordWrap";
/** /**
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to '' * cleanup in JSON-KeyName
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
* and work-around for the JSON to XML converting of org.json.XML-package. * and work-around for the JSON to XML converting of org.json.XML-package.
* *
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], * @param input
* * @return converted json object
* @param jsonInput
* @return convertedJsonKeynameOutput
*/ */
public String syntaxConvertJsonKeyNames(String jsonInput) { public static JSONObject cleanJsonObject(final JSONObject input) {
if (null == input) {
log.trace("before convertJsonKeyNames: " + jsonInput); return null;
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
// replace ' 's in JSON Namens with '_'
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
} }
// replace forward-slash (sign '/' ) in JSON Names with '_' JSONObject result = new JSONObject();
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); for (String key : input.keySet()) {
Object value = input.opt(key);
if (value != null) {
result.put(cleanKey(key), cleanValue(value));
}
} }
// replace '(' in JSON Names with '' return result;
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
} }
// replace ')' in JSON Names with '' private static Object cleanValue(Object object) {
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { if (object instanceof JSONObject) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); return cleanJsonObject((JSONObject) object);
} else if (object instanceof JSONArray) {
JSONArray array = (JSONArray) object;
JSONArray res = new JSONArray();
for (int i = array.length() - 1; i >= 0; i--) {
res.put(i, cleanValue(array.opt(i)));
}
return res;
} }
// add prefix of startNumbers in JSON Keynames with 'n_' return object;
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
} }
private static String cleanKey(String key) {
if (key == null || key.isEmpty()) {
return key;
}
// xml tag cannot begin with "-", ".", or a numeric digit.
switch (key.charAt(0)) {
case '-':
case '.':
key = "_" + key.substring(1);
break;
}
if (Character.isDigit(key.charAt(0))) {
if (key.matches("^[0-9]+$")) {
// add prefix of only numbers in JSON Keynames with 'm_' // add prefix of only numbers in JSON Keynames with 'm_'
while (jsonInput.matches(".*\"([0-9]+)\":.*")) { key = "m_" + key;
jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); } else {
// add prefix of startNumbers in JSON Keynames with 'n_'
key = "n_" + key;
}
} }
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' StringBuilder res = new StringBuilder(key.length());
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { for (int i = 0; i < key.length(); i++) {
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); char c = key.charAt(i);
// sequence of whitespaces are rendered as a single '_'
if (Character.isWhitespace(c)) {
while (i + 1 < key.length() && Character.isWhitespace(key.charAt(i + 1))) {
i++;
}
res.append('_');
}
// remove invalid chars for xml tags with the expception of '=' and '/'
else if (INVALID_XMLTAG_CHARS.indexOf(c) >= 0) {
switch (c) {
case '=':
res.append('-');
break;
case '/':
res.append('_');
break;
default:
break;
}
// nothing
}
// all other chars are kept
else {
res.append(c);
}
} }
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. return res.toString();
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
// }
// replace '=' in JSON Keynames with '-'
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
} }
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); static public String convertToXML(final String jsonRecord) {
return jsonInput; if (log.isTraceEnabled()) {
log.trace("input json: " + jsonRecord);
} }
public String convertToXML(final String jsonRecord) { JSONObject jsonObject = cleanJsonObject(new org.json.JSONObject(jsonRecord));
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; String res = XML_HEADER + org.json.XML.toString(jsonObject, XML_WRAP_TAG); // wrap xml in single root element
org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord));
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element if (log.isTraceEnabled()) {
log.trace("before inputStream: " + resultXml); log.trace("outout xml: " + res);
resultXml = XmlCleaner.cleanAllEntities(resultXml); }
log.trace("after cleaning: " + resultXml); return res;
return resultXml;
} }
} }

View File

@ -9,6 +9,7 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
/** /**
@ -37,7 +38,7 @@ public class RestIteratorTest {
@Disabled @Disabled
@Test @Test
public void test() { public void test() throws CollectorException {
HttpClientParams clientParams = new HttpClientParams(); HttpClientParams clientParams = new HttpClientParams();

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.collection.plugin.utils;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
class JsonUtilsTest {
static private String wrapped(String xml) {
return "<?xml version=\"1.0\" encoding=\"UTF-8\"?><recordWrap>" + xml + "</recordWrap>";
}
@Test
void keyStartWithDigit() {
assertEquals(
wrapped("<m_100><n_200v>null</n_200v></m_100>"),
JsonUtils.convertToXML("{\"100\" : {\"200v\" : null}}"));
}
@Test
void keyStartWithSpecialchars() {
assertEquals(
wrapped("<_parent><_nest1><_nest2>null</_nest2></_nest1></_parent>"),
JsonUtils.convertToXML("{\" parent\" : {\"-nest1\" : {\".nest2\" : null}}}"));
}
@Test
void encodeArray() {
assertEquals(
wrapped("<_parent.child>1</_parent.child><_parent.child>2</_parent.child>"),
JsonUtils.convertToXML("{\" parent.child\":[1, 2]}"));
}
@Test
void arrayOfObjects() {
assertEquals(
wrapped("<parent><id>1</id></parent><parent><id>2</id></parent>"),
JsonUtils.convertToXML("{\"parent\": [{\"id\": 1}, {\"id\": 2}]}"));
}
}