2019-10-24 11:36:59 +02:00
|
|
|
package eu.dnetlib.dhp.collection.worker.utils;
|
2019-04-03 16:05:16 +02:00
|
|
|
|
|
|
|
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
|
|
2020-04-18 12:42:58 +02:00
|
|
|
/**
 * Repairs text so that an XML 1.0 parser will accept its characters and entity references.
 *
 * <p>The cleaner removes characters that XML 1.0 forbids, resolves HTML named entities that XML
 * does not define, drops references to illegal characters and escapes stray ampersands.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

  /**
   * A numeric character reference. Group 1 captures hexadecimal digits ({@code &#xE9;}), group 2
   * captures decimal digits ({@code &#233;}). The digit counts are capped so that the parsed
   * value always fits in an {@code int}.
   */
  private static final Pattern NUMERIC_REFERENCE =
      Pattern.compile("&#(?:x([0-9a-fA-F]{1,6})|([0-9]{1,7}));");

  /**
   * Matches every code point outside the XML 1.0 {@code Char} production, see
   * https://www.w3.org/TR/REC-xml/#charsets . Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] |
   * [#xE000-#xFFFD] | [#x10000-#x10FFFF]. The regex engine works on code points, so a well-formed
   * surrogate pair is kept while an unpaired surrogate is matched (and therefore removed).
   */
  private static final Pattern INVALID_CHARACTER =
      Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]");

  /** HTML names of the Latin-1 characters U+00A0 to U+00FF, in code point order. */
  private static final String[] LATIN1_ENTITY_NAMES = {
    "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
    "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
    "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
    "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
    "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
    "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
    "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
    "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
    "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
    "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
    "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
    "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
  };

  /** Entity references that XML predefines; they are passed through untouched. */
  private static final Set<String> GOOD_ENTITIES = new HashSet<>();

  /** Entity references that XML does not know, mapped to the text that replaces them. */
  private static final Map<String, String> BAD_ENTITIES = new HashMap<>();

  static {
    // pre-defined XML entities
    GOOD_ENTITIES.add("&quot;"); // quotation mark
    GOOD_ENTITIES.add("&amp;"); // ampersand
    GOOD_ENTITIES.add("&lt;"); // less-than sign
    GOOD_ENTITIES.add("&gt;"); // greater-than sign
    GOOD_ENTITIES.add("&apos;"); // apostrophe

    // control entities &#127; .. &#159; (illegal HTML characters), replaced by a blank.
    // NOTE(review): numeric references are resolved in cleanAllEntities before this table is
    // consulted and these code points are legal XML characters, so the entries never fire and
    // such references are kept verbatim (as before). Confirm whether they should become a blank.
    for (int code = 127; code <= 159; code++) {
      BAD_ENTITIES.put("&#" + code + ";", " ");
    }

    // misc entities
    BAD_ENTITIES.put("&euro;", "\u20AC"); // euro
    BAD_ENTITIES.put("&lsquo;", "\u2018"); // left single quotation mark
    BAD_ENTITIES.put("&rsquo;", "\u2019"); // right single quotation mark

    // Latin 1 entities: the name at index k stands for the character U+00A0 + k
    for (int k = 0; k < LATIN1_ENTITY_NAMES.length; k++) {
      BAD_ENTITIES.put("&" + LATIN1_ENTITY_NAMES[k] + ";", String.valueOf((char) (0xA0 + k)));
    }
  }

  /**
   * Cleans the characters, entity references and stray ampersands of a string for use as XML.
   *
   * <ul>
   *   <li>Characters that XML 1.0 forbids are removed.
   *   <li>Numeric character references (decimal or hexadecimal) are kept when they denote a legal
   *       XML character and removed otherwise.
   *   <li>The predefined entities {@code &amp;}, {@code &lt;}, {@code &gt;}, {@code &quot;} and
   *       {@code &apos;} are kept.
   *   <li>Known HTML entities are replaced by their unicode equivalent; unknown entities are
   *       removed.
   *   <li>A bare {@code &} is escaped as {@code &amp;}.
   *   <li>Doubled angle brackets ({@code <<} and {@code >>}) are escaped.
   * </ul>
   *
   * @param broken the string to handle entities
   * @return the string with entities appropriately fixed up, or {@code null} if the input was
   *     {@code null}
   */
  public static String cleanAllEntities(final String broken) {
    if (broken == null) {
      return null;
    }

    final String text = INVALID_CHARACTER.matcher(broken).replaceAll("");
    final int length = text.length();
    final StringBuilder cleaned = new StringBuilder(length + 16);
    final Matcher numeric = NUMERIC_REFERENCE.matcher(text);

    int pos = 0;
    while (pos < length) {
      final int amp = text.indexOf('&', pos);
      // If there are no more amps then copy the tail and finish
      if (amp < 0) {
        cleaned.append(text, pos, length);
        break;
      }
      cleaned.append(text, pos, amp);

      // References of the kind &#ddd; or &#xhh;
      numeric.region(amp, length);
      if (numeric.lookingAt()) {
        if (isValidXmlChar(referencedCodePoint(numeric))) {
          cleaned.append(text, amp, numeric.end());
        }
        pos = numeric.end();
        continue;
      }

      // Named entity candidate: letters or digits terminated by ';'
      int end = amp + 1;
      while (end < length && Character.isLetterOrDigit(text.charAt(end))) {
        end++;
      }
      if (end < length && text.charAt(end) == ';') {
        cleaned.append(handleEntity(text.substring(amp, end + 1)));
        pos = end + 1;
      } else {
        // No closing ';' was found, so this is a bare ampersand. Scanning resumes right after
        // it so that a directly following '&' is examined as well.
        cleaned.append("&amp;");
        pos = amp + 1;
      }
    }

    return cleaned.toString().replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
  }

  /**
   * Decides what an entity reference of the form {@code &name;} becomes.
   *
   * @param entity the entity to be replaced, including the leading ampersand and the trailing
   *     semicolon
   * @return the entity itself if XML predefines it, its unicode equivalent if it is a known HTML
   *     entity, or an empty string otherwise
   */
  private static String handleEntity(final String entity) {
    if (GOOD_ENTITIES.contains(entity)) {
      return entity;
    }
    final String replacement = BAD_ENTITIES.get(entity);
    return replacement != null ? replacement : "";
  }

  /** Parses the code point named by a successful {@link #NUMERIC_REFERENCE} match. */
  private static int referencedCodePoint(final Matcher match) {
    final String hexDigits = match.group(1);
    return hexDigits != null
        ? Integer.parseInt(hexDigits, 16)
        : Integer.parseInt(match.group(2), 10);
  }

  /** Tells whether a code point belongs to the XML 1.0 {@code Char} production. */
  private static boolean isValidXmlChar(final int codePoint) {
    return codePoint == 0x9
        || codePoint == 0xA
        || codePoint == 0xD
        || (codePoint >= 0x20 && codePoint <= 0xD7FF)
        || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
        || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
  }
}
|