@ -6,254 +6,392 @@ import java.util.Map;
import java.util.Set ;
import java.util.regex.Pattern ;
/**
 * Cleans text so that it can be embedded in an XML document: removes characters and character
 * references that XML 1.0 does not allow, rewrites entity references that XML does not define,
 * and escapes bare ampersands.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

  /**
   * Matches, at the start of the examined text, a numeric character reference made of 2 to 4
   * decimal digits (optionally prefixed with {@code x}). Such references are left untouched.
   */
  private static final Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");

  /**
   * Matches two-character numeric references whose first digit is {@code 1}, e.g. {@code &#x1F;}
   * or {@code &#11;}. See https://www.w3.org/TR/REC-xml/#charsets.
   *
   * <p>NOTE(review): because the {@code x} is optional this also matches the decimal references
   * {@code &#10;} and {@code &#13;}, which denote legal XML characters (LF, CR) - confirm that
   * dropping them is intended.
   */
  private static final Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");

  /**
   * Matches every code point that is NOT an allowed XML 1.0 character. Allowed are: #x9 | #xA |
   * #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. Unpaired surrogates are not
   * allowed and therefore match.
   */
  private static final Pattern invalidCharacterPattern =
      Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]");

  /** Text inserted in place of an ampersand that does not start an entity reference. */
  private static final String ESCAPED_AMPERSAND = "&amp;";

  /** The four entity references that are kept verbatim. */
  private static final Set<String> goodEntities = new HashSet<>();

  /** Maps entity references that XML does not define to their replacement text. */
  private static final Map<String, String> badEntities = new HashMap<>();

  static {
    // pre-defined XML entities
    goodEntities.add("&quot;"); // quotation mark
    goodEntities.add("&amp;"); // ampersand
    goodEntities.add("&lt;"); // less-than sign
    goodEntities.add("&gt;"); // greater-than sign

    // control entities &#127; .. &#159; (illegal HTML characters) become a single space
    // NOTE(review): cleanAllEntities skips decimal references of 2-4 digits before the table
    // lookup, so these keys are presumably never consulted from there - confirm whether they
    // are still needed.
    for (int code = 127; code <= 159; code++) {
      badEntities.put("&#" + code + ";", " ");
    }

    // misc entities
    badEntities.put("&euro;", "\u20AC"); // euro
    badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
    badEntities.put("&rsquo;", "\u2019"); // right single quotation mark

    // Latin 1 entities: the name at index k stands for the character U+00A0 + k
    final String[] latin1Names = {
      "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
      "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
      "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
      "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
      "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
      "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
      "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
      "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
      "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
      "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
      "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
      "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };
    for (int k = 0; k < latin1Names.length; k++) {
      badEntities.put("&" + latin1Names[k] + ";", String.valueOf((char) (0xA0 + k)));
    }
  }

  /**
   * Makes the entity references of a string acceptable to an XML parser.
   *
   * <p>Processing steps, in order:
   *
   * <ol>
   *   <li>references matched by the control-character pattern and characters outside the XML
   *       character range are removed;
   *   <li>numeric references of 2 to 4 decimal digits are kept as they are;
   *   <li>every other {@code &name;} is kept if it is one of {@code &amp;}, {@code &quot;},
   *       {@code &lt;}, {@code &gt;}, replaced by its unicode equivalent if one is known, and
   *       removed otherwise;
   *   <li>an ampersand that is not followed by letters/digits and a {@code ;} is escaped as
   *       {@code &amp;};
   *   <li>the sequences {@code <<} and {@code >>} are escaped as {@code &lt;&lt;} and
   *       {@code &gt;&gt;}.
   * </ol>
   *
   * @param broken the string to handle entities; may be {@code null}
   * @return the string with entities appropriately fixed up, or {@code null} if {@code broken}
   *     is {@code null}
   */
  public static String cleanAllEntities(final String broken) {
    if (broken == null) {
      return null;
    }

    String working = invalidControlCharPattern.matcher(broken).replaceAll("");
    working = invalidCharacterPattern.matcher(working).replaceAll("");

    // Everything before this index has already been cleaned.
    int cleanfrom = 0;
    while (true) {
      final int amp = working.indexOf('&', cleanfrom);
      // If there are no more amps then we are done
      if (amp == -1) {
        break;
      }

      // Skip references of the kind &#ddd;
      if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
        cleanfrom = working.indexOf(';', amp) + 1;
        continue;
      }

      // Walk over the candidate entity name that follows the ampersand.
      int end = amp + 1;
      while (end < working.length() && Character.isLetterOrDigit(working.charAt(end))) {
        end++;
      }

      if (end < working.length() && working.charAt(end) == ';') {
        // A complete entity: keep it, translate it or drop it.
        final String replace = handleEntity(working.substring(amp, end + 1));
        working = working.substring(0, amp) + replace + working.substring(end + 1);
        // Resume right after the replacement. When the entity was dropped this is the same
        // index again, so a directly following '&' is not skipped.
        cleanfrom = amp + replace.length();
      } else {
        // The name ended without a closing ';' (or the string ended): it is a bare '&'.
        working = working.substring(0, amp) + ESCAPED_AMPERSAND + working.substring(amp + 1);
        // Resume at the character that terminated the name; it may itself be an '&'.
        cleanfrom = amp + ESCAPED_AMPERSAND.length();
      }
    }

    working = working.replace("<<", "&lt;&lt;");
    working = working.replace(">>", "&gt;&gt;");

    return working;
  }

  /**
   * Decides what a single entity reference is replaced with. XML only allows 4 entities:
   * {@code &amp;}, {@code &quot;}, {@code &lt;} and {@code &gt;}.
   *
   * @param entity the entity to be replaced, including the leading {@code &} and trailing
   *     {@code ;}
   * @return the substitution for the entity: the entity itself if XML allows it, its mapped
   *     replacement if one is registered, or an empty string otherwise
   */
  private static String handleEntity(final String entity) {
    if (goodEntities.contains(entity)) {
      return entity;
    }

    final String replace = badEntities.get(entity);
    return replace != null ? replace : "";
  }
}