package eu.dnetlib.dhp.collection.worker.utils;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Repairs text that is about to be parsed as XML: strips characters that XML 1.0 forbids,
 * resolves HTML-style named entities to their Unicode characters, drops numeric character
 * references that point at forbidden characters and escapes bare ampersands.
 *
 * <p>The class holds no mutable state after class initialisation; all methods are static.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

  /** Escaped form of a bare ampersand. */
  private static final String ESCAPED_AMPERSAND = "&amp;";

  /**
   * A decimal ({@code &#NNN;}) or hexadecimal ({@code &#xHHH;}) character reference. Group 1
   * holds the hexadecimal digits, group 2 the decimal digits. The digit count is capped at 8 so
   * the value always fits in a {@code long}.
   */
  private static final Pattern NUMERIC_REFERENCE_PATTERN =
      Pattern.compile("&#(?:x([0-9a-fA-F]{1,8})|([0-9]{1,8}));");

  /**
   * Pattern that negates the allowable XML characters. Valid are: #x9 | #xA | #xD |
   * [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. See
   * https://www.w3.org/TR/REC-xml/#charsets. Unpaired surrogates fall outside every range and
   * are therefore matched.
   */
  private static final Pattern INVALID_CHARACTER_PATTERN =
      Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]");

  /**
   * Names of the HTML Latin-1 entities in code point order, starting at U+00A0 ({@code nbsp})
   * and ending at U+00FF ({@code yuml}). Each row of eight starts at the code point noted.
   */
  private static final String[] LATIN1_ENTITY_NAMES = {
    /* U+00A0 */ "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
    /* U+00A8 */ "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
    /* U+00B0 */ "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
    /* U+00B8 */ "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
    /* U+00C0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
    /* U+00C8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
    /* U+00D0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
    /* U+00D8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
    /* U+00E0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
    /* U+00E8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
    /* U+00F0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
    /* U+00F8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
  };

  /** Entities that XML itself predefines; they are passed through unchanged. */
  private static final Set<String> GOOD_ENTITIES = new HashSet<>();

  /** Entities that XML does not know, mapped to the text that replaces them. */
  private static final Map<String, String> BAD_ENTITIES = new HashMap<>();

  static {
    // pre-defined XML entities (XML 1.0, section 4.6)
    GOOD_ENTITIES.add("&quot;"); // quotation mark
    GOOD_ENTITIES.add("&amp;"); // ampersand
    GOOD_ENTITIES.add("&lt;"); // less-than sign
    GOOD_ENTITIES.add("&gt;"); // greater-than sign
    GOOD_ENTITIES.add("&apos;"); // apostrophe

    // control entities &#127; .. &#159; (illegal HTML characters) become a blank
    for (int code = 127; code <= 159; code++) {
      BAD_ENTITIES.put("&#" + code + ";", " ");
    }

    // misc entities
    BAD_ENTITIES.put("&euro;", "\u20AC"); // euro
    BAD_ENTITIES.put("&lsquo;", "\u2018"); // left single quotation mark
    BAD_ENTITIES.put("&rsquo;", "\u2019"); // right single quotation mark

    // Latin 1 entities: the n-th name maps to code point U+00A0 + n
    for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
      BAD_ENTITIES.put("&" + LATIN1_ENTITY_NAMES[i] + ";", String.valueOf((char) (0xA0 + i)));
    }
  }

  /**
   * Cleans a string so that it only contains characters, entities and character references
   * that an XML parser accepts.
   *
   * <ul>
   *   <li>Characters outside the XML 1.0 character range are removed.
   *   <li>Numeric character references listed in the replacement table are replaced, other
   *       references to valid XML characters are kept, and references to invalid characters
   *       (for example {@code &#11;} or {@code &#x1F;}) are removed.
   *   <li>The five predefined XML entities ({@code &amp;}, {@code &quot;}, {@code &apos;},
   *       {@code &lt;} and {@code &gt;}) are kept.
   *   <li>Known HTML entities are replaced with their Unicode character, unknown entities are
   *       removed.
   *   <li>Every bare {@code &} is replaced with {@code &amp;}.
   *   <li>Finally each {@code <<} becomes {@code &lt;&lt;} and each {@code >>} becomes
   *       {@code &gt;&gt;}.
   * </ul>
   *
   * @param broken the string to handle entities, may be {@code null}
   * @return the string with entities appropriately fixed up, or {@code null} if the input was
   *     {@code null}
   */
  public static String cleanAllEntities(final String broken) {
    if (broken == null) {
      return null;
    }

    final String text = INVALID_CHARACTER_PATTERN.matcher(broken).replaceAll("");
    final int length = text.length();
    final StringBuilder cleaned = new StringBuilder(length + 16);
    final Matcher numericReference = NUMERIC_REFERENCE_PATTERN.matcher(text);

    // Single left-to-right pass over the input; output is never re-scanned, so replacement
    // text cannot be mistaken for a new entity and no input character is skipped.
    int pos = 0;
    while (pos < length) {
      final int amp = text.indexOf('&', pos);
      if (amp < 0) {
        cleaned.append(text, pos, length);
        break;
      }
      cleaned.append(text, pos, amp);

      // References of the kind &#ddd; or &#xhhh;
      if (numericReference.region(amp, length).lookingAt()) {
        cleaned.append(handleNumericReference(numericReference));
        pos = numericReference.end();
        continue;
      }

      // Walk over the candidate entity name.
      int end = amp + 1;
      while (end < length && Character.isLetterOrDigit(text.charAt(end))) {
        end++;
      }

      if (end < length && text.charAt(end) == ';') {
        // A complete entity: keep, translate or drop it.
        cleaned.append(handleEntity(text.substring(amp, end + 1)));
        pos = end + 1;
      } else {
        // No closing ';' - this was a bare '&'. Resume right behind it so that the
        // character that ended the name (possibly another '&') is examined as well.
        cleaned.append(ESCAPED_AMPERSAND);
        pos = amp + 1;
      }
    }

    return cleaned.toString().replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
  }

  /**
   * Decides what to emit for a numeric character reference.
   *
   * @param reference a matcher positioned on a successful {@link #NUMERIC_REFERENCE_PATTERN}
   *     match
   * @return the replacement from the table if the reference is listed there, the reference
   *     itself if it denotes a valid XML character, otherwise an empty string
   */
  private static String handleNumericReference(final Matcher reference) {
    final String text = reference.group();
    final String mapped = BAD_ENTITIES.get(text);
    if (mapped != null) {
      return mapped;
    }
    final String hexDigits = reference.group(1);
    final long codePoint =
        hexDigits != null ? Long.parseLong(hexDigits, 16) : Long.parseLong(reference.group(2));
    return isXmlChar(codePoint) ? text : "";
  }

  /**
   * Tests a code point against the XML 1.0 {@code Char} production.
   *
   * @param codePoint the value to test
   * @return {@code true} if the value is #x9, #xA, #xD or lies in [#x20-#xD7FF],
   *     [#xE000-#xFFFD] or [#x10000-#x10FFFF]
   */
  private static boolean isXmlChar(final long codePoint) {
    return codePoint == 0x9
        || codePoint == 0xA
        || codePoint == 0xD
        || (codePoint >= 0x20 && codePoint <= 0xD7FF)
        || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
        || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
  }

  /**
   * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise
   * strip it out.
   *
   * @param entity the entity to be replaced, including the leading {@code &} and trailing
   *     {@code ;}
   * @return the substitution for the entity, either itself, the unicode equivalent or an empty
   *     string.
   */
  private static String handleEntity(final String entity) {
    if (GOOD_ENTITIES.contains(entity)) {
      return entity;
    }
    final String replacement = BAD_ENTITIES.get(entity);
    return replacement != null ? replacement : "";
  }
}