/* * [EntifyStrings.java] * * Summary: Inserts HTML entities such as " into a String replacing the single character equivalents. * * Copyright: (c) 2005-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.6 2009-04-05 StripEntities now leaves a space behind when it removes a

etc tag. * 2.7 2009-11-14 generate a table for the HTML cheat sheet of quote-like entities. * 2.8 2009-12-22 export table on HTML 5 entities. Now import csv file rather than embed entity facts. * 2.9 2010-01-29 export XHTML entities (currently same as HTML-4 entities). * 3.0 2011-01-05 remove deprecated methods. add toHTMLEntity, toXMLEntity, hide charToHTMLEntity, charToXMLEntity * 3.1 2011-02-10 rename methods to htmlEntify and xmlEntify */ package com.mindprod.entities; /** * Inserts HTML entities such as " into a String replacing the single character equivalents. *

* Entities are coded inline in a giant case. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2011-02-10 rename methods to entifyHTML and entifyXML * @see DeEntify * @see DeEntifyStrings * @see Entify * @see EntifyStrings * @see Flatten * @since 2005 */ public class EntifyStrings { /** * /** should we generate hex entities in preference to decimal ones? */ private static boolean preferHexEntities = true; /** * convert a single char to its equivalent HTML entity. Ordinary chars are not changed. 160 -> weird chars * -> &#nnn; form * * @param c Char to convert * * @return equivalent string e.g. &, null means leave char as is. */ private static String charToHTMLEntity( char c ) { switch ( c ) { default: if ( c < 127 ) { // leave alone as equivalent string. return null; // faster than String.valueOf( c ).intern(); } else { if ( preferHexEntities ) { // default, compact, no lead 0 return "&#x" + Integer.toHexString( c ) + ";"; } else { //use the decimal &#nnn; form, compact, no lead 0. return "&#" + Integer.toString( c ) + ";"; } } // end default case // W A R N I N G ! _ M A N U A L L Y _ I N S E R T E D _ G E N E R A T E D _ C O D E // this code generated by Entities, include from com\mindprod\entities\entitiescase.javafrag // Could be more efficiently handled with a lookup table[9831] // manually sorted numerically. case 34: return """ /* " quotation mark */; case 38: return "&" /* & ampersand */; case 60: return "<" /* < less-than sign */; case 62: return ">" /* > greater-than sign */; case 160: return " " /* non-breaking space */; case 161: return "¡" /* ¡ inverted exclamation mark */; case 162: return "¢" /* ¢ cent sign */; case 163: return "£" /* £ pound sign */; case 164: return "¤" /* ¤ currency sign */; case 165: return "¥" /* ¥ yen sign */; case 166: return "¦" /* ¦ broken bar */; case 167: return "§" /* § section sign */; case 168: return "¨" /* ¨ diaeresis */; case 169: return "©" /* © copyright sign circled c */; case 170: return "ª" /* ª feminine ordinal indicator */; case 171: return "«" /* « left guillemot */; case 172: return "¬" /* ¬ not sign */; case 173: return "" /* soft hyphen */; case 174: return "®" /* ® registered sign. circled R. */; case 175: return "¯" /* ¯ macron */; case 176: return "°" /* ° degree sign */; case 177: return "±" /* ± plus-minus sign */; case 178: return "²" /* ² superscript two */; case 179: return "³" /* ³ superscript three */; case 180: return "´" /* ´ acute accent */; case 181: return "µ" /* µ micro sign */; case 182: return "¶" /* ¶ pilcrow sign */; case 183: return "·" /* · middle dot */; case 184: return "¸" /* ¸ cedilla */; case 185: return "¹" /* ¹ superscript one */; case 186: return "º" /* º masculine ordinal indicator */; case 187: return "»" /* » right guillemot */; case 188: return "¼" /* ¼ vulgar fraction 1/4 */; case 189: return "½" /* ½ vulgar fraction 1/2 */; case 190: return "¾" /* ¾ vulgar fraction 3/4 */; case 191: return "¿" /* ¿ inverted question mark */; case 192: return "À" /* À Latin capital letter A with grave */; case 193: return "Á" /* Á Latin capital letter A with acute */; case 194: return "Â" /* Â Latin capital letter A with circumflex */; case 195: return "Ã" /* Ã Latin capital letter A with tilde */; case 196: return "Ä" /* Ä Latin capital letter A with diaeresis */; case 197: return "Å" /* Å Latin capital letter A with ring above */; case 198: return "Æ" /* Æ Latin capital letter AE */; case 199: return "Ç" /* Ç Latin capital letter C with cedilla */; case 200: return "È" /* È Latin capital letter E with grave */; case 201: return "É" /* É Latin capital letter E with acute */; case 202: return "Ê" /* Ê Latin capital letter E with circumflex */; case 203: return "Ë" /* Ë Latin capital letter E with diaeresis */; case 204: return "Ì" /* Ì Latin capital letter I with grave */; case 205: return "Í" /* Í Latin capital letter I with acute */; case 206: return "Î" /* Î Latin capital letter I with circumflex */; case 207: return "Ï" /* Ï Latin capital letter I with diaeresis */; case 208: return "Ð" /* Ð Latin capital letter Eth */; case 209: return "Ñ" /* Ñ Latin capital letter N with tilde */; case 210: return "Ò" /* Ò Latin capital letter O with grave */; case 211: return "Ó" /* Ó Latin capital letter O with acute */; case 212: return "Ô" /* Ô Latin capital letter O with circumflex */; case 213: return "Õ" /* Õ Latin capital letter O with tilde */; case 214: return "Ö" /* Ö Latin capital letter O with diaeresis */; case 215: return "×" /* × multiplication sign */; case 216: return "Ø" /* Ø Latin capital letter O with stroke */; case 217: return "Ù" /* Ù Latin capital letter U with grave */; case 218: return "Ú" /* Ú Latin capital letter U with acute */; case 219: return "Û" /* Û Latin capital letter U with circumflex */; case 220: return "Ü" /* Ü Latin capital letter U with diaeresis */; case 221: return "Ý" /* Ý Latin capital letter Y with acute */; case 222: return "Þ" /* Þ Latin capital letter Thorn */; case 223: return "ß" /* ß Latin small letter sharp s */; case 224: return "à" /* à Latin small letter a with grave */; case 225: return "á" /* á Latin small letter a with acute */; case 226: return "â" /* â Latin small letter a with circumflex */; case 227: return "ã" /* ã Latin small letter a with tilde */; case 228: return "ä" /* ä Latin small letter a with diaeresis */; case 229: return "å" /* å Latin small letter a with ring above */; case 230: return "æ" /* æ Latin lowercase ligature ae */; case 231: return "ç" /* ç Latin small letter c with cedilla */; case 232: return "è" /* è Latin small letter e with grave */; case 233: return "é" /* é Latin small letter e with acute */; case 234: return "ê" /* ê Latin small letter e with circumflex */; case 235: return "ë" /* ë Latin small letter e with diaeresis */; case 236: return "ì" /* ì Latin small letter i with grave */; case 237: return "í" /* í Latin small letter i with acute */; case 238: return "î" /* î Latin small letter i with circumflex */; case 239: return "ï" /* ï Latin small letter i with diaeresis */; case 240: return "ð" /* ð Latin small letter eth */; case 241: return "ñ" /* ñ Latin small letter n with tilde */; case 242: return "ò" /* ò Latin small letter o with grave */; case 243: return "ó" /* ó Latin small letter o with acute */; case 244: return "ô" /* ô Latin small letter o with circumflex */; case 245: return "õ" /* õ Latin small letter o with tilde */; case 246: return "ö" /* ö Latin small letter o with diaeresis */; case 247: return "÷" /* ÷ division sign */; case 248: return "ø" /* ø Latin small letter o with stroke */; case 249: return "ù" /* ù Latin small letter u with grave */; case 250: return "ú" /* ú Latin small letter u with acute */; case 251: return "û" /* û Latin small letter u with circumflex */; case 252: return "ü" /* ü Latin small letter u with diaeresis */; case 253: return "ý" /* ý Latin small letter y with acute */; case 254: return "þ" /* þ Latin small letter thorn */; case 255: return "ÿ" /* ÿ Latin small letter y with diaeresis */; case 338: return "Œ" /* Œ Latin capital ligature oe */; case 339: return "œ" /* œ Latin small ligature oe */; case 352: return "Š" /* Š Latin capital letter S with caron */; case 353: return "š" /* š Latin small letter s with caron */; case 376: return "Ÿ" /* Ÿ Latin capital letter Y with diaeresis */; case 402: return "ƒ" /* ƒ Latin small letter f with hook */; case 710: return "ˆ" /* ˆ modifier letter circumflex accent */; case 732: return "˜" /* ˜ small tilde */; case 913: return "Α" /* Α Greek capital letter Alpha */; case 914: return "Β" /* Β Greek capital letter Beta */; case 915: return "Γ" /* Γ Greek capital letter Gamma */; case 916: return "Δ" /* Δ Greek capital letter Delta */; case 917: return "Ε" /* Ε Greek capital letter Epsilon */; case 918: return "Ζ" /* Ζ Greek capital letter Zeta */; case 919: return "Η" /* Η Greek capital letter Eta */; case 920: return "Θ" /* Θ Greek capital letter Theta */; case 921: return "Ι" /* Ι Greek capital letter Iota */; case 922: return "Κ" /* Κ Greek capital letter Kappa */; case 923: return "Λ" /* Λ Greek capital letter Lambda */; case 924: return "Μ" /* Μ Greek capital letter Mu */; case 925: return "Ν" /* Ν Greek capital letter Nu */; case 926: return "Ξ" /* Ξ Greek capital letter Xi */; case 927: return "Ο" /* Ο Greek capital letter Omicron */; case 928: return "Π" /* Π Greek capital letter Pi */; case 929: return "Ρ" /* Ρ Greek capital letter Rho */; case 931: return "Σ" /* Σ Greek capital letter Sigma */; case 932: return "Τ" /* Τ Greek capital letter Tau */; case 933: return "Υ" /* Υ Greek capital letter Upsilon */; case 934: return "Φ" /* Φ Greek capital letter Phi */; case 935: return "Χ" /* Χ Greek capital letter Chi */; case 936: return "Ψ" /* Ψ Greek capital letter Psi */; case 937: return "Ω" /* Ω Greek capital letter Omega */; case 945: return "α" /* α Greek small letter alpha */; case 946: return "β" /* β Greek small letter beta */; case 947: return "γ" /* γ Greek small letter gamma */; case 948: return "δ" /* δ Greek small letter delta */; case 949: return "ε" /* ε Greek small letter epsilon */; case 950: return "ζ" /* ζ Greek small letter zeta */; case 951: return "η" /* η Greek small letter eta */; case 952: return "θ" /* θ Greek small letter theta */; case 953: return "ι" /* ι Greek small letter iota */; case 954: return "κ" /* κ Greek small letter kappa */; case 955: return "λ" /* λ Greek small letter lambda */; case 956: return "μ" /* μ Greek small letter mu */; case 957: return "ν" /* ν Greek small letter nu */; case 958: return "ξ" /* ξ Greek small letter xi */; case 959: return "ο" /* ο Greek small letter omicron */; case 960: return "π" /* π Greek small letter pi */; case 961: return "ρ" /* ρ Greek small letter rho */; case 962: return "ς" /* ς Greek small letter final sigma */; case 963: return "σ" /* σ Greek small letter sigma */; case 964: return "τ" /* τ Greek small letter tau */; case 965: return "υ" /* υ Greek small letter upsilon */; case 966: return "φ" /* φ Greek small letter phi */; case 967: return "χ" /* χ Greek small letter chi */; case 968: return "ψ" /* ψ Greek small letter psi */; case 969: return "ω" /* ω Greek small letter omega */; case 977: return "ϑ" /* ϑ Greek theta symbol */; case 978: return "ϒ" /* ϒ Greek upsilon with hook symbol */; case 982: return "ϖ" /* ϖ Greek pi symbol */; case 8194: return " " /* en space */; case 8195: return " " /* em space */; case 8201: return " " /* thin space */; case 8204: return "‌" /* ‌ zero width non-joiner */; case 8205: return "‍" /* ‍ zero width joiner */; case 8206: return "‎" /* ‎ left-to-right mark */; case 8207: return "‏" /* ‏ right-to-left mark */; case 8211: return "–" /* – en dash */; case 8212: return "—" /* — em dash */; case 8216: return "‘" /* ‘ left single-6 quotation mark */; case 8217: return "’" /* ’ right single-9 quotation mark */; case 8218: return "‚" /* ‚ single low-9 quotation mark */; case 8220: return "“" /* “ left double-66 quotation mark */; case 8221: return "”" /* ” right double-99 quotation mark */; case 8222: return "„" /* „ double low-99 quotation mark */; case 8224: return "†" /* † dagger */; case 8225: return "‡" /* ‡ double dagger */; case 8226: return "•" /* • bullet */; case 8230: return "…" /* … horizontal ellipsis */; case 8240: return "‰" /* ‰ per mille sign */; case 8242: return "′" /* ′ prime */; case 8243: return "″" /* ″ double prime */; case 8249: return "‹" /* ‹ single left-pointing angle quotation mark */; case 8250: return "›" /* › single right-pointing angle quotation mark */; case 8254: return "‾" /* ‾ overline */; case 8260: return "⁄" /* ⁄ fraction slash */; case 8364: return "€" /* € Euro currency sign */; case 8465: return "ℑ" /* ℑ black-letter capital i */; case 8472: return "℘" /* ℘ script capital p */; case 8476: return "ℜ" /* ℜ black-letter capital r */; case 8482: return "™" /* ™ trademark sign */; case 8501: return "ℵ" /* ℵ alef symbol */; case 8592: return "←" /* ← leftwards arrow */; case 8593: return "↑" /* ↑ upwards arrow */; case 8594: return "→" /* → rightwards arrow */; case 8595: return "↓" /* ↓ downwards arrow */; case 8596: return "↔" /* ↔ left right arrow */; case 8629: return "↵" /* ↵ downwards arrow with corner leftwards */; case 8656: return "⇐" /* ⇐ leftwards double arrow */; case 8657: return "⇑" /* ⇑ upwards double arrow */; case 8658: return "⇒" /* ⇒ rightwards double arrow */; case 8659: return "⇓" /* ⇓ downwards double arrow */; case 8660: return "⇔" /* ⇔ left right double arrow */; case 8704: return "∀" /* ∀ for all */; case 8706: return "∂" /* ∂ partial differential */; case 8707: return "∃" /* ∃ there exists */; case 8709: return "∅" /* ∅ empty set */; case 8711: return "∇" /* ∇ nabla */; case 8712: return "∈" /* ∈ element of */; case 8713: return "∉" /* ∉ not an element of */; case 8715: return "∋" /* ∋ like backwards epsilon */; case 8719: return "∏" /* ∏ n-ary product */; case 8721: return "∑" /* ∑ n-ary summation */; case 8722: return "−" /* − minus sign */; case 8727: return "∗" /* ∗ asterisk operator */; case 8730: return "√" /* √ square root */; case 8733: return "∝" /* ∝ proportional to */; case 8734: return "∞" /* ∞ infinity */; case 8736: return "∠" /* ∠ angle */; case 8743: return "∧" /* ∧ logical and */; case 8744: return "∨" /* ∨ vee */; case 8745: return "∩" /* ∩ intersection */; case 8746: return "∪" /* ∪ union */; case 8747: return "∫" /* ∫ integral */; case 8756: return "∴" /* ∴ therefore three dots */; case 8764: return "∼" /* ∼ tilde operator */; case 8773: return "≅" /* ≅ congruent to */; case 8776: return "≈" /* ≈ asymptotic to */; case 8800: return "≠" /* ≠ not equal to */; case 8801: return "≡" /* ≡ identical to */; case 8804: return "≤" /* ≤ less-than or equal to */; case 8805: return "≥" /* ≥ greater-than or equal to */; case 8834: return "⊂" /* ⊂ subset of */; case 8835: return "⊃" /* ⊃ superset of */; case 8836: return "⊄" /* ⊄ not a subset of */; case 8838: return "⊆" /* ⊆ subset of or equal to */; case 8839: return "⊇" /* ⊇ superset of or equal to */; case 8853: return "⊕" /* ⊕ circled plus */; case 8855: return "⊗" /* ⊗ circled times */; case 8869: return "⊥" /* ⊥ up tack */; case 8901: return "⋅" /* ⋅ dot operator */; case 8968: return "⌈" /* ⌈ left ceiling */; case 8969: return "⌉" /* ⌉ right ceiling */; case 8970: return "⌊" /* ⌊ left floor */; case 8971: return "⌋" /* ⌋ right floor */; case 9001: return "⟨" /* 〈 left-pointing angle bracket */; case 9002: return "⟩" /* 〉 right-pointing angle bracket */; case 9674: return "◊" /* ◊ open lozenge */; case 9824: return "♠" /* ♠ black spade suit */; case 9827: return "♣" /* ♣ black club suit */; case 9829: return "♥" /* ♥ black heart suit */; case 9830: return "♦" /* ♦ black diamond suit */; } // end switch // can't fall out bottom } // end charToEntity /** * convert a single char to its equivalent XML entity. Ordinary chars are not changed. 160 -> weird chars * -> &#nnn; form * * @param c Char to convert * * @return equivalent string e.g. &, null means leave char as is. Does not return unmodified letters. */ private static String charToXMLEntity ( char c ) { switch ( c ) { default: if ( c < 127 ) { // leave alone as equivalent string. return null; // faster than String.valueOf( c ).intern(); } else { // use the &#nnn; form return "&#" + Integer.toString( c ) + ";"; } // do NOT modify the following code. It is not generated. case 34: return """/* " quotation mark */; case 38: return "&"/* & ampersand */; // case 39: // don't use apos, to make more compatible with HTML // return "'"/* ' apos */ case 60: return "<"/* < less-than sign */; case 62: return ">"/* > greater-than sign */; case 160: return " "; } // end switch // can't fall out bottom } // end charToXMLEntity /** * Converts text to HTML by quoting dangerous characters. Text must not already contain entities. e.g. " ==> " * < ==> < ordinary text passes unchanged. Does not convert space to * * @param text raw text to be processed. Must not be null. * @param xmlStyle true if insert basic four XML enities, otherwise use full HTML-4 set. * * @return translated text, or null if input is null. * @noinspection WeakerAccess */ private static String entifyHTMLorXML( final String text, final boolean xmlStyle ) { if ( text == null ) { return null; } int originalTextLength = text.length(); // estimate text will grow by no more than 10% StringBuilder sb = new StringBuilder( originalTextLength * 110 / 100 ); int charsToAppend = 0; for ( int i = 0; i < originalTextLength; i++ ) { char c = text.charAt( i ); String entity = xmlStyle ? charToXMLEntity( c ) : charToHTMLEntity( c ); if ( entity == null ) { // we could sb.append( c ), but that would be slower // than saving them up for a big append. charsToAppend++; } else { if ( charsToAppend != 0 ) { sb.append( text.substring( i - charsToAppend, i ) ); charsToAppend = 0; } sb.append( entity ); } } // end for // append chars to the right of the last entity. if ( charsToAppend != 0 ) { sb.append( text.substring( originalTextLength - charsToAppend, originalTextLength ) ); } // if result is not longer, we did not do anything. Save RAM by returning the original return ( sb.length() == originalTextLength ) ? text : sb.toString(); } /** * Converts text to HTML by quoting dangerous characters. Text must not already contain entities. e.g. " ==> " * < ==> < ordinary text passes unchanged. Does not convert space to * * @param text raw text to be processed. Must not be null. * * @return translated HTML text, or null if input is null. * @noinspection WeakerAccess */ public static String entifyHTML( String text ) { return entifyHTMLorXML( text, false ); } /** * Converts text to XML by quoting dangerous characters. * Text must not already contain entities. e.g. " ==> " * < ==> < ordinary text passes unchanged. Does not convert space to * * @param text raw text to be processed. Must not be null. * * @return translated XML text, or null if input is null. * @noinspection WeakerAccess */ public static String entifyXML( String text ) { return entifyHTMLorXML( text, true ); } /** * Can this character be included in an HTML document without entifying it? * * @param c character is question * * @return true if char is simple, false if char needs entity */ public static boolean isSimple( char c ) { return ' ' <= c && c <= '~' && !( c == '\"' || c == '&' || c == '<' || c == '>' ); } /** * We should generate hex entities in preference to decimal ones. */ public static void preferDecimalEntities() { preferHexEntities = false; } /** * We should generate decimal entities in preference to hex ones. */ public static void preferHexEntities() { preferHexEntities = true; } /** * Converts text to HTML by quoting dangerous characters.; * Does not convert space to * * @param c raw character. * * @return translated HTML text, eg. & -> & x -> x * @noinspection WeakerAccess */ public static String toHTMLEntity( char c ) { final String result = charToHTMLEntity( c ); return result == null ? String.valueOf( c ) : result; } /** * Converts text to HTML by converting all characters to 香 form * * @param c raw character. * * @return translated HTML text, eg. space --> * @noinspection WeakerAccess */ public static String toHexEntity( char c ) { return "&#x" + Integer.toHexString( c ); } /** * Converts text to XML by quoting dangerous characters.; * Does not convert space to * * @param c raw character. * * @return translated HTML text, eg. < -> < x -> x * @noinspection WeakerAccess */ public static String toXMLEntity( char c ) { final String result = charToXMLEntity( c ); return result == null ? String.valueOf( c ) : result; } // /** test harness // // * @param args not used // */ // public static void main ( String[] args ) // { // // String s = "abc\"&xyz"; // out.println( entifyHTMLorXML( s , false )) ; // } }