/*
 * [DeEntifyStrings.java]
 *
 * Summary: Strips HTML entities such as &quot; from a string, replacing them by their Unicode equivalents.
 *
 * Copyright: (c) 2002-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.6 2009-04-05 StripEntities now leaves a space behind when it removes a
 *                 etc tag.
 *  2.7 2009-11-14 generate a table for the HTML cheat sheet of quote-like entities.
 *  2.8 2009-12-22 export table on HTML 5 entities. Now import csv file rather than embed entity facts.
 *  2.9 2010-01-29 export XHTML entities (currently same as HTML-4 entities).
 *  3.0 2011-02-10 rename to deEntify, delete deprecated methods.
 *  3.1 2011-09-02 correct error in tables reversing Y dieresis and y dieresis. Correct upper/lower categories in table.
 */
package com.mindprod.entities;

import com.mindprod.common18.ST;

import java.util.HashMap;

/**
 * Strips HTML entities such as {@code &quot;} from a string, replacing them by their Unicode equivalents.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.1 2011-09-02 correct error in tables reversing Y dieresis and y dieresis. Correct upper/lower categories
 * in table.
 * @see DeEntify
 * @see DeEntifyStrings
 * @see Entify
 * @see EntifyStrings
 * @see Flatten
 * @since 2002-07-14
 */
public class DeEntifyStrings {
    /**
     * Unicode nbsp (non-breaking space) char, decimal 160, hex 0xa0.
     * The {@code 0x0a} in the constant's name does not match the value; the name is kept for compatibility.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    public static final char UNICODE_NBSP_160_0x0a = 160;

    /**
     * Longest an HTML4 entity can be, at least in our tables, including the lead &amp; and trail ;.
     * The longest HTML4 entity is {@code &thetasym;} (10 characters).
     * Used by {@code deEntifyHTML} as the size of the window scanned after each ampersand.
     *
     * @noinspection WeakerAccess, JavadocReference, WeakerAccess
     */
    public static final int LONGEST_HTML4_ENTITY = "&thetasym;".length();

    /**
     * Longest an HTML5 entity can be, at least in our tables, including the lead &amp; and trail ;.
     * The longest HTML5 entity is {@code &CounterClockwiseContourIntegral;} (33 characters).
     *
     * @noinspection WeakerAccess, JavadocReference, WeakerAccess
     */
    public static final int LONGEST_HTML5_ENTITY = "&CounterClockwiseContourIntegral;".length();

    /**
     * The shortest an entity can be {@value #SHORTEST_HTML4_ENTITY}, at least in our tables, including the lead
     * &amp; and trailing ;.
     *
     * @noinspection WeakerAccess, JavadocReference, WeakerAccess
     */
    public static final int SHORTEST_HTML4_ENTITY = 4;/* &lt; */

    /**
     * The shortest an entity can be {@value #SHORTEST_HTML5_ENTITY}, at least in our tables, including the lead
     * &amp; and trailing ;.
     *
     * @noinspection WeakerAccess, JavadocReference, WeakerAccess
     */
    public static final int SHORTEST_HTML5_ENTITY = 4;/* &lt; */

    /**
     * true to enable the testing code.
     */
    private static final boolean DEBUGGING = false;

    /**
     * tags, that when removed should leave a space behind.
     */
    private static final String[] spacingTags = { "tr", "td", "th", "p", "br", "dl", "dt", "li" };

    /**
     * allows lookup by entity name (without the lead &amp; and trail ;), to get the corresponding char.
     * Loaded from two hard-coded generated arrays burned into this class.
     * Does not deal with HTML5 entities.
     */
    private static final HashMap<String, Character> entityToChar;

    static {
        // build HashMap to look up entity name to get corresponding Unicode
        // char number. Following code generated by Entities.
        String[] entityKeys = {
                // W A R N I N G ! _ M A N U A L L Y _ I N S E R T E D _ G E N E R A T E D _ C O D E
                // generated by Entities.
Insert from com\mindprod\entities\entitiesjustkeys.javafrag "AElig" /* 198 : Æ Latin capital letter AE */, "Aacute" /* 193 : Á Latin capital letter A with acute */, "Acirc" /* 194 : Â Latin capital letter A with circumflex */, "Agrave" /* 192 : À Latin capital letter A with grave */, "Alpha" /* 913 : Α Greek capital letter Alpha */, "Aring" /* 197 : Å Latin capital letter A with ring above */, "Atilde" /* 195 : Ã Latin capital letter A with tilde */, "Auml" /* 196 : Ä Latin capital letter A with diaeresis */, "Beta" /* 914 : Β Greek capital letter Beta */, "Ccedil" /* 199 : Ç Latin capital letter C with cedilla */, "Chi" /* 935 : Χ Greek capital letter Chi */, "Dagger" /* 8225 : ‡ double dagger */, "Delta" /* 916 : Δ Greek capital letter Delta */, "ETH" /* 208 : Ð Latin capital letter Eth */, "Eacute" /* 201 : É Latin capital letter E with acute */, "Ecirc" /* 202 : Ê Latin capital letter E with circumflex */, "Egrave" /* 200 : È Latin capital letter E with grave */, "Epsilon" /* 917 : Ε Greek capital letter Epsilon */, "Eta" /* 919 : Η Greek capital letter Eta */, "Euml" /* 203 : Ë Latin capital letter E with diaeresis */, "Gamma" /* 915 : Γ Greek capital letter Gamma */, "Iacute" /* 205 : Í Latin capital letter I with acute */, "Icirc" /* 206 : Î Latin capital letter I with circumflex */, "Igrave" /* 204 : Ì Latin capital letter I with grave */, "Iota" /* 921 : Ι Greek capital letter Iota */, "Iuml" /* 207 : Ï Latin capital letter I with diaeresis */, "Kappa" /* 922 : Κ Greek capital letter Kappa */, "Lambda" /* 923 : Λ Greek capital letter Lambda */, "Mu" /* 924 : Μ Greek capital letter Mu */, "Ntilde" /* 209 : Ñ Latin capital letter N with tilde */, "Nu" /* 925 : Ν Greek capital letter Nu */, "OElig" /* 338 : Œ Latin capital ligature oe */, "Oacute" /* 211 : Ó Latin capital letter O with acute */, "Ocirc" /* 212 : Ô Latin capital letter O with circumflex */, "Ograve" /* 210 : Ò Latin capital letter O with grave */, "Omega" /* 937 : Ω Greek capital letter Omega 
*/, "Omicron" /* 927 : Ο Greek capital letter Omicron */, "Oslash" /* 216 : Ø Latin capital letter O with stroke */, "Otilde" /* 213 : Õ Latin capital letter O with tilde */, "Ouml" /* 214 : Ö Latin capital letter O with diaeresis */, "Phi" /* 934 : Φ Greek capital letter Phi */, "Pi" /* 928 : Π Greek capital letter Pi */, "Prime" /* 8243 : ″ double prime */, "Psi" /* 936 : Ψ Greek capital letter Psi */, "Rho" /* 929 : Ρ Greek capital letter Rho */, "Scaron" /* 352 : Š Latin capital letter S with caron */, "Sigma" /* 931 : Σ Greek capital letter Sigma */, "THORN" /* 222 : Þ Latin capital letter Thorn */, "Tau" /* 932 : Τ Greek capital letter Tau */, "Theta" /* 920 : Θ Greek capital letter Theta */, "Uacute" /* 218 : Ú Latin capital letter U with acute */, "Ucirc" /* 219 : Û Latin capital letter U with circumflex */, "Ugrave" /* 217 : Ù Latin capital letter U with grave */, "Upsilon" /* 933 : Υ Greek capital letter Upsilon */, "Uuml" /* 220 : Ü Latin capital letter U with diaeresis */, "Xi" /* 926 : Ξ Greek capital letter Xi */, "Yacute" /* 221 : Ý Latin capital letter Y with acute */, "Yuml" /* 376 : Ÿ Latin capital letter Y with diaeresis */, "Zeta" /* 918 : Ζ Greek capital letter Zeta */, "aacute" /* 225 : á Latin small letter a with acute */, "acirc" /* 226 : â Latin small letter a with circumflex */, "acute" /* 180 : ´ acute accent */, "aelig" /* 230 : æ Latin lowercase ligature ae */, "agrave" /* 224 : à Latin small letter a with grave */, "alefsym" /* 8501 : ℵ alef symbol */, "alpha" /* 945 : α Greek small letter alpha */, "amp" /* 38 : & ampersand */, "and" /* 8743 : ∧ logical and */, "ang" /* 8736 : ∠ angle */, "aring" /* 229 : å Latin small letter a with ring above */, "asymp" /* 8776 : ≈ asymptotic to */, "atilde" /* 227 : ã Latin small letter a with tilde */, "auml" /* 228 : ä Latin small letter a with diaeresis */, "bdquo" /* 8222 : „ double low-99 quotation mark */, "beta" /* 946 : β Greek small letter beta */, "brvbar" /* 166 : ¦ broken bar */, "bull" 
/* 8226 : • bullet */, "cap" /* 8745 : ∩ intersection */, "ccedil" /* 231 : ç Latin small letter c with cedilla */, "cedil" /* 184 : ¸ cedilla */, "cent" /* 162 : ¢ cent sign */, "chi" /* 967 : χ Greek small letter chi */, "circ" /* 710 : ˆ modifier letter circumflex accent */, "clubs" /* 9827 : ♣ black club suit */, "cong" /* 8773 : ≅ congruent to */, "copy" /* 169 : © copyright sign circled c */, "crarr" /* 8629 : ↵ downwards arrow with corner leftwards */, "cup" /* 8746 : ∪ union */, "curren" /* 164 : ¤ currency sign */, "dArr" /* 8659 : ⇓ downwards double arrow */, "dagger" /* 8224 : † dagger */, "darr" /* 8595 : ↓ downwards arrow */, "deg" /* 176 : ° degree sign */, "delta" /* 948 : δ Greek small letter delta */, "diams" /* 9830 : ♦ black diamond suit */, "divide" /* 247 : ÷ division sign */, "eacute" /* 233 : é Latin small letter e with acute */, "ecirc" /* 234 : ê Latin small letter e with circumflex */, "egrave" /* 232 : è Latin small letter e with grave */, "empty" /* 8709 : ∅ empty set */, "emsp" /* 8195 : em space */, "ensp" /* 8194 : en space */, "epsilon" /* 949 : ε Greek small letter epsilon */, "equiv" /* 8801 : ≡ identical to */, "eta" /* 951 : η Greek small letter eta */, "eth" /* 240 : ð Latin small letter eth */, "euml" /* 235 : ë Latin small letter e with diaeresis */, "euro" /* 8364 : € Euro currency sign */, "exist" /* 8707 : ∃ there exists */, "fnof" /* 402 : ƒ Latin small letter f with hook */, "forall" /* 8704 : ∀ for all */, "frac12" /* 189 : ½ vulgar fraction 1/2 */, "frac14" /* 188 : ¼ vulgar fraction 1/4 */, "frac34" /* 190 : ¾ vulgar fraction 3/4 */, "frasl" /* 8260 : ⁄ fraction slash */, "gamma" /* 947 : γ Greek small letter gamma */, "ge" /* 8805 : ≥ greater-than or equal to */, "gt" /* 62 : > greater-than sign */, "hArr" /* 8660 : ⇔ left right double arrow */, "harr" /* 8596 : ↔ left right arrow */, "hearts" /* 9829 : ♥ black heart suit */, "hellip" /* 8230 : … horizontal ellipsis */, "iacute" /* 237 : í Latin small letter i with 
acute */, "icirc" /* 238 : î Latin small letter i with circumflex */, "iexcl" /* 161 : ¡ inverted exclamation mark */, "igrave" /* 236 : ì Latin small letter i with grave */, "image" /* 8465 : ℑ black-letter capital i */, "infin" /* 8734 : ∞ infinity */, "int" /* 8747 : ∫ integral */, "iota" /* 953 : ι Greek small letter iota */, "iquest" /* 191 : ¿ inverted question mark */, "isin" /* 8712 : ∈ element of */, "iuml" /* 239 : ï Latin small letter i with diaeresis */, "kappa" /* 954 : κ Greek small letter kappa */, "lArr" /* 8656 : ⇐ leftwards double arrow */, "lambda" /* 955 : λ Greek small letter lambda */, "lang" /* 9001 : 〈 left-pointing angle bracket */, "laquo" /* 171 : « left guillemot */, "larr" /* 8592 : ← leftwards arrow */, "lceil" /* 8968 : ⌈ left ceiling */, "ldquo" /* 8220 : “ left double-66 quotation mark */, "le" /* 8804 : ≤ less-than or equal to */, "lfloor" /* 8970 : ⌊ left floor */, "lowast" /* 8727 : ∗ asterisk operator */, "loz" /* 9674 : ◊ open lozenge */, "lrm" /* 8206 : ‎ left-to-right mark */, "lsaquo" /* 8249 : ‹ single left-pointing angle quotation mark */, "lsquo" /* 8216 : ‘ left single-6 quotation mark */, "lt" /* 60 : < less-than sign */, "macr" /* 175 : ¯ macron */, "mdash" /* 8212 : — em dash */, "micro" /* 181 : µ micro sign */, "middot" /* 183 : · middle dot */, "minus" /* 8722 : − minus sign */, "mu" /* 956 : μ Greek small letter mu */, "nabla" /* 8711 : ∇ nabla */, "nbsp" /* 160 : non-breaking space */, "ndash" /* 8211 : – en dash */, "ne" /* 8800 : ≠ not equal to */, "ni" /* 8715 : ∋ like backwards epsilon */, "not" /* 172 : ¬ not sign */, "notin" /* 8713 : ∉ not an element of */, "nsub" /* 8836 : ⊄ not a subset of */, "ntilde" /* 241 : ñ Latin small letter n with tilde */, "nu" /* 957 : ν Greek small letter nu */, "oacute" /* 243 : ó Latin small letter o with acute */, "ocirc" /* 244 : ô Latin small letter o with circumflex */, "oelig" /* 339 : œ Latin small ligature oe */, "ograve" /* 242 : ò Latin small letter o with grave */, 
"oline" /* 8254 : ‾ overline */, "omega" /* 969 : ω Greek small letter omega */, "omicron" /* 959 : ο Greek small letter omicron */, "oplus" /* 8853 : ⊕ circled plus */, "or" /* 8744 : ∨ vee */, "ordf" /* 170 : ª feminine ordinal indicator */, "ordm" /* 186 : º masculine ordinal indicator */, "oslash" /* 248 : ø Latin small letter o with stroke */, "otilde" /* 245 : õ Latin small letter o with tilde */, "otimes" /* 8855 : ⊗ circled times */, "ouml" /* 246 : ö Latin small letter o with diaeresis */, "para" /* 182 : ¶ pilcrow sign */, "part" /* 8706 : ∂ partial differential */, "permil" /* 8240 : ‰ per mille sign */, "perp" /* 8869 : ⊥ up tack */, "phi" /* 966 : φ Greek small letter phi */, "pi" /* 960 : π Greek small letter pi */, "piv" /* 982 : ϖ Greek pi symbol */, "plusmn" /* 177 : ± plus-minus sign */, "pound" /* 163 : £ pound sign */, "prime" /* 8242 : ′ prime */, "prod" /* 8719 : ∏ n-ary product */, "prop" /* 8733 : ∝ proportional to */, "psi" /* 968 : ψ Greek small letter psi */, "quot" /* 34 : " quotation mark */, "rArr" /* 8658 : ⇒ rightwards double arrow */, "radic" /* 8730 : √ square root */, "rang" /* 9002 : 〉 right-pointing angle bracket */, "raquo" /* 187 : » right guillemot */, "rarr" /* 8594 : → rightwards arrow */, "rceil" /* 8969 : ⌉ right ceiling */, "rdquo" /* 8221 : ” right double-99 quotation mark */, "real" /* 8476 : ℜ black-letter capital r */, "reg" /* 174 : ® registered sign. circled R. 
*/, "rfloor" /* 8971 : ⌋ right floor */, "rho" /* 961 : ρ Greek small letter rho */, "rlm" /* 8207 : ‏ right-to-left mark */, "rsaquo" /* 8250 : › single right-pointing angle quotation mark */, "rsquo" /* 8217 : ’ right single-9 quotation mark */, "sbquo" /* 8218 : ‚ single low-9 quotation mark */, "scaron" /* 353 : š Latin small letter s with caron */, "sdot" /* 8901 : ⋅ dot operator */, "sect" /* 167 : § section sign */, "shy" /* 173 : soft hyphen */, "sigma" /* 963 : σ Greek small letter sigma */, "sigmaf" /* 962 : ς Greek small letter final sigma */, "sim" /* 8764 : ∼ tilde operator */, "spades" /* 9824 : ♠ black spade suit */, "sub" /* 8834 : ⊂ subset of */, "sube" /* 8838 : ⊆ subset of or equal to */, "sum" /* 8721 : ∑ n-ary summation */, "sup1" /* 185 : ¹ superscript one */, "sup2" /* 178 : ² superscript two */, "sup3" /* 179 : ³ superscript three */, "sup" /* 8835 : ⊃ superset of */, "supe" /* 8839 : ⊇ superset of or equal to */, "szlig" /* 223 : ß Latin small letter sharp s */, "tau" /* 964 : τ Greek small letter tau */, "there4" /* 8756 : ∴ therefore three dots */, "theta" /* 952 : θ Greek small letter theta */, "thetasym" /* 977 : ϑ Greek theta symbol */, "thinsp" /* 8201 : thin space */, "thorn" /* 254 : þ Latin small letter thorn */, "tilde" /* 732 : ˜ small tilde */, "times" /* 215 : × multiplication sign */, "trade" /* 8482 : ™ trademark sign */, "uArr" /* 8657 : ⇑ upwards double arrow */, "uacute" /* 250 : ú Latin small letter u with acute */, "uarr" /* 8593 : ↑ upwards arrow */, "ucirc" /* 251 : û Latin small letter u with circumflex */, "ugrave" /* 249 : ù Latin small letter u with grave */, "uml" /* 168 : ¨ diaeresis */, "upsih" /* 978 : ϒ Greek upsilon with hook symbol */, "upsilon" /* 965 : υ Greek small letter upsilon */, "uuml" /* 252 : ü Latin small letter u with diaeresis */, "weierp" /* 8472 : ℘ script capital p */, "xi" /* 958 : ξ Greek small letter xi */, "yacute" /* 253 : ý Latin small letter y with acute */, "yen" /* 165 : ¥ yen sign 
*/, "yuml" /* 255 : ÿ Latin small letter y with diaeresis */, "zeta" /* 950 : ζ Greek small letter zeta */, "zwj" /* 8205 : ‍ zero width joiner */, "zwnj" /* 8204 : ‌ zero width non-joiner */, }; char[] entityValues = { // W A R N I N G ! _ M A N U A L L Y _ I N S E R T E D _ G E N E R A T E D _ C O D E // generated by Entities. Insert from com\mindprod\entities\entitiesjustkeys.javafrag 198 /* Æ : Æ Latin capital letter AE */, 193 /* Á : Á Latin capital letter A with acute */, 194 /* Â : Â Latin capital letter A with circumflex */, 192 /* À : À Latin capital letter A with grave */, 913 /* Α : Α Greek capital letter Alpha */, 197 /* Å : Å Latin capital letter A with ring above */, 195 /* Ã : Ã Latin capital letter A with tilde */, 196 /* Ä : Ä Latin capital letter A with diaeresis */, 914 /* Β : Β Greek capital letter Beta */, 199 /* Ç : Ç Latin capital letter C with cedilla */, 935 /* Χ : Χ Greek capital letter Chi */, 8225 /* ‡ : ‡ double dagger */, 916 /* Δ : Δ Greek capital letter Delta */, 208 /* Ð : Ð Latin capital letter Eth */, 201 /* É : É Latin capital letter E with acute */, 202 /* Ê : Ê Latin capital letter E with circumflex */, 200 /* È : È Latin capital letter E with grave */, 917 /* Ε : Ε Greek capital letter Epsilon */, 919 /* Η : Η Greek capital letter Eta */, 203 /* Ë : Ë Latin capital letter E with diaeresis */, 915 /* Γ : Γ Greek capital letter Gamma */, 205 /* Í : Í Latin capital letter I with acute */, 206 /* Î : Î Latin capital letter I with circumflex */, 204 /* Ì : Ì Latin capital letter I with grave */, 921 /* Ι : Ι Greek capital letter Iota */, 207 /* Ï : Ï Latin capital letter I with diaeresis */, 922 /* Κ : Κ Greek capital letter Kappa */, 923 /* Λ : Λ Greek capital letter Lambda */, 924 /* Μ : Μ Greek capital letter Mu */, 209 /* Ñ : Ñ Latin capital letter N with tilde */, 925 /* Ν : Ν Greek capital letter Nu */, 338 /* Œ : Œ Latin capital ligature oe */, 211 /* Ó : Ó Latin capital letter O with acute */, 212 /* Ô : Ô Latin capital 
letter O with circumflex */, 210 /* Ò : Ò Latin capital letter O with grave */, 937 /* Ω : Ω Greek capital letter Omega */, 927 /* Ο : Ο Greek capital letter Omicron */, 216 /* Ø : Ø Latin capital letter O with stroke */, 213 /* Õ : Õ Latin capital letter O with tilde */, 214 /* Ö : Ö Latin capital letter O with diaeresis */, 934 /* Φ : Φ Greek capital letter Phi */, 928 /* Π : Π Greek capital letter Pi */, 8243 /* ″ : ″ double prime */, 936 /* Ψ : Ψ Greek capital letter Psi */, 929 /* Ρ : Ρ Greek capital letter Rho */, 352 /* Š : Š Latin capital letter S with caron */, 931 /* Σ : Σ Greek capital letter Sigma */, 222 /* Þ : Þ Latin capital letter Thorn */, 932 /* Τ : Τ Greek capital letter Tau */, 920 /* Θ : Θ Greek capital letter Theta */, 218 /* Ú : Ú Latin capital letter U with acute */, 219 /* Û : Û Latin capital letter U with circumflex */, 217 /* Ù : Ù Latin capital letter U with grave */, 933 /* Υ : Υ Greek capital letter Upsilon */, 220 /* Ü : Ü Latin capital letter U with diaeresis */, 926 /* Ξ : Ξ Greek capital letter Xi */, 221 /* Ý : Ý Latin capital letter Y with acute */, 376 /* Ÿ : Ÿ Latin capital letter Y with diaeresis */, 918 /* Ζ : Ζ Greek capital letter Zeta */, 225 /* á : á Latin small letter a with acute */, 226 /* â : â Latin small letter a with circumflex */, 180 /* ´ : ´ acute accent */, 230 /* æ : æ Latin lowercase ligature ae */, 224 /* à : à Latin small letter a with grave */, 8501 /* ℵ : ℵ alef symbol */, 945 /* α : α Greek small letter alpha */, 38 /* & : & ampersand */, 8743 /* ∧ : ∧ logical and */, 8736 /* ∠ : ∠ angle */, 229 /* å : å Latin small letter a with ring above */, 8776 /* ≈ : ≈ asymptotic to */, 227 /* ã : ã Latin small letter a with tilde */, 228 /* ä : ä Latin small letter a with diaeresis */, 8222 /* „ : „ double low-99 quotation mark */, 946 /* β : β Greek small letter beta */, 166 /* ¦ : ¦ broken bar */, 8226 /* • : • bullet */, 8745 /* ∩ : ∩ intersection */, 231 /* ç : ç Latin small letter c with cedilla */, 184 /* ¸ 
: ¸ cedilla */, 162 /* ¢ : ¢ cent sign */, 967 /* χ : χ Greek small letter chi */, 710 /* ˆ : ˆ modifier letter circumflex accent */, 9827 /* ♣ : ♣ black club suit */, 8773 /* ≅ : ≅ congruent to */, 169 /* © : © copyright sign circled c */, 8629 /* ↵ : ↵ downwards arrow with corner leftwards */, 8746 /* ∪ : ∪ union */, 164 /* ¤ : ¤ currency sign */, 8659 /* ⇓ : ⇓ downwards double arrow */, 8224 /* † : † dagger */, 8595 /* ↓ : ↓ downwards arrow */, 176 /* ° : ° degree sign */, 948 /* δ : δ Greek small letter delta */, 9830 /* ♦ : ♦ black diamond suit */, 247 /* ÷ : ÷ division sign */, 233 /* é : é Latin small letter e with acute */, 234 /* ê : ê Latin small letter e with circumflex */, 232 /* è : è Latin small letter e with grave */, 8709 /* ∅ : ∅ empty set */, 8195 /* : em space */, 8194 /* : en space */, 949 /* ε : ε Greek small letter epsilon */, 8801 /* ≡ : ≡ identical to */, 951 /* η : η Greek small letter eta */, 240 /* ð : ð Latin small letter eth */, 235 /* ë : ë Latin small letter e with diaeresis */, 8364 /* € : € Euro currency sign */, 8707 /* ∃ : ∃ there exists */, 402 /* ƒ : ƒ Latin small letter f with hook */, 8704 /* ∀ : ∀ for all */, 189 /* ½ : ½ vulgar fraction 1/2 */, 188 /* ¼ : ¼ vulgar fraction 1/4 */, 190 /* ¾ : ¾ vulgar fraction 3/4 */, 8260 /* ⁄ : ⁄ fraction slash */, 947 /* γ : γ Greek small letter gamma */, 8805 /* ≥ : ≥ greater-than or equal to */, 62 /* > : > greater-than sign */, 8660 /* ⇔ : ⇔ left right double arrow */, 8596 /* ↔ : ↔ left right arrow */, 9829 /* ♥ : ♥ black heart suit */, 8230 /* … : … horizontal ellipsis */, 237 /* í : í Latin small letter i with acute */, 238 /* î : î Latin small letter i with circumflex */, 161 /* ¡ : ¡ inverted exclamation mark */, 236 /* ì : ì Latin small letter i with grave */, 8465 /* ℑ : ℑ black-letter capital i */, 8734 /* ∞ : ∞ infinity */, 8747 /* ∫ : ∫ integral */, 953 /* ι : ι Greek small letter iota */, 191 /* ¿ : ¿ inverted question mark */, 8712 /* ∈ : ∈ element of */, 239 /* ï : ï Latin 
small letter i with diaeresis */, 954 /* κ : κ Greek small letter kappa */, 8656 /* ⇐ : ⇐ leftwards double arrow */, 955 /* λ : λ Greek small letter lambda */, 9001 /* ⟨ : 〈 left-pointing angle bracket */, 171 /* « : « left guillemot */, 8592 /* ← : ← leftwards arrow */, 8968 /* ⌈ : ⌈ left ceiling */, 8220 /* “ : “ left double-66 quotation mark */, 8804 /* ≤ : ≤ less-than or equal to */, 8970 /* ⌊ : ⌊ left floor */, 8727 /* ∗ : ∗ asterisk operator */, 9674 /* ◊ : ◊ open lozenge */, 8206 /* ‎ : ‎ left-to-right mark */, 8249 /* ‹ : ‹ single left-pointing angle quotation mark */, 8216 /* ‘ : ‘ left single-6 quotation mark */, 60 /* < : < less-than sign */, 175 /* ¯ : ¯ macron */, 8212 /* — : — em dash */, 181 /* µ : µ micro sign */, 183 /* · : · middle dot */, 8722 /* − : − minus sign */, 956 /* μ : μ Greek small letter mu */, 8711 /* ∇ : ∇ nabla */, 160 /* : non-breaking space */, 8211 /* – : – en dash */, 8800 /* ≠ : ≠ not equal to */, 8715 /* ∋ : ∋ like backwards epsilon */, 172 /* ¬ : ¬ not sign */, 8713 /* ∉ : ∉ not an element of */, 8836 /* ⊄ : ⊄ not a subset of */, 241 /* ñ : ñ Latin small letter n with tilde */, 957 /* ν : ν Greek small letter nu */, 243 /* ó : ó Latin small letter o with acute */, 244 /* ô : ô Latin small letter o with circumflex */, 339 /* œ : œ Latin small ligature oe */, 242 /* ò : ò Latin small letter o with grave */, 8254 /* ‾ : ‾ overline */, 969 /* ω : ω Greek small letter omega */, 959 /* ο : ο Greek small letter omicron */, 8853 /* ⊕ : ⊕ circled plus */, 8744 /* ∨ : ∨ vee */, 170 /* ª : ª feminine ordinal indicator */, 186 /* º : º masculine ordinal indicator */, 248 /* ø : ø Latin small letter o with stroke */, 245 /* õ : õ Latin small letter o with tilde */, 8855 /* ⊗ : ⊗ circled times */, 246 /* ö : ö Latin small letter o with diaeresis */, 182 /* ¶ : ¶ pilcrow sign */, 8706 /* ∂ : ∂ partial differential */, 8240 /* ‰ : ‰ per mille sign */, 8869 /* ⊥ : ⊥ up tack */, 966 /* φ : φ Greek small letter phi */, 960 /* π : π Greek small 
letter pi */, 982 /* ϖ : ϖ Greek pi symbol */, 177 /* ± : ± plus-minus sign */, 163 /* £ : £ pound sign */, 8242 /* ′ : ′ prime */, 8719 /* ∏ : ∏ n-ary product */, 8733 /* ∝ : ∝ proportional to */, 968 /* ψ : ψ Greek small letter psi */, 34 /* " : " quotation mark */, 8658 /* ⇒ : ⇒ rightwards double arrow */, 8730 /* √ : √ square root */, 9002 /* ⟩ : 〉 right-pointing angle bracket */, 187 /* » : » right guillemot */, 8594 /* → : → rightwards arrow */, 8969 /* ⌉ : ⌉ right ceiling */, 8221 /* ” : ” right double-99 quotation mark */, 8476 /* ℜ : ℜ black-letter capital r */, 174 /* ® : ® registered sign. circled R. */, 8971 /* ⌋ : ⌋ right floor */, 961 /* ρ : ρ Greek small letter rho */, 8207 /* ‏ : ‏ right-to-left mark */, 8250 /* › : › single right-pointing angle quotation mark */, 8217 /* ’ : ’ right single-9 quotation mark */, 8218 /* ‚ : ‚ single low-9 quotation mark */, 353 /* š : š Latin small letter s with caron */, 8901 /* ⋅ : ⋅ dot operator */, 167 /* § : § section sign */, 173 /* : soft hyphen */, 963 /* σ : σ Greek small letter sigma */, 962 /* ς : ς Greek small letter final sigma */, 8764 /* ∼ : ∼ tilde operator */, 9824 /* ♠ : ♠ black spade suit */, 8834 /* ⊂ : ⊂ subset of */, 8838 /* ⊆ : ⊆ subset of or equal to */, 8721 /* ∑ : ∑ n-ary summation */, 185 /* ¹ : ¹ superscript one */, 178 /* ² : ² superscript two */, 179 /* ³ : ³ superscript three */, 8835 /* ⊃ : ⊃ superset of */, 8839 /* ⊇ : ⊇ superset of or equal to */, 223 /* ß : ß Latin small letter sharp s */, 964 /* τ : τ Greek small letter tau */, 8756 /* ∴ : ∴ therefore three dots */, 952 /* θ : θ Greek small letter theta */, 977 /* ϑ : ϑ Greek theta symbol */, 8201 /* : thin space */, 254 /* þ : þ Latin small letter thorn */, 732 /* ˜ : ˜ small tilde */, 215 /* × : × multiplication sign */, 8482 /* ™ : ™ trademark sign */, 8657 /* ⇑ : ⇑ upwards double arrow */, 250 /* ú : ú Latin small letter u with acute */, 8593 /* ↑ : ↑ upwards arrow */, 251 /* û : û Latin small letter u with circumflex */, 249 
/* ù : ù Latin small letter u with grave */, 168 /* ¨ : ¨ diaeresis */, 978 /* ϒ : ϒ Greek upsilon with hook symbol */, 965 /* υ : υ Greek small letter upsilon */, 252 /* ü : ü Latin small letter u with diaeresis */, 8472 /* ℘ : ℘ script capital p */, 958 /* ξ : ξ Greek small letter xi */, 253 /* ý : ý Latin small letter y with acute */, 165 /* ¥ : ¥ yen sign */, 255 /* ÿ : ÿ Latin small letter y with diaeresis */, 950 /* ζ : ζ Greek small letter zeta */, 8205 /* ‍ : ‍ zero width joiner */, 8204 /* ‌ : ‌ zero width non-joiner */, }; // allow 50% extra space for faster lookup. entityToChar = new HashMap<>( entityKeys.length * 150 / 100 ); for ( int i = 0; i < entityKeys.length; i++ ) { // leave out nbsp so it can be specially handled if entity not found. if ( !entityKeys[ i ].equals( "nbsp" ) ) { entityToChar.put( entityKeys[ i ], entityValues[ i ] ); } // add also ' for strip but not insert. optional for XML, not used in HTML. entityToChar.put( "apos", ( char ) 39 ); } } // end static /** * Checks a number of gauntlet conditions to ensure this is a valid entity. Converts Entity to corresponding char. * Does not deal with HTML5 entities. * * @param possBareEntityWithSemicolon string that may hold an entity. Lead & must be stripped, * but may optionally contain text past the ; * @param translateNbspTo char you would like nbsp translated to, usually ' ' or (char) 160 . * * @return corresponding unicode character, or 0 if the entity is invalid. 
* @noinspection WeakerAccess */ protected static char possBareHTMLEntityWithSemicolonToChar( String possBareEntityWithSemicolon, char translateNbspTo ) { if ( possBareEntityWithSemicolon.length() < SHORTEST_HTML4_ENTITY - 1 ) { return 0; } // find the trailing ; int whereSemi = possBareEntityWithSemicolon .indexOf( ';', SHORTEST_HTML4_ENTITY - 2/* where start looking */ ); if ( whereSemi < SHORTEST_HTML4_ENTITY - 2 ) { return 0; } return bareHTMLEntityToChar( possBareEntityWithSemicolon.substring( 0, whereSemi ), translateNbspTo ); } /** * Prepares tags for removal, to ensure they are replaced by a space *

--> _ ' ' ) { // insert space before < sb.append( ' ' ); } break; } } } sb.append( c ); prevChar = c; } return sb.toString(); } /** * remove all text between <applet.. </applet>, <style... </style> <script... </script> * * @param s HTML string to strip tag pairs out of. * * @return string with tag pairs stripped out. */ private static String stripHTMLTagPairs( String s ) { String[] tags = { "applet", "APPLET", "style", "STYLE", "script", "SCRIPT" }; for ( final String tag : tags ) { final String beginTag = "<" + tag; final String endTag = ""; int begin = 0; while ( begin < s.length() && ( begin = s.indexOf( beginTag, begin ) ) >= 0 ) { final int end; if ( ( end = s.indexOf( endTag, begin + beginTag.length() ) ) > 0 ) { // chop out the s = s.substring( 0, begin ) + s.substring( end + endTag.length() ); } else { // no matching end tag, chop off entire end s = s.substring( 0, begin ); } } } return s; } /** * Removes tags from HTML leaving just the raw text. Leaves entities as is, e.g. Presumes perfectly formed HTML. * etc removed leaving nothing behind. * * @param html input HTML or XML * * @return raw text, with whitespaces collapsed to a single space, trimmed. * @noinspection WeakerAccess */ private static String stripIndividualTags( String html ) { html = html.trim(); // condition String so that some tags will always turn into space. html = preStripIndividualTags( html ); int numChars = html.length(); // will only shrink. Don't use FastCat final StringBuilder sb = new StringBuilder( numChars ); /** * are we inside a tag, eg. inside */ boolean inside = false; /** * Have we cleaned any White Space? */ boolean cleanedAnyWhitespace = false; /** * Was the last char we saw a space? We use this to collapse spaces. 
*/ boolean lastCharSpace = false; for ( int i = 0; i < numChars; i++ ) { char c = html.charAt( i ); switch ( c ) { default: if ( c < ' ' ) { // handle stray whitespace if ( !inside ) { lastCharSpace = true; cleanedAnyWhitespace = true; } } else { // ordinary character, ignored inside a tag if ( !inside ) { if ( lastCharSpace ) { // deal with pending whitespace sb.append( ' ' ); lastCharSpace = false; } sb.append( c ); } } break; case '<': inside = true; // ignore break; case '>': inside = false; // ignore break; case ' ': if ( !inside ) { lastCharSpace = true; } break; // whitespace case '\r': case '\t': case '\n': case 127: case UNICODE_NBSP_160_0x0a: if ( !inside ) { lastCharSpace = true; cleanedAnyWhitespace = true; } break; } } // end for // return original string if we did not really change anything final String result = ( cleanedAnyWhitespace || sb.length() != numChars ) ? sb .toString() : html; return ST.condense( result ); // collapse multiple spaces. } /** * convert an entity to a single char. Does not deal with HTML5 entities. * * @param bareEntity String entity to convert convert. must have lead & and trail ; stripped; may have * form: #x12ff or #123 or lt or nbsp * style entity. Works faster if entity in lower case. * @param howToTranslateNbsp char you would like translated to, usually ' ' or (char) 160 * * @return equivalent character. 0 if not recognised. * @noinspection WeakerAccess */ public static char bareHTMLEntityToChar( String bareEntity, char howToTranslateNbsp ) { // first check for alpha entity Character code = entityToChar.get( bareEntity ); if ( code != null ) { return code; } code = entityToChar.get( bareEntity.toLowerCase() ); if ( code != null ) { return code; } // nbsp is not in hashMap. We test for it specially. 
if ( bareEntity.length() == 4 && bareEntity.equals( "nbsp" ) || bareEntity.equals( "NBSP" ) ) { return howToTranslateNbsp; } // check at least have &_#1_; (no & or ; at this point ) if ( bareEntity.length() < 2 ) { return 0; } try { if ( bareEntity.charAt( 0 ) == '#' ) { final char secondChar = bareEntity.charAt( 1 ); if ( secondChar == 'x' || secondChar == 'X' ) { // handle hex entities of form &_#x12ff_; // ensure at least have &_#xf_; if ( bareEntity.length() < 3 ) { return 0; } // had &_#x123D_; return ( char ) Integer.parseInt( bareEntity.substring( 2 ), /* hex */ 16 ); } else { // handle decimal entities // had &_#123_; return ( char ) Integer.parseInt( bareEntity.substring( 1 ) ); } } else { // some unrecognized/malformed bareEntity return 0; } } catch ( NumberFormatException e ) { return 0; } } // end entityToChar /** * Converts HTML to text converting entities such as " back to " and < back to < Ordinary text passes * unchanged. Also strips decimal and hex entities and stray HTML entities. Does not deal with HTML5 entities. * * @param text raw text to be processed. Must not be null. * @param translateNbspTo char you would like translated to, usually ' ' or (char) 160 . * * @return translated text. It also handles HTML 4.0 entities such as ♥ { and -> 160. * null input returns null. 
* @noinspection WeakerAccess */ public static String deEntifyHTML( String text, char translateNbspTo ) { if ( text == null ) { return null; } if ( text.indexOf( '&' ) < 0 ) { // are no entities, nothing to do return text; } int originalTextLength = text.length(); StringBuilder sb = new StringBuilder( originalTextLength ); for ( int i = 0; i < originalTextLength; i++ ) { int whereAmp = text.indexOf( '&', i ); if ( whereAmp < 0 ) { // no more &s, we are done // append all remaining text sb.append( text.substring( i ) ); break; } else { // append all text to left of next & sb.append( text.substring( i, whereAmp ) ); // avoid reprocessing those chars i = whereAmp; // text.charAt(i) is an & // possEntity has lead & stripped. String possEntity = text.substring( i + 1, Math.min( i + LONGEST_HTML4_ENTITY, text.length() ) ); char t = possBareHTMLEntityWithSemicolonToChar( possEntity, translateNbspTo ); if ( t != 0 ) { // was a good entity, keep its equivalent char. sb.append( t ); // avoid reprocessing chars forming the entity int whereSemi = possEntity.indexOf( ";", SHORTEST_HTML4_ENTITY - 2 ); i += whereSemi + 1; } else { // treat & just as ordinary character sb.append( '&' ); } } // end else } // end for // if result is not shorter, we did not do anything. Saves RAM. return ( sb.length() == originalTextLength ) ? text : sb.toString(); } // end stripEntities /** * Converts XML to text converting entities such as " back to " and < back to < Ordinary text passes * unchanged. Also strips decimal and hex entities and stray HTML entities. * * @param text raw XML text to be processed. Must not be null. * * @return translated text. null input returns null. * @noinspection WeakerAccess */ public static String deEntifyXML( String text ) { return deEntifyHTML( text, ' ' ); } /** * strips tags and entities from HTML. Does not deal with HTML5 entities. Leaves \n \r unchanged. 
* * @param text to flatten * @param translateNbspTo char you would like translated to, usually ' ' or (char) 160 . * * @return flattened text * @noinspection WeakerAccess */ public static String flattenHTML( String text, char translateNbspTo ) { return deEntifyHTML( stripHTMLTags( text ), translateNbspTo ); } /** * strips tags and entities from XML.. * * @param text to flatten * * @return flattened text * @noinspection WeakerAccess */ public static String flattenXML( String text ) { return deEntifyXML( stripXMLTags( text ) ); } /** * Checks a number of gauntlet conditions to ensure this is a valid entity. Converts Entity to corresponding char. * Does not deal with HTML5 entities. * * @param possBareEntityWithSemicolon string that may hold an entity. Lead & must be stripped, * but may optionally contain text past the ; * * @return corresponding unicode character, or 0 if the entity is invalid. nbsp -> (char) 160 * @noinspection WeakerAccess */ public static char possEntityToChar( String possBareEntityWithSemicolon ) { return possBareHTMLEntityWithSemicolonToChar( possBareEntityWithSemicolon, UNICODE_NBSP_160_0x0a ); } /** * Removes tags from HTML leaving just the raw text. Leaves entities as is, e.g. does not convert & back to &. * similar to code in Quoter. Also removes  comments. Presumes perfectly formed HTML, no > in * comments, all <...> balanced. Also removes text between applet, style and script tag pairs. * Leaves and other entities as is. Does not deal with HTML5 entities. * * @param html input HTML * * @return raw text, with whitespaces collapsed to a single space, trimmed. * @noinspection WeakerAccess */ public static String stripHTMLTags( String html ) { assert html != null : "attempt to strip HTML tags from a null String"; html = stripHTMLTagPairs( html ); return stripIndividualTags( html ); } /** * Removes tags from XML leaving just the raw text. Leaves entities as is, e.g. does not convert & back to &. * similar to code in Quoter. 
Also removes  comments. Presumes perfectly formed XML, no > in
     * comments, all <...> balanced.
     * Leaves entities as is.
     *
     * @param xml input XML. Must not be null.
     *
     * @return raw text, with whitespaces collapsed to a single space, trimmed.
     * @noinspection WeakerAccess
     */
    public static String stripXMLTags( String xml ) {
        assert xml != null : "attempt to strip XML tags from a null String";
        // XML needs only the individual-tag pass.
        final String tagless = stripIndividualTags( xml );
        return tagless;
    }

    //    /**
    //     * Test harness.
    //
    //     * @param args not used.
    //
    //     * @noinspection ConstantConditions
    //     */
    //    public static void main
    //    ( String[] args )
    //    {
    //    if ( DEBUGGING )
    //    {
    //    out.println( deEntifyHTML( " Bed & Breakfast ", ' ' ) );
    //    out.println( stripHTMLTags( " big blue " ) );
    //    out.println( stripHTMLTags( "big\nblue" ) );
    //    out.println( stripHTMLTags( "big\nblue" ) );
    //    out.println( stripHTMLTags( "big blue" ) );
    //    out.println( stripHTMLTags( "big
blue" ) ); // out.println( stripHTMLTags( "big\n
blue" ) ); // } // } } // end DeEntifyStrings