/*
* [DeEntifyStrings.java]
*
* Summary: Strips HTML entities such as &quot; from a string, replacing them by their Unicode equivalents.
*
* Copyright: (c) 2002-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 2.6 2009-04-05 StripEntities now leaves a space behind when it removes a <p>, <br>, <td> etc. tag.
* 2.7 2009-11-14 generate a table for the HTML cheat sheet of quote-like entities.
* 2.8 2009-12-22 export table on HTML 5 entities. Now import csv file rather than embed entity facts.
* 2.9 2010-01-29 export XHTML entities (currently same as HTML-4 entities).
* 3.0 2011-02-10 rename to deEntify, delete deprecated methods.
* 3.1 2011-09-02 correct error in tables reversing Y dieresis and y dieresis. Correct upper/lower categories in table.
*/
package com.mindprod.entities;
import com.mindprod.common18.ST;
import java.util.HashMap;
/**
* Strips HTML entities such as {@code &quot;} from a string, replacing them by their Unicode equivalents.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.1 2011-09-02 correct error in tables reversing Y dieresis and y dieresis. Correct upper/lower categories
* in table.
* @see DeEntify
* @see DeEntifyStrings
* @see Entify
* @see EntifyStrings
* @see Flatten
* @since 2002-07-14
*/
public class DeEntifyStrings
{
/**
* Unicode non-breaking space (nbsp) character: decimal 160, which is hex 0xa0.
* NOTE: the "0x0a" suffix in this constant's name is a misnomer (0x0a is decimal 10, linefeed);
* the value itself is 160.
*/
@SuppressWarnings( { "WeakerAccess" } )
public static final char UNICODE_NBSP_160_0x0a = 160;
/**
* Longest an HTML4 entity can be, at least in our tables, including the lead & and trail ;.
* The longest HTML4 entity in the tables is {@code &thetasym;}, i.e. 10 chars.
* Note HTML4 longest entity is {@value #LONGEST_HTML4_ENTITY}.
* The literal must be spelled as the entity itself, not as the character it stands for,
* otherwise the computed length collapses to 1.
*
* @noinspection WeakerAccess, JavadocReference, WeakerAccess
*/
public static final int LONGEST_HTML4_ENTITY = "&thetasym;".length();
/**
* Longest an HTML5 entity can be, at least in our tables, including the lead & and trail ;.
* The longest HTML5 entity is {@code &CounterClockwiseContourIntegral;}, i.e. 33 chars.
* Note HTML5 longest entity is {@value #LONGEST_HTML5_ENTITY}.
* The literal must be spelled as the entity itself, not as the character it stands for,
* otherwise the computed length collapses to 1.
*
* @noinspection WeakerAccess, JavadocReference, WeakerAccess
*/
public static final int LONGEST_HTML5_ENTITY = "&CounterClockwiseContourIntegral;".length();
/**
* The shortest an entity can be {@value #SHORTEST_HTML4_ENTITY}, at least in our tables, including the lead & and
* trailing ;. Two-letter names such as {@code &lt;} and {@code &gt;} give this length: 2 letters plus the & and ;.
*
* @noinspection WeakerAccess, JavadocReference, WeakerAccess
*/
public static final int SHORTEST_HTML4_ENTITY = 4;/* &lt; */
/**
* The shortest an entity can be {@value #SHORTEST_HTML5_ENTITY}, at least in our tables, including the lead & and
* trailing ;. A two-letter name such as {@code &lt;} gives this length: 2 letters plus the & and ;.
*
* @noinspection WeakerAccess, JavadocReference, WeakerAccess
*/
public static final int SHORTEST_HTML5_ENTITY = 4;/* &lt; */
/**
* Compile-time switch: true to enable the testing code. Currently off.
*/
private static final boolean DEBUGGING = false;
/**
* Names of tags (bare, without angle brackets) that, when removed, should leave a space behind.
* Presumably this stops the text on either side of the removed tag from running together -- confirm against
* the code that consumes this list.
* All names are given in lower case. NOTE(review): check how the consumer treats upper-case tags.
*/
private static final String[] spacingTags = { "tr", "td", "th", "p", "br", "dl", "dt", "li" };
/**
* Allows lookup by bare entity name (no lead & and no trailing ;, e.g. "AElig") to get the corresponding char.
* Loaded by the static initializer from two hard-coded generated arrays burned into this class.
* The nbsp entity is deliberately left out of the map so it can be handled specially; apos is added
* in addition to the generated HTML4 names.
* Does not deal with HTML5 entities.
* Declared with explicit type arguments rather than as a raw HashMap so lookups are type-checked.
*/
private static final HashMap<String, Character> entityToChar;
static
{
    // Build the HashMap used to look up an entity name and get the corresponding Unicode
    // char number. The two tables below are parallel arrays: entityValues[ i ] is the char
    // for entityKeys[ i ]. Following code generated by Entities.
    final String[] entityKeys = {
        // W A R N I N G ! _ M A N U A L L Y _ I N S E R T E D _ G E N E R A T E D _ C O D E
        // generated by Entities. Insert from com\mindprod\entities\entitiesjustkeys.javafrag
        "AElig" /* 198 : Æ Latin capital letter AE */,
        "Aacute" /* 193 : Á Latin capital letter A with acute */,
        "Acirc" /* 194 : Â Latin capital letter A with circumflex */,
        "Agrave" /* 192 : À Latin capital letter A with grave */,
        "Alpha" /* 913 : Α Greek capital letter Alpha */,
        "Aring" /* 197 : Å Latin capital letter A with ring above */,
        "Atilde" /* 195 : Ã Latin capital letter A with tilde */,
        "Auml" /* 196 : Ä Latin capital letter A with diaeresis */,
        "Beta" /* 914 : Β Greek capital letter Beta */,
        "Ccedil" /* 199 : Ç Latin capital letter C with cedilla */,
        "Chi" /* 935 : Χ Greek capital letter Chi */,
        "Dagger" /* 8225 : ‡ double dagger */,
        "Delta" /* 916 : Δ Greek capital letter Delta */,
        "ETH" /* 208 : Ð Latin capital letter Eth */,
        "Eacute" /* 201 : É Latin capital letter E with acute */,
        "Ecirc" /* 202 : Ê Latin capital letter E with circumflex */,
        "Egrave" /* 200 : È Latin capital letter E with grave */,
        "Epsilon" /* 917 : Ε Greek capital letter Epsilon */,
        "Eta" /* 919 : Η Greek capital letter Eta */,
        "Euml" /* 203 : Ë Latin capital letter E with diaeresis */,
        "Gamma" /* 915 : Γ Greek capital letter Gamma */,
        "Iacute" /* 205 : Í Latin capital letter I with acute */,
        "Icirc" /* 206 : Î Latin capital letter I with circumflex */,
        "Igrave" /* 204 : Ì Latin capital letter I with grave */,
        "Iota" /* 921 : Ι Greek capital letter Iota */,
        "Iuml" /* 207 : Ï Latin capital letter I with diaeresis */,
        "Kappa" /* 922 : Κ Greek capital letter Kappa */,
        "Lambda" /* 923 : Λ Greek capital letter Lambda */,
        "Mu" /* 924 : Μ Greek capital letter Mu */,
        "Ntilde" /* 209 : Ñ Latin capital letter N with tilde */,
        "Nu" /* 925 : Ν Greek capital letter Nu */,
        "OElig" /* 338 : Œ Latin capital ligature oe */,
        "Oacute" /* 211 : Ó Latin capital letter O with acute */,
        "Ocirc" /* 212 : Ô Latin capital letter O with circumflex */,
        "Ograve" /* 210 : Ò Latin capital letter O with grave */,
        "Omega" /* 937 : Ω Greek capital letter Omega */,
        "Omicron" /* 927 : Ο Greek capital letter Omicron */,
        "Oslash" /* 216 : Ø Latin capital letter O with stroke */,
        "Otilde" /* 213 : Õ Latin capital letter O with tilde */,
        "Ouml" /* 214 : Ö Latin capital letter O with diaeresis */,
        "Phi" /* 934 : Φ Greek capital letter Phi */,
        "Pi" /* 928 : Π Greek capital letter Pi */,
        "Prime" /* 8243 : ″ double prime */,
        "Psi" /* 936 : Ψ Greek capital letter Psi */,
        "Rho" /* 929 : Ρ Greek capital letter Rho */,
        "Scaron" /* 352 : Š Latin capital letter S with caron */,
        "Sigma" /* 931 : Σ Greek capital letter Sigma */,
        "THORN" /* 222 : Þ Latin capital letter Thorn */,
        "Tau" /* 932 : Τ Greek capital letter Tau */,
        "Theta" /* 920 : Θ Greek capital letter Theta */,
        "Uacute" /* 218 : Ú Latin capital letter U with acute */,
        "Ucirc" /* 219 : Û Latin capital letter U with circumflex */,
        "Ugrave" /* 217 : Ù Latin capital letter U with grave */,
        "Upsilon" /* 933 : Υ Greek capital letter Upsilon */,
        "Uuml" /* 220 : Ü Latin capital letter U with diaeresis */,
        "Xi" /* 926 : Ξ Greek capital letter Xi */,
        "Yacute" /* 221 : Ý Latin capital letter Y with acute */,
        "Yuml" /* 376 : Ÿ Latin capital letter Y with diaeresis */,
        "Zeta" /* 918 : Ζ Greek capital letter Zeta */,
        "aacute" /* 225 : á Latin small letter a with acute */,
        "acirc" /* 226 : â Latin small letter a with circumflex */,
        "acute" /* 180 : ´ acute accent */,
        "aelig" /* 230 : æ Latin lowercase ligature ae */,
        "agrave" /* 224 : à Latin small letter a with grave */,
        "alefsym" /* 8501 : ℵ alef symbol */,
        "alpha" /* 945 : α Greek small letter alpha */,
        "amp" /* 38 : & ampersand */,
        "and" /* 8743 : ∧ logical and */,
        "ang" /* 8736 : ∠ angle */,
        "aring" /* 229 : å Latin small letter a with ring above */,
        "asymp" /* 8776 : ≈ asymptotic to */,
        "atilde" /* 227 : ã Latin small letter a with tilde */,
        "auml" /* 228 : ä Latin small letter a with diaeresis */,
        "bdquo" /* 8222 : „ double low-99 quotation mark */,
        "beta" /* 946 : β Greek small letter beta */,
        "brvbar" /* 166 : ¦ broken bar */,
        "bull" /* 8226 : • bullet */,
        "cap" /* 8745 : ∩ intersection */,
        "ccedil" /* 231 : ç Latin small letter c with cedilla */,
        "cedil" /* 184 : ¸ cedilla */,
        "cent" /* 162 : ¢ cent sign */,
        "chi" /* 967 : χ Greek small letter chi */,
        "circ" /* 710 : ˆ modifier letter circumflex accent */,
        "clubs" /* 9827 : ♣ black club suit */,
        "cong" /* 8773 : ≅ congruent to */,
        "copy" /* 169 : © copyright sign circled c */,
        "crarr" /* 8629 : ↵ downwards arrow with corner leftwards */,
        "cup" /* 8746 : ∪ union */,
        "curren" /* 164 : ¤ currency sign */,
        "dArr" /* 8659 : ⇓ downwards double arrow */,
        "dagger" /* 8224 : † dagger */,
        "darr" /* 8595 : ↓ downwards arrow */,
        "deg" /* 176 : ° degree sign */,
        "delta" /* 948 : δ Greek small letter delta */,
        "diams" /* 9830 : ♦ black diamond suit */,
        "divide" /* 247 : ÷ division sign */,
        "eacute" /* 233 : é Latin small letter e with acute */,
        "ecirc" /* 234 : ê Latin small letter e with circumflex */,
        "egrave" /* 232 : è Latin small letter e with grave */,
        "empty" /* 8709 : ∅ empty set */,
        "emsp" /* 8195 : em space */,
        "ensp" /* 8194 : en space */,
        "epsilon" /* 949 : ε Greek small letter epsilon */,
        "equiv" /* 8801 : ≡ identical to */,
        "eta" /* 951 : η Greek small letter eta */,
        "eth" /* 240 : ð Latin small letter eth */,
        "euml" /* 235 : ë Latin small letter e with diaeresis */,
        "euro" /* 8364 : € Euro currency sign */,
        "exist" /* 8707 : ∃ there exists */,
        "fnof" /* 402 : ƒ Latin small letter f with hook */,
        "forall" /* 8704 : ∀ for all */,
        "frac12" /* 189 : ½ vulgar fraction 1/2 */,
        "frac14" /* 188 : ¼ vulgar fraction 1/4 */,
        "frac34" /* 190 : ¾ vulgar fraction 3/4 */,
        "frasl" /* 8260 : ⁄ fraction slash */,
        "gamma" /* 947 : γ Greek small letter gamma */,
        "ge" /* 8805 : ≥ greater-than or equal to */,
        "gt" /* 62 : > greater-than sign */,
        "hArr" /* 8660 : ⇔ left right double arrow */,
        "harr" /* 8596 : ↔ left right arrow */,
        "hearts" /* 9829 : ♥ black heart suit */,
        "hellip" /* 8230 : … horizontal ellipsis */,
        "iacute" /* 237 : í Latin small letter i with acute */,
        "icirc" /* 238 : î Latin small letter i with circumflex */,
        "iexcl" /* 161 : ¡ inverted exclamation mark */,
        "igrave" /* 236 : ì Latin small letter i with grave */,
        "image" /* 8465 : ℑ black-letter capital i */,
        "infin" /* 8734 : ∞ infinity */,
        "int" /* 8747 : ∫ integral */,
        "iota" /* 953 : ι Greek small letter iota */,
        "iquest" /* 191 : ¿ inverted question mark */,
        "isin" /* 8712 : ∈ element of */,
        "iuml" /* 239 : ï Latin small letter i with diaeresis */,
        "kappa" /* 954 : κ Greek small letter kappa */,
        "lArr" /* 8656 : ⇐ leftwards double arrow */,
        "lambda" /* 955 : λ Greek small letter lambda */,
        "lang" /* 9001 : 〈 left-pointing angle bracket */,
        "laquo" /* 171 : « left guillemot */,
        "larr" /* 8592 : ← leftwards arrow */,
        "lceil" /* 8968 : ⌈ left ceiling */,
        "ldquo" /* 8220 : “ left double-66 quotation mark */,
        "le" /* 8804 : ≤ less-than or equal to */,
        "lfloor" /* 8970 : ⌊ left floor */,
        "lowast" /* 8727 : ∗ asterisk operator */,
        "loz" /* 9674 : ◊ open lozenge */,
        "lrm" /* 8206 : left-to-right mark */,
        "lsaquo" /* 8249 : ‹ single left-pointing angle quotation mark */,
        "lsquo" /* 8216 : ‘ left single-6 quotation mark */,
        "lt" /* 60 : < less-than sign */,
        "macr" /* 175 : ¯ macron */,
        "mdash" /* 8212 : — em dash */,
        "micro" /* 181 : µ micro sign */,
        "middot" /* 183 : · middle dot */,
        "minus" /* 8722 : − minus sign */,
        "mu" /* 956 : μ Greek small letter mu */,
        "nabla" /* 8711 : ∇ nabla */,
        "nbsp" /* 160 : non-breaking space */,
        "ndash" /* 8211 : – en dash */,
        "ne" /* 8800 : ≠ not equal to */,
        "ni" /* 8715 : ∋ like backwards epsilon */,
        "not" /* 172 : ¬ not sign */,
        "notin" /* 8713 : ∉ not an element of */,
        "nsub" /* 8836 : ⊄ not a subset of */,
        "ntilde" /* 241 : ñ Latin small letter n with tilde */,
        "nu" /* 957 : ν Greek small letter nu */,
        "oacute" /* 243 : ó Latin small letter o with acute */,
        "ocirc" /* 244 : ô Latin small letter o with circumflex */,
        "oelig" /* 339 : œ Latin small ligature oe */,
        "ograve" /* 242 : ò Latin small letter o with grave */,
        "oline" /* 8254 : ‾ overline */,
        "omega" /* 969 : ω Greek small letter omega */,
        "omicron" /* 959 : ο Greek small letter omicron */,
        "oplus" /* 8853 : ⊕ circled plus */,
        "or" /* 8744 : ∨ vee */,
        "ordf" /* 170 : ª feminine ordinal indicator */,
        "ordm" /* 186 : º masculine ordinal indicator */,
        "oslash" /* 248 : ø Latin small letter o with stroke */,
        "otilde" /* 245 : õ Latin small letter o with tilde */,
        "otimes" /* 8855 : ⊗ circled times */,
        "ouml" /* 246 : ö Latin small letter o with diaeresis */,
        "para" /* 182 : ¶ pilcrow sign */,
        "part" /* 8706 : ∂ partial differential */,
        "permil" /* 8240 : ‰ per mille sign */,
        "perp" /* 8869 : ⊥ up tack */,
        "phi" /* 966 : φ Greek small letter phi */,
        "pi" /* 960 : π Greek small letter pi */,
        "piv" /* 982 : ϖ Greek pi symbol */,
        "plusmn" /* 177 : ± plus-minus sign */,
        "pound" /* 163 : £ pound sign */,
        "prime" /* 8242 : ′ prime */,
        "prod" /* 8719 : ∏ n-ary product */,
        "prop" /* 8733 : ∝ proportional to */,
        "psi" /* 968 : ψ Greek small letter psi */,
        "quot" /* 34 : " quotation mark */,
        "rArr" /* 8658 : ⇒ rightwards double arrow */,
        "radic" /* 8730 : √ square root */,
        "rang" /* 9002 : 〉 right-pointing angle bracket */,
        "raquo" /* 187 : » right guillemot */,
        "rarr" /* 8594 : → rightwards arrow */,
        "rceil" /* 8969 : ⌉ right ceiling */,
        "rdquo" /* 8221 : ” right double-99 quotation mark */,
        "real" /* 8476 : ℜ black-letter capital r */,
        "reg" /* 174 : ® registered sign. circled R. */,
        "rfloor" /* 8971 : ⌋ right floor */,
        "rho" /* 961 : ρ Greek small letter rho */,
        "rlm" /* 8207 : right-to-left mark */,
        "rsaquo" /* 8250 : › single right-pointing angle quotation mark */,
        "rsquo" /* 8217 : ’ right single-9 quotation mark */,
        "sbquo" /* 8218 : ‚ single low-9 quotation mark */,
        "scaron" /* 353 : š Latin small letter s with caron */,
        "sdot" /* 8901 : ⋅ dot operator */,
        "sect" /* 167 : § section sign */,
        "shy" /* 173 : soft hyphen */,
        "sigma" /* 963 : σ Greek small letter sigma */,
        "sigmaf" /* 962 : ς Greek small letter final sigma */,
        "sim" /* 8764 : ∼ tilde operator */,
        "spades" /* 9824 : ♠ black spade suit */,
        "sub" /* 8834 : ⊂ subset of */,
        "sube" /* 8838 : ⊆ subset of or equal to */,
        "sum" /* 8721 : ∑ n-ary summation */,
        "sup1" /* 185 : ¹ superscript one */,
        "sup2" /* 178 : ² superscript two */,
        "sup3" /* 179 : ³ superscript three */,
        "sup" /* 8835 : ⊃ superset of */,
        "supe" /* 8839 : ⊇ superset of or equal to */,
        "szlig" /* 223 : ß Latin small letter sharp s */,
        "tau" /* 964 : τ Greek small letter tau */,
        "there4" /* 8756 : ∴ therefore three dots */,
        "theta" /* 952 : θ Greek small letter theta */,
        "thetasym" /* 977 : ϑ Greek theta symbol */,
        "thinsp" /* 8201 : thin space */,
        "thorn" /* 254 : þ Latin small letter thorn */,
        "tilde" /* 732 : ˜ small tilde */,
        "times" /* 215 : × multiplication sign */,
        "trade" /* 8482 : ™ trademark sign */,
        "uArr" /* 8657 : ⇑ upwards double arrow */,
        "uacute" /* 250 : ú Latin small letter u with acute */,
        "uarr" /* 8593 : ↑ upwards arrow */,
        "ucirc" /* 251 : û Latin small letter u with circumflex */,
        "ugrave" /* 249 : ù Latin small letter u with grave */,
        "uml" /* 168 : ¨ diaeresis */,
        "upsih" /* 978 : ϒ Greek upsilon with hook symbol */,
        "upsilon" /* 965 : υ Greek small letter upsilon */,
        "uuml" /* 252 : ü Latin small letter u with diaeresis */,
        "weierp" /* 8472 : ℘ script capital p */,
        "xi" /* 958 : ξ Greek small letter xi */,
        "yacute" /* 253 : ý Latin small letter y with acute */,
        "yen" /* 165 : ¥ yen sign */,
        "yuml" /* 255 : ÿ Latin small letter y with diaeresis */,
        "zeta" /* 950 : ζ Greek small letter zeta */,
        "zwj" /* 8205 : zero width joiner */,
        "zwnj" /* 8204 : zero width non-joiner */,
    };
    final char[] entityValues = {
        // W A R N I N G ! _ M A N U A L L Y _ I N S E R T E D _ G E N E R A T E D _ C O D E
        // generated by Entities. Insert from com\mindprod\entities\entitiesjustkeys.javafrag
        198 /* &AElig; : Æ Latin capital letter AE */,
        193 /* &Aacute; : Á Latin capital letter A with acute */,
        194 /* &Acirc; : Â Latin capital letter A with circumflex */,
        192 /* &Agrave; : À Latin capital letter A with grave */,
        913 /* &Alpha; : Α Greek capital letter Alpha */,
        197 /* &Aring; : Å Latin capital letter A with ring above */,
        195 /* &Atilde; : Ã Latin capital letter A with tilde */,
        196 /* &Auml; : Ä Latin capital letter A with diaeresis */,
        914 /* &Beta; : Β Greek capital letter Beta */,
        199 /* &Ccedil; : Ç Latin capital letter C with cedilla */,
        935 /* &Chi; : Χ Greek capital letter Chi */,
        8225 /* &Dagger; : ‡ double dagger */,
        916 /* &Delta; : Δ Greek capital letter Delta */,
        208 /* &ETH; : Ð Latin capital letter Eth */,
        201 /* &Eacute; : É Latin capital letter E with acute */,
        202 /* &Ecirc; : Ê Latin capital letter E with circumflex */,
        200 /* &Egrave; : È Latin capital letter E with grave */,
        917 /* &Epsilon; : Ε Greek capital letter Epsilon */,
        919 /* &Eta; : Η Greek capital letter Eta */,
        203 /* &Euml; : Ë Latin capital letter E with diaeresis */,
        915 /* &Gamma; : Γ Greek capital letter Gamma */,
        205 /* &Iacute; : Í Latin capital letter I with acute */,
        206 /* &Icirc; : Î Latin capital letter I with circumflex */,
        204 /* &Igrave; : Ì Latin capital letter I with grave */,
        921 /* &Iota; : Ι Greek capital letter Iota */,
        207 /* &Iuml; : Ï Latin capital letter I with diaeresis */,
        922 /* &Kappa; : Κ Greek capital letter Kappa */,
        923 /* &Lambda; : Λ Greek capital letter Lambda */,
        924 /* &Mu; : Μ Greek capital letter Mu */,
        209 /* &Ntilde; : Ñ Latin capital letter N with tilde */,
        925 /* &Nu; : Ν Greek capital letter Nu */,
        338 /* &OElig; : Œ Latin capital ligature oe */,
        211 /* &Oacute; : Ó Latin capital letter O with acute */,
        212 /* &Ocirc; : Ô Latin capital letter O with circumflex */,
        210 /* &Ograve; : Ò Latin capital letter O with grave */,
        937 /* &Omega; : Ω Greek capital letter Omega */,
        927 /* &Omicron; : Ο Greek capital letter Omicron */,
        216 /* &Oslash; : Ø Latin capital letter O with stroke */,
        213 /* &Otilde; : Õ Latin capital letter O with tilde */,
        214 /* &Ouml; : Ö Latin capital letter O with diaeresis */,
        934 /* &Phi; : Φ Greek capital letter Phi */,
        928 /* &Pi; : Π Greek capital letter Pi */,
        8243 /* &Prime; : ″ double prime */,
        936 /* &Psi; : Ψ Greek capital letter Psi */,
        929 /* &Rho; : Ρ Greek capital letter Rho */,
        352 /* &Scaron; : Š Latin capital letter S with caron */,
        931 /* &Sigma; : Σ Greek capital letter Sigma */,
        222 /* &THORN; : Þ Latin capital letter Thorn */,
        932 /* &Tau; : Τ Greek capital letter Tau */,
        920 /* &Theta; : Θ Greek capital letter Theta */,
        218 /* &Uacute; : Ú Latin capital letter U with acute */,
        219 /* &Ucirc; : Û Latin capital letter U with circumflex */,
        217 /* &Ugrave; : Ù Latin capital letter U with grave */,
        933 /* &Upsilon; : Υ Greek capital letter Upsilon */,
        220 /* &Uuml; : Ü Latin capital letter U with diaeresis */,
        926 /* &Xi; : Ξ Greek capital letter Xi */,
        221 /* &Yacute; : Ý Latin capital letter Y with acute */,
        376 /* &Yuml; : Ÿ Latin capital letter Y with diaeresis */,
        918 /* &Zeta; : Ζ Greek capital letter Zeta */,
        225 /* &aacute; : á Latin small letter a with acute */,
        226 /* &acirc; : â Latin small letter a with circumflex */,
        180 /* &acute; : ´ acute accent */,
        230 /* &aelig; : æ Latin lowercase ligature ae */,
        224 /* &agrave; : à Latin small letter a with grave */,
        8501 /* &alefsym; : ℵ alef symbol */,
        945 /* &alpha; : α Greek small letter alpha */,
        38 /* &amp; : & ampersand */,
        8743 /* &and; : ∧ logical and */,
        8736 /* &ang; : ∠ angle */,
        229 /* &aring; : å Latin small letter a with ring above */,
        8776 /* &asymp; : ≈ asymptotic to */,
        227 /* &atilde; : ã Latin small letter a with tilde */,
        228 /* &auml; : ä Latin small letter a with diaeresis */,
        8222 /* &bdquo; : „ double low-99 quotation mark */,
        946 /* &beta; : β Greek small letter beta */,
        166 /* &brvbar; : ¦ broken bar */,
        8226 /* &bull; : • bullet */,
        8745 /* &cap; : ∩ intersection */,
        231 /* &ccedil; : ç Latin small letter c with cedilla */,
        184 /* &cedil; : ¸ cedilla */,
        162 /* &cent; : ¢ cent sign */,
        967 /* &chi; : χ Greek small letter chi */,
        710 /* &circ; : ˆ modifier letter circumflex accent */,
        9827 /* &clubs; : ♣ black club suit */,
        8773 /* &cong; : ≅ congruent to */,
        169 /* &copy; : © copyright sign circled c */,
        8629 /* &crarr; : ↵ downwards arrow with corner leftwards */,
        8746 /* &cup; : ∪ union */,
        164 /* &curren; : ¤ currency sign */,
        8659 /* &dArr; : ⇓ downwards double arrow */,
        8224 /* &dagger; : † dagger */,
        8595 /* &darr; : ↓ downwards arrow */,
        176 /* &deg; : ° degree sign */,
        948 /* &delta; : δ Greek small letter delta */,
        9830 /* &diams; : ♦ black diamond suit */,
        247 /* &divide; : ÷ division sign */,
        233 /* &eacute; : é Latin small letter e with acute */,
        234 /* &ecirc; : ê Latin small letter e with circumflex */,
        232 /* &egrave; : è Latin small letter e with grave */,
        8709 /* &empty; : ∅ empty set */,
        8195 /* &emsp; : em space */,
        8194 /* &ensp; : en space */,
        949 /* &epsilon; : ε Greek small letter epsilon */,
        8801 /* &equiv; : ≡ identical to */,
        951 /* &eta; : η Greek small letter eta */,
        240 /* &eth; : ð Latin small letter eth */,
        235 /* &euml; : ë Latin small letter e with diaeresis */,
        8364 /* &euro; : € Euro currency sign */,
        8707 /* &exist; : ∃ there exists */,
        402 /* &fnof; : ƒ Latin small letter f with hook */,
        8704 /* &forall; : ∀ for all */,
        189 /* &frac12; : ½ vulgar fraction 1/2 */,
        188 /* &frac14; : ¼ vulgar fraction 1/4 */,
        190 /* &frac34; : ¾ vulgar fraction 3/4 */,
        8260 /* &frasl; : ⁄ fraction slash */,
        947 /* &gamma; : γ Greek small letter gamma */,
        8805 /* &ge; : ≥ greater-than or equal to */,
        62 /* &gt; : > greater-than sign */,
        8660 /* &hArr; : ⇔ left right double arrow */,
        8596 /* &harr; : ↔ left right arrow */,
        9829 /* &hearts; : ♥ black heart suit */,
        8230 /* &hellip; : … horizontal ellipsis */,
        237 /* &iacute; : í Latin small letter i with acute */,
        238 /* &icirc; : î Latin small letter i with circumflex */,
        161 /* &iexcl; : ¡ inverted exclamation mark */,
        236 /* &igrave; : ì Latin small letter i with grave */,
        8465 /* &image; : ℑ black-letter capital i */,
        8734 /* &infin; : ∞ infinity */,
        8747 /* &int; : ∫ integral */,
        953 /* &iota; : ι Greek small letter iota */,
        191 /* &iquest; : ¿ inverted question mark */,
        8712 /* &isin; : ∈ element of */,
        239 /* &iuml; : ï Latin small letter i with diaeresis */,
        954 /* &kappa; : κ Greek small letter kappa */,
        8656 /* &lArr; : ⇐ leftwards double arrow */,
        955 /* &lambda; : λ Greek small letter lambda */,
        9001 /* &lang; : 〈 left-pointing angle bracket */,
        171 /* &laquo; : « left guillemot */,
        8592 /* &larr; : ← leftwards arrow */,
        8968 /* &lceil; : ⌈ left ceiling */,
        8220 /* &ldquo; : “ left double-66 quotation mark */,
        8804 /* &le; : ≤ less-than or equal to */,
        8970 /* &lfloor; : ⌊ left floor */,
        8727 /* &lowast; : ∗ asterisk operator */,
        9674 /* &loz; : ◊ open lozenge */,
        8206 /* &lrm; : left-to-right mark */,
        8249 /* &lsaquo; : ‹ single left-pointing angle quotation mark */,
        8216 /* &lsquo; : ‘ left single-6 quotation mark */,
        60 /* &lt; : < less-than sign */,
        175 /* &macr; : ¯ macron */,
        8212 /* &mdash; : — em dash */,
        181 /* &micro; : µ micro sign */,
        183 /* &middot; : · middle dot */,
        8722 /* &minus; : − minus sign */,
        956 /* &mu; : μ Greek small letter mu */,
        8711 /* &nabla; : ∇ nabla */,
        160 /* &nbsp; : non-breaking space */,
        8211 /* &ndash; : – en dash */,
        8800 /* &ne; : ≠ not equal to */,
        8715 /* &ni; : ∋ like backwards epsilon */,
        172 /* &not; : ¬ not sign */,
        8713 /* &notin; : ∉ not an element of */,
        8836 /* &nsub; : ⊄ not a subset of */,
        241 /* &ntilde; : ñ Latin small letter n with tilde */,
        957 /* &nu; : ν Greek small letter nu */,
        243 /* &oacute; : ó Latin small letter o with acute */,
        244 /* &ocirc; : ô Latin small letter o with circumflex */,
        339 /* &oelig; : œ Latin small ligature oe */,
        242 /* &ograve; : ò Latin small letter o with grave */,
        8254 /* &oline; : ‾ overline */,
        969 /* &omega; : ω Greek small letter omega */,
        959 /* &omicron; : ο Greek small letter omicron */,
        8853 /* &oplus; : ⊕ circled plus */,
        8744 /* &or; : ∨ vee */,
        170 /* &ordf; : ª feminine ordinal indicator */,
        186 /* &ordm; : º masculine ordinal indicator */,
        248 /* &oslash; : ø Latin small letter o with stroke */,
        245 /* &otilde; : õ Latin small letter o with tilde */,
        8855 /* &otimes; : ⊗ circled times */,
        246 /* &ouml; : ö Latin small letter o with diaeresis */,
        182 /* &para; : ¶ pilcrow sign */,
        8706 /* &part; : ∂ partial differential */,
        8240 /* &permil; : ‰ per mille sign */,
        8869 /* &perp; : ⊥ up tack */,
        966 /* &phi; : φ Greek small letter phi */,
        960 /* &pi; : π Greek small letter pi */,
        982 /* &piv; : ϖ Greek pi symbol */,
        177 /* &plusmn; : ± plus-minus sign */,
        163 /* &pound; : £ pound sign */,
        8242 /* &prime; : ′ prime */,
        8719 /* &prod; : ∏ n-ary product */,
        8733 /* &prop; : ∝ proportional to */,
        968 /* &psi; : ψ Greek small letter psi */,
        34 /* &quot; : " quotation mark */,
        8658 /* &rArr; : ⇒ rightwards double arrow */,
        8730 /* &radic; : √ square root */,
        9002 /* &rang; : 〉 right-pointing angle bracket */,
        187 /* &raquo; : » right guillemot */,
        8594 /* &rarr; : → rightwards arrow */,
        8969 /* &rceil; : ⌉ right ceiling */,
        8221 /* &rdquo; : ” right double-99 quotation mark */,
        8476 /* &real; : ℜ black-letter capital r */,
        174 /* &reg; : ® registered sign. circled R. */,
        8971 /* &rfloor; : ⌋ right floor */,
        961 /* &rho; : ρ Greek small letter rho */,
        8207 /* &rlm; : right-to-left mark */,
        8250 /* &rsaquo; : › single right-pointing angle quotation mark */,
        8217 /* &rsquo; : ’ right single-9 quotation mark */,
        8218 /* &sbquo; : ‚ single low-9 quotation mark */,
        353 /* &scaron; : š Latin small letter s with caron */,
        8901 /* &sdot; : ⋅ dot operator */,
        167 /* &sect; : § section sign */,
        173 /* &shy; : soft hyphen */,
        963 /* &sigma; : σ Greek small letter sigma */,
        962 /* &sigmaf; : ς Greek small letter final sigma */,
        8764 /* &sim; : ∼ tilde operator */,
        9824 /* &spades; : ♠ black spade suit */,
        8834 /* &sub; : ⊂ subset of */,
        8838 /* &sube; : ⊆ subset of or equal to */,
        8721 /* &sum; : ∑ n-ary summation */,
        185 /* &sup1; : ¹ superscript one */,
        178 /* &sup2; : ² superscript two */,
        179 /* &sup3; : ³ superscript three */,
        8835 /* &sup; : ⊃ superset of */,
        8839 /* &supe; : ⊇ superset of or equal to */,
        223 /* &szlig; : ß Latin small letter sharp s */,
        964 /* &tau; : τ Greek small letter tau */,
        8756 /* &there4; : ∴ therefore three dots */,
        952 /* &theta; : θ Greek small letter theta */,
        977 /* &thetasym; : ϑ Greek theta symbol */,
        8201 /* &thinsp; : thin space */,
        254 /* &thorn; : þ Latin small letter thorn */,
        732 /* &tilde; : ˜ small tilde */,
        215 /* &times; : × multiplication sign */,
        8482 /* &trade; : ™ trademark sign */,
        8657 /* &uArr; : ⇑ upwards double arrow */,
        250 /* &uacute; : ú Latin small letter u with acute */,
        8593 /* &uarr; : ↑ upwards arrow */,
        251 /* &ucirc; : û Latin small letter u with circumflex */,
        249 /* &ugrave; : ù Latin small letter u with grave */,
        168 /* &uml; : ¨ diaeresis */,
        978 /* &upsih; : ϒ Greek upsilon with hook symbol */,
        965 /* &upsilon; : υ Greek small letter upsilon */,
        252 /* &uuml; : ü Latin small letter u with diaeresis */,
        8472 /* &weierp; : ℘ script capital p */,
        958 /* &xi; : ξ Greek small letter xi */,
        253 /* &yacute; : ý Latin small letter y with acute */,
        165 /* &yen; : ¥ yen sign */,
        255 /* &yuml; : ÿ Latin small letter y with diaeresis */,
        950 /* &zeta; : ζ Greek small letter zeta */,
        8205 /* &zwj; : zero width joiner */,
        8204 /* &zwnj; : zero width non-joiner */,
    };
    // The tables are pasted in by hand from generated fragments, so fail loudly
    // if they ever get out of step instead of silently mis-pairing names and chars.
    if ( entityKeys.length != entityValues.length )
    {
        throw new IllegalStateException( "entity tables out of sync: "
                                         + entityKeys.length
                                         + " keys vs "
                                         + entityValues.length
                                         + " values" );
    }
    // allow 50% extra space for faster lookup.
    entityToChar = new HashMap<>( entityKeys.length * 150 / 100 );
    for ( int i = 0; i < entityKeys.length; i++ )
    {
        // leave out nbsp so it can be specially handled if entity not found.
        if ( !"nbsp".equals( entityKeys[ i ] ) )
        {
            entityToChar.put( entityKeys[ i ], entityValues[ i ] );
        }
    }
    // add also &apos; for strip but not insert. optional for XML, not used in HTML.
    // Done once, after the loop; it does not depend on the loop index.
    entityToChar.put( "apos", ( char ) 39 );
} // end static
/**
* Converts a candidate entity, already stripped of its lead &, to the char it stands for.
* The candidate must contain a terminating ; and may carry arbitrary text after it; only the text ahead of the
* first ; found at or beyond the earliest legal position is looked up.
* Does not deal with HTML5 entities.
*
* @param possBareEntityWithSemicolon string that may hold an entity. Lead & must be stripped,
*                                    but may optionally contain text past the ;
* @param translateNbspTo             char you would like nbsp translated to, usually ' ' or (char) 160;
*                                    handed on unchanged to bareHTMLEntityToChar.
*
* @return 0 if the string is too short to hold an entity or has no ; at or beyond the earliest legal position;
* otherwise whatever bareHTMLEntityToChar yields for the text ahead of that ;.
* @noinspection WeakerAccess
*/
protected static char possBareHTMLEntityWithSemicolonToChar( String possBareEntityWithSemicolon,
                                                             char translateNbspTo )
{
    // Earliest offset the terminating ; may occupy: the shortest entity less its lead & and trailing ;.
    final int earliestSemi = SHORTEST_HTML4_ENTITY - 2;
    // too short to hold even the shortest name plus its ;
    if ( possBareEntityWithSemicolon.length() <= earliestSemi )
    {
        return 0;
    }
    // find the trailing ;
    final int semiAt = possBareEntityWithSemicolon.indexOf( ';', earliestSemi );
    if ( semiAt >= earliestSemi )
    {
        final String bareEntity = possBareEntityWithSemicolon.substring( 0, semiAt );
        return bareHTMLEntityToChar( bareEntity, translateNbspTo );
    }
    // no terminating ; found
    return 0;
}
/**
* Prepares tags for removal, to ensure they are replaced by a space
* |
|
---|
| --> _ ' ' )
{
// insert space before <
sb.append( ' ' );
}
break;
}
}
}
sb.append( c );
prevChar = c;
}
return sb.toString();
}
/**
* remove all text between <applet.. </applet>, <style... </style> <script... </script>
*
* @param s HTML string to strip tag pairs out of.
*
* @return string with tag pairs stripped out.
*/
private static String stripHTMLTagPairs( String s )
{
String[] tags =
{ "applet", "APPLET", "style", "STYLE", "script", "SCRIPT" };
for ( final String tag : tags )
{
final String beginTag = "<" + tag;
final String endTag = "" + tag + ">";
int begin = 0;
while ( begin < s.length()
&& ( begin = s.indexOf( beginTag, begin ) ) >= 0 )
{
final int end;
if ( ( end = s.indexOf( endTag, begin + beginTag.length() ) )
> 0 )
{
// chop out the