/*
* [EntifyStrings.java]
*
* Summary: Inserts HTML entities such as &quot; into a String replacing the single character equivalents.
*
* Copyright: (c) 2005-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 2.6 2009-04-05 StripEntities now leaves a space behind when it removes a
*                tag such as <br>, <p> or <td>.
* 2.7 2009-11-14 generate a table for the HTML cheat sheet of quote-like entities.
* 2.8 2009-12-22 export table on HTML 5 entities. Now import csv file rather than embed entity facts.
* 2.9 2010-01-29 export XHTML entities (currently same as HTML-4 entities).
* 3.0 2011-01-05 remove deprecated methods. add toHTMLEntity, toXMLEntity, hide charToHTMLEntity, charToXMLEntity
* 3.1 2011-02-10 rename methods to htmlEntify and xmlEntify
*/
package com.mindprod.entities;
/**
 * Inserts HTML entities such as {@code &quot;} into a String, replacing the single character equivalents.
 * <p>
 * Entities are coded inline in a giant switch. ASCII characters below 127 other than
 * {@code " & < >} pass through unchanged; every other character becomes either a named
 * HTML-4 entity or a numeric character reference.
 * <p>
 * The hex/decimal preference is a single static setting shared by all callers of this class;
 * it is not synchronized.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.1 2011-02-10 rename methods to entifyHTML and entifyXML
 * @see DeEntify
 * @see DeEntifyStrings
 * @see Entify
 * @see EntifyStrings
 * @see Flatten
 * @since 2005
 */
public class EntifyStrings
{
    /**
     * Should we generate hex numeric entities in preference to decimal ones?
     * Only affects HTML output for characters that have no named entity.
     */
    private static boolean preferHexEntities = true;

    /**
     * Build a numeric character reference for a code point.
     *
     * @param codePoint Unicode code point to encode.
     * @param hex       true for the hexadecimal form, false for the decimal form.
     *
     * @return the reference, compact, with no leading zeros, e.g. {@code &#xa0;} or {@code &#160;}.
     */
    private static String numericEntity( int codePoint, boolean hex )
    {
        return hex
               ? "&#x" + Integer.toHexString( codePoint ) + ";"
               : "&#" + Integer.toString( codePoint ) + ";";
    }

    /**
     * Convert a single char to its equivalent HTML entity. Ordinary chars are not changed.
     * Chars with an HTML-4 name get the named entity, e.g. 160 becomes {@code &nbsp;};
     * other chars from 127 up get a numeric reference (hex or decimal per the current preference).
     *
     * @param c Char to convert
     *
     * @return equivalent string e.g. {@code &amp;}, null means leave char as is.
     */
    private static String charToHTMLEntity( char c )
    {
        // HTML-4 entity table, sorted numerically by code point.
        // Could be more efficiently handled with a lookup table.
        switch ( c )
        {
            case 34: return "&quot;"; // quotation mark
            case 38: return "&amp;"; // ampersand
            case 60: return "&lt;"; // less-than sign
            case 62: return "&gt;"; // greater-than sign
            case 160: return "&nbsp;"; // non-breaking space
            case 161: return "&iexcl;"; // inverted exclamation mark
            case 162: return "&cent;"; // cent sign
            case 163: return "&pound;"; // pound sign
            case 164: return "&curren;"; // currency sign
            case 165: return "&yen;"; // yen sign
            case 166: return "&brvbar;"; // broken bar
            case 167: return "&sect;"; // section sign
            case 168: return "&uml;"; // diaeresis
            case 169: return "&copy;"; // copyright sign circled c
            case 170: return "&ordf;"; // feminine ordinal indicator
            case 171: return "&laquo;"; // left guillemot
            case 172: return "&not;"; // not sign
            case 173: return "&shy;"; // soft hyphen
            case 174: return "&reg;"; // registered sign. circled R.
            case 175: return "&macr;"; // macron
            case 176: return "&deg;"; // degree sign
            case 177: return "&plusmn;"; // plus-minus sign
            case 178: return "&sup2;"; // superscript two
            case 179: return "&sup3;"; // superscript three
            case 180: return "&acute;"; // acute accent
            case 181: return "&micro;"; // micro sign
            case 182: return "&para;"; // pilcrow sign
            case 183: return "&middot;"; // middle dot
            case 184: return "&cedil;"; // cedilla
            case 185: return "&sup1;"; // superscript one
            case 186: return "&ordm;"; // masculine ordinal indicator
            case 187: return "&raquo;"; // right guillemot
            case 188: return "&frac14;"; // vulgar fraction 1/4
            case 189: return "&frac12;"; // vulgar fraction 1/2
            case 190: return "&frac34;"; // vulgar fraction 3/4
            case 191: return "&iquest;"; // inverted question mark
            case 192: return "&Agrave;"; // Latin capital letter A with grave
            case 193: return "&Aacute;"; // Latin capital letter A with acute
            case 194: return "&Acirc;"; // Latin capital letter A with circumflex
            case 195: return "&Atilde;"; // Latin capital letter A with tilde
            case 196: return "&Auml;"; // Latin capital letter A with diaeresis
            case 197: return "&Aring;"; // Latin capital letter A with ring above
            case 198: return "&AElig;"; // Latin capital letter AE
            case 199: return "&Ccedil;"; // Latin capital letter C with cedilla
            case 200: return "&Egrave;"; // Latin capital letter E with grave
            case 201: return "&Eacute;"; // Latin capital letter E with acute
            case 202: return "&Ecirc;"; // Latin capital letter E with circumflex
            case 203: return "&Euml;"; // Latin capital letter E with diaeresis
            case 204: return "&Igrave;"; // Latin capital letter I with grave
            case 205: return "&Iacute;"; // Latin capital letter I with acute
            case 206: return "&Icirc;"; // Latin capital letter I with circumflex
            case 207: return "&Iuml;"; // Latin capital letter I with diaeresis
            case 208: return "&ETH;"; // Latin capital letter Eth
            case 209: return "&Ntilde;"; // Latin capital letter N with tilde
            case 210: return "&Ograve;"; // Latin capital letter O with grave
            case 211: return "&Oacute;"; // Latin capital letter O with acute
            case 212: return "&Ocirc;"; // Latin capital letter O with circumflex
            case 213: return "&Otilde;"; // Latin capital letter O with tilde
            case 214: return "&Ouml;"; // Latin capital letter O with diaeresis
            case 215: return "&times;"; // multiplication sign
            case 216: return "&Oslash;"; // Latin capital letter O with stroke
            case 217: return "&Ugrave;"; // Latin capital letter U with grave
            case 218: return "&Uacute;"; // Latin capital letter U with acute
            case 219: return "&Ucirc;"; // Latin capital letter U with circumflex
            case 220: return "&Uuml;"; // Latin capital letter U with diaeresis
            case 221: return "&Yacute;"; // Latin capital letter Y with acute
            case 222: return "&THORN;"; // Latin capital letter Thorn
            case 223: return "&szlig;"; // Latin small letter sharp s
            case 224: return "&agrave;"; // Latin small letter a with grave
            case 225: return "&aacute;"; // Latin small letter a with acute
            case 226: return "&acirc;"; // Latin small letter a with circumflex
            case 227: return "&atilde;"; // Latin small letter a with tilde
            case 228: return "&auml;"; // Latin small letter a with diaeresis
            case 229: return "&aring;"; // Latin small letter a with ring above
            case 230: return "&aelig;"; // Latin lowercase ligature ae
            case 231: return "&ccedil;"; // Latin small letter c with cedilla
            case 232: return "&egrave;"; // Latin small letter e with grave
            case 233: return "&eacute;"; // Latin small letter e with acute
            case 234: return "&ecirc;"; // Latin small letter e with circumflex
            case 235: return "&euml;"; // Latin small letter e with diaeresis
            case 236: return "&igrave;"; // Latin small letter i with grave
            case 237: return "&iacute;"; // Latin small letter i with acute
            case 238: return "&icirc;"; // Latin small letter i with circumflex
            case 239: return "&iuml;"; // Latin small letter i with diaeresis
            case 240: return "&eth;"; // Latin small letter eth
            case 241: return "&ntilde;"; // Latin small letter n with tilde
            case 242: return "&ograve;"; // Latin small letter o with grave
            case 243: return "&oacute;"; // Latin small letter o with acute
            case 244: return "&ocirc;"; // Latin small letter o with circumflex
            case 245: return "&otilde;"; // Latin small letter o with tilde
            case 246: return "&ouml;"; // Latin small letter o with diaeresis
            case 247: return "&divide;"; // division sign
            case 248: return "&oslash;"; // Latin small letter o with stroke
            case 249: return "&ugrave;"; // Latin small letter u with grave
            case 250: return "&uacute;"; // Latin small letter u with acute
            case 251: return "&ucirc;"; // Latin small letter u with circumflex
            case 252: return "&uuml;"; // Latin small letter u with diaeresis
            case 253: return "&yacute;"; // Latin small letter y with acute
            case 254: return "&thorn;"; // Latin small letter thorn
            case 255: return "&yuml;"; // Latin small letter y with diaeresis
            case 338: return "&OElig;"; // Latin capital ligature oe
            case 339: return "&oelig;"; // Latin small ligature oe
            case 352: return "&Scaron;"; // Latin capital letter S with caron
            case 353: return "&scaron;"; // Latin small letter s with caron
            case 376: return "&Yuml;"; // Latin capital letter Y with diaeresis
            case 402: return "&fnof;"; // Latin small letter f with hook
            case 710: return "&circ;"; // modifier letter circumflex accent
            case 732: return "&tilde;"; // small tilde
            case 913: return "&Alpha;"; // Greek capital letter Alpha
            case 914: return "&Beta;"; // Greek capital letter Beta
            case 915: return "&Gamma;"; // Greek capital letter Gamma
            case 916: return "&Delta;"; // Greek capital letter Delta
            case 917: return "&Epsilon;"; // Greek capital letter Epsilon
            case 918: return "&Zeta;"; // Greek capital letter Zeta
            case 919: return "&Eta;"; // Greek capital letter Eta
            case 920: return "&Theta;"; // Greek capital letter Theta
            case 921: return "&Iota;"; // Greek capital letter Iota
            case 922: return "&Kappa;"; // Greek capital letter Kappa
            case 923: return "&Lambda;"; // Greek capital letter Lambda
            case 924: return "&Mu;"; // Greek capital letter Mu
            case 925: return "&Nu;"; // Greek capital letter Nu
            case 926: return "&Xi;"; // Greek capital letter Xi
            case 927: return "&Omicron;"; // Greek capital letter Omicron
            case 928: return "&Pi;"; // Greek capital letter Pi
            case 929: return "&Rho;"; // Greek capital letter Rho
            case 931: return "&Sigma;"; // Greek capital letter Sigma
            case 932: return "&Tau;"; // Greek capital letter Tau
            case 933: return "&Upsilon;"; // Greek capital letter Upsilon
            case 934: return "&Phi;"; // Greek capital letter Phi
            case 935: return "&Chi;"; // Greek capital letter Chi
            case 936: return "&Psi;"; // Greek capital letter Psi
            case 937: return "&Omega;"; // Greek capital letter Omega
            case 945: return "&alpha;"; // Greek small letter alpha
            case 946: return "&beta;"; // Greek small letter beta
            case 947: return "&gamma;"; // Greek small letter gamma
            case 948: return "&delta;"; // Greek small letter delta
            case 949: return "&epsilon;"; // Greek small letter epsilon
            case 950: return "&zeta;"; // Greek small letter zeta
            case 951: return "&eta;"; // Greek small letter eta
            case 952: return "&theta;"; // Greek small letter theta
            case 953: return "&iota;"; // Greek small letter iota
            case 954: return "&kappa;"; // Greek small letter kappa
            case 955: return "&lambda;"; // Greek small letter lambda
            case 956: return "&mu;"; // Greek small letter mu
            case 957: return "&nu;"; // Greek small letter nu
            case 958: return "&xi;"; // Greek small letter xi
            case 959: return "&omicron;"; // Greek small letter omicron
            case 960: return "&pi;"; // Greek small letter pi
            case 961: return "&rho;"; // Greek small letter rho
            case 962: return "&sigmaf;"; // Greek small letter final sigma
            case 963: return "&sigma;"; // Greek small letter sigma
            case 964: return "&tau;"; // Greek small letter tau
            case 965: return "&upsilon;"; // Greek small letter upsilon
            case 966: return "&phi;"; // Greek small letter phi
            case 967: return "&chi;"; // Greek small letter chi
            case 968: return "&psi;"; // Greek small letter psi
            case 969: return "&omega;"; // Greek small letter omega
            case 977: return "&thetasym;"; // Greek theta symbol
            case 978: return "&upsih;"; // Greek upsilon with hook symbol
            case 982: return "&piv;"; // Greek pi symbol
            case 8194: return "&ensp;"; // en space
            case 8195: return "&emsp;"; // em space
            case 8201: return "&thinsp;"; // thin space
            case 8204: return "&zwnj;"; // zero width non-joiner
            case 8205: return "&zwj;"; // zero width joiner
            case 8206: return "&lrm;"; // left-to-right mark
            case 8207: return "&rlm;"; // right-to-left mark
            case 8211: return "&ndash;"; // en dash
            case 8212: return "&mdash;"; // em dash
            case 8216: return "&lsquo;"; // left single-6 quotation mark
            case 8217: return "&rsquo;"; // right single-9 quotation mark
            case 8218: return "&sbquo;"; // single low-9 quotation mark
            case 8220: return "&ldquo;"; // left double-66 quotation mark
            case 8221: return "&rdquo;"; // right double-99 quotation mark
            case 8222: return "&bdquo;"; // double low-99 quotation mark
            case 8224: return "&dagger;"; // dagger
            case 8225: return "&Dagger;"; // double dagger
            case 8226: return "&bull;"; // bullet
            case 8230: return "&hellip;"; // horizontal ellipsis
            case 8240: return "&permil;"; // per mille sign
            case 8242: return "&prime;"; // prime
            case 8243: return "&Prime;"; // double prime
            case 8249: return "&lsaquo;"; // single left-pointing angle quotation mark
            case 8250: return "&rsaquo;"; // single right-pointing angle quotation mark
            case 8254: return "&oline;"; // overline
            case 8260: return "&frasl;"; // fraction slash
            case 8364: return "&euro;"; // Euro currency sign
            case 8465: return "&image;"; // black-letter capital i
            case 8472: return "&weierp;"; // script capital p
            case 8476: return "&real;"; // black-letter capital r
            case 8482: return "&trade;"; // trademark sign
            case 8501: return "&alefsym;"; // alef symbol
            case 8592: return "&larr;"; // leftwards arrow
            case 8593: return "&uarr;"; // upwards arrow
            case 8594: return "&rarr;"; // rightwards arrow
            case 8595: return "&darr;"; // downwards arrow
            case 8596: return "&harr;"; // left right arrow
            case 8629: return "&crarr;"; // downwards arrow with corner leftwards
            case 8656: return "&lArr;"; // leftwards double arrow
            case 8657: return "&uArr;"; // upwards double arrow
            case 8658: return "&rArr;"; // rightwards double arrow
            case 8659: return "&dArr;"; // downwards double arrow
            case 8660: return "&hArr;"; // left right double arrow
            case 8704: return "&forall;"; // for all
            case 8706: return "&part;"; // partial differential
            case 8707: return "&exist;"; // there exists
            case 8709: return "&empty;"; // empty set
            case 8711: return "&nabla;"; // nabla
            case 8712: return "&isin;"; // element of
            case 8713: return "&notin;"; // not an element of
            case 8715: return "&ni;"; // like backwards epsilon
            case 8719: return "&prod;"; // n-ary product
            case 8721: return "&sum;"; // n-ary summation
            case 8722: return "&minus;"; // minus sign
            case 8727: return "&lowast;"; // asterisk operator
            case 8730: return "&radic;"; // square root
            case 8733: return "&prop;"; // proportional to
            case 8734: return "&infin;"; // infinity
            case 8736: return "&ang;"; // angle
            case 8743: return "&and;"; // logical and
            case 8744: return "&or;"; // vee
            case 8745: return "&cap;"; // intersection
            case 8746: return "&cup;"; // union
            case 8747: return "&int;"; // integral
            case 8756: return "&there4;"; // therefore three dots
            case 8764: return "&sim;"; // tilde operator
            case 8773: return "&cong;"; // congruent to
            case 8776: return "&asymp;"; // asymptotic to
            case 8800: return "&ne;"; // not equal to
            case 8801: return "&equiv;"; // identical to
            case 8804: return "&le;"; // less-than or equal to
            case 8805: return "&ge;"; // greater-than or equal to
            case 8834: return "&sub;"; // subset of
            case 8835: return "&sup;"; // superset of
            case 8836: return "&nsub;"; // not a subset of
            case 8838: return "&sube;"; // subset of or equal to
            case 8839: return "&supe;"; // superset of or equal to
            case 8853: return "&oplus;"; // circled plus
            case 8855: return "&otimes;"; // circled times
            case 8869: return "&perp;"; // up tack
            case 8901: return "&sdot;"; // dot operator
            case 8968: return "&lceil;"; // left ceiling
            case 8969: return "&rceil;"; // right ceiling
            case 8970: return "&lfloor;"; // left floor
            case 8971: return "&rfloor;"; // right floor
            // NOTE(review): 9001/9002 are the HTML-4 code points for lang/rang;
            // HTML5 maps those names to different code points -- confirm this is still wanted.
            case 9001: return "&lang;"; // left-pointing angle bracket
            case 9002: return "&rang;"; // right-pointing angle bracket
            case 9674: return "&loz;"; // open lozenge
            case 9824: return "&spades;"; // black spade suit
            case 9827: return "&clubs;"; // black club suit
            case 9829: return "&hearts;"; // black heart suit
            case 9830: return "&diams;"; // black diamond suit
            default:
                // Below 127 the char stands for itself (null = leave alone, which is
                // cheaper than building a one-char String). Everything else without
                // a name gets a numeric reference.
                return c < 127 ? null : numericEntity( c, preferHexEntities );
        }
    }

    /**
     * Convert a single char to its equivalent XML entity. Ordinary chars are not changed.
     * Only {@code quot}, {@code amp}, {@code lt} and {@code gt} are used by name; other
     * chars from 127 up get the decimal numeric form.
     *
     * @param c Char to convert
     *
     * @return equivalent string e.g. {@code &amp;}, null means leave char as is.
     * Does not return unmodified letters.
     */
    private static String charToXMLEntity( char c )
    {
        switch ( c )
        {
            case 34: return "&quot;"; // quotation mark
            case 38: return "&amp;"; // ampersand
            // 39: deliberately not converted to apos, to stay more compatible with HTML.
            case 60: return "&lt;"; // less-than sign
            case 62: return "&gt;"; // greater-than sign
            case 160: return "&#160;"; // non-breaking space; XML has no predefined nbsp entity
            default:
                return c < 127 ? null : numericEntity( c, false );
        }
    }

    /**
     * Converts text to HTML or XML by quoting dangerous characters. Text must not already
     * contain entities, since their ampersands would be quoted again.
     * e.g. {@code "} becomes {@code &quot;} and {@code <} becomes {@code &lt;}.
     * Ordinary text passes unchanged. Does not convert an ordinary space to {@code &nbsp;}.
     * <p>
     * A valid surrogate pair is emitted as one numeric reference for the supplementary
     * code point, rather than as two references to the individual surrogates.
     *
     * @param text     raw text to be processed. May be null.
     * @param xmlStyle true to insert only the basic four XML entities (plus decimal numeric
     *                 references), otherwise use the full HTML-4 set.
     *
     * @return translated text, the original String if nothing needed quoting, or null if input is null.
     * @noinspection WeakerAccess
     */
    private static String entifyHTMLorXML( final String text, final boolean xmlStyle )
    {
        if ( text == null )
        {
            return null;
        }
        final int length = text.length();
        // Allocated only when the first entity is found, so clean text costs nothing.
        StringBuilder sb = null;
        // Start of the pending run of chars that need no translation; runs are copied
        // in bulk, which is faster than appending one char at a time.
        int runStart = 0;
        int i = 0;
        while ( i < length )
        {
            final char c = text.charAt( i );
            final String entity;
            final int consumed;
            if ( Character.isHighSurrogate( c )
                 && i + 1 < length
                 && Character.isLowSurrogate( text.charAt( i + 1 ) ) )
            {
                final int codePoint = Character.toCodePoint( c, text.charAt( i + 1 ) );
                entity = numericEntity( codePoint, !xmlStyle && preferHexEntities );
                consumed = 2;
            }
            else
            {
                entity = xmlStyle ? charToXMLEntity( c ) : charToHTMLEntity( c );
                consumed = 1;
            }
            if ( entity != null )
            {
                if ( sb == null )
                {
                    // Estimate roughly 10% growth; computed in long so huge inputs cannot overflow int.
                    final long estimate = length + (long) length / 10 + 16;
                    sb = new StringBuilder( (int) Math.min( estimate, Integer.MAX_VALUE - 8 ) );
                }
                sb.append( text, runStart, i ).append( entity );
                runStart = i + consumed;
            }
            i += consumed;
        }
        if ( sb == null )
        {
            // Nothing needed quoting. Save RAM by returning the original.
            return text;
        }
        // Append chars to the right of the last entity.
        return sb.append( text, runStart, length ).toString();
    }

    /**
     * Converts text to HTML by quoting dangerous characters. Text must not already contain entities.
     * e.g. {@code "} becomes {@code &quot;} and {@code <} becomes {@code &lt;}.
     * Ordinary text passes unchanged. Does not convert an ordinary space to {@code &nbsp;}.
     *
     * @param text raw text to be processed. May be null.
     *
     * @return translated HTML text, or null if input is null.
     * @noinspection WeakerAccess
     */
    public static String entifyHTML( String text )
    {
        return entifyHTMLorXML( text, false );
    }

    /**
     * Converts text to XML by quoting dangerous characters. Text must not already contain entities.
     * e.g. {@code "} becomes {@code &quot;} and {@code <} becomes {@code &lt;}.
     * Ordinary text passes unchanged. Does not convert an ordinary space to a non-breaking one.
     *
     * @param text raw text to be processed. May be null.
     *
     * @return translated XML text, or null if input is null.
     * @noinspection WeakerAccess
     */
    public static String entifyXML( String text )
    {
        return entifyHTMLorXML( text, true );
    }

    /**
     * Can this character be included in an HTML document without entifying it?
     *
     * @param c character in question
     *
     * @return true if char is printable ASCII (space through tilde) other than
     * {@code " & < >}; false if char needs an entity.
     */
    public static boolean isSimple( char c )
    {
        return ' ' <= c && c <= '~' && !( c == '\"' || c == '&' || c == '<' || c == '>' );
    }

    /**
     * From now on generate decimal numeric entities in preference to hex ones.
     * Affects all subsequent HTML conversions by this class.
     */
    public static void preferDecimalEntities()
    {
        preferHexEntities = false;
    }

    /**
     * From now on generate hex numeric entities in preference to decimal ones (the default).
     * Affects all subsequent HTML conversions by this class.
     */
    public static void preferHexEntities()
    {
        preferHexEntities = true;
    }

    /**
     * Converts a single character to HTML by quoting it if it is dangerous.
     * Does not convert an ordinary space to {@code &nbsp;}.
     *
     * @param c raw character.
     *
     * @return translated HTML text, e.g. {@code &} gives {@code &amp;}, x gives x.
     * @noinspection WeakerAccess
     */
    public static String toHTMLEntity( char c )
    {
        final String result = charToHTMLEntity( c );
        return result == null ? String.valueOf( c ) : result;
    }

    /**
     * Converts a character to its hexadecimal numeric reference, whether or not it needs quoting.
     *
     * @param c raw character.
     *
     * @return the complete reference including the terminating semicolon,
     * e.g. space gives {@code &#x20;}.
     * @noinspection WeakerAccess
     */
    public static String toHexEntity( char c )
    {
        return numericEntity( c, true );
    }

    /**
     * Converts a single character to XML by quoting it if it is dangerous.
     * Does not convert an ordinary space to a non-breaking one.
     *
     * @param c raw character.
     *
     * @return translated XML text, e.g. {@code <} gives {@code &lt;}, x gives x.
     * @noinspection WeakerAccess
     */
    public static String toXMLEntity( char c )
    {
        final String result = charToXMLEntity( c );
        return result == null ? String.valueOf( c ) : result;
    }
}
|