/* * [HTMLCharCategory.java] * * Summary: top level enum to define the categories of character for the BatTokenizer. * * Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 3.1 2009-04-12 shorter style names, improved highlighting. */ package com.mindprod.jprep; /** * top level enum to define the categories of character for the BatTokenizer. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2009-04-12 shorter style names, improved highlighting. * @since 2004-05-15 */ @SuppressWarnings( { "EnumeratedConstantNamingConvention" } ) public enum HTMLCharCategory { /** * & because it starts an entity */ AMP, /** * - special because it marks end of a comment --> */ DASH, /** * > - marks end of a tag */ END_TAG, /** * End of line character */ EOL, /** * = separates attribute and value */ EQUALS, /** * ignore control chars */ IGNORE, /** * high ascii ` and chars not used in legit HTML */ OTHER, /** * chars used in ordinary text and tag names */ PLAIN, /** * " */ QUOTE, /** * chars used in ordinary text and tag names */ RIGHT_BRACKET, /** * ; semicolon because it ends an entity */ SEMICOLON, /** * space */ SPACE, /** * < */ START_TAG; /** * Categorise one character * * @param theChar character to categorise * * @return category code, e.g. PLAIN QUOTE */ static HTMLCharCategory categorise( char theChar ) { if ( 'a' <= theChar && theChar <= 'z' ) { return PLAIN; } if ( 'A' <= theChar && theChar <= 'Z' ) { return PLAIN; } if ( '0' <= theChar && theChar <= '9' ) { return PLAIN; } switch ( theChar ) { case '!': case '#': case '$': case '%': case '(': case ')': case '*': case '+': case ',': case '.': case '/': case ':':// colon has no special significance case '?': case '@': case '[': case '\'': case '\\':// backslash has so special significance case '^': case '_': case '`':// grave case '{': case '|': case '}': case '~':// tilde return PLAIN; case '&': return AMP; case '-': return DASH; case '\n': return EOL; case '>': return END_TAG; case '=': return EQUALS; case '\r': case 127: case 0xfeff: /* bom */ case 0xfffd: /* replaced bom */ return IGNORE; case '\"': return QUOTE; case ']': return RIGHT_BRACKET; case ';': return SEMICOLON; case ' ': case '\t': case 0xa0://   return SPACE; case '<': return START_TAG; default: if ( 0 <= theChar && theChar <= 31 ) { return IGNORE; } else { return OTHER; } } // end switch } // end categorise } // end HTMLtCharCategory