/*
* [HTMLCharCategory.java]
*
* Summary: top level enum to define the categories of character for the HTML tokenizer.
*
* Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 3.1 2009-04-12 shorter style names, improved highlighting.
*/
package com.mindprod.jprep;
/**
 * Top level enum to define the categories of character seen when tokenizing HTML.
 * <p>
 * Each constant names one lexical class of character; {@link #categorise(char)} maps
 * any single {@code char} onto exactly one of them.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.1 2009-04-12 shorter style names, improved highlighting.
 * @since 2004-05-15
 */
@SuppressWarnings( { "EnumeratedConstantNamingConvention" } )
public enum HTMLCharCategory
{
    /**
     * {@code &} because it starts an entity
     */
    AMP,
    /**
     * {@code -} special because it marks end of a comment {@code -->}
     */
    DASH,
    /**
     * {@code >} marks end of a tag
     */
    END_TAG,
    /**
     * End of line character, i.e. {@code \n}. A {@code \r} is categorised as {@link #IGNORE}.
     */
    EOL,
    /**
     * {@code =} separates attribute and value
     */
    EQUALS,
    /**
     * Characters to ignore: control chars 0..31 (other than {@code \n} and tab),
     * DEL (127), the byte order mark U+FEFF and the replacement character U+FFFD.
     */
    IGNORE,
    /**
     * Everything not covered by another category: chars above 127 other than
     * non-breaking space (U+00A0), U+FEFF and U+FFFD.
     */
    OTHER,
    /**
     * chars used in ordinary text and tag names: letters, digits and the
     * punctuation that has no special meaning to the tokenizer.
     */
    PLAIN,
    /**
     * {@code "} double quote
     */
    QUOTE,
    /**
     * {@code ]} right square bracket. Note the left bracket {@code [} is just {@link #PLAIN}.
     */
    RIGHT_BRACKET,
    /**
     * {@code ;} semicolon because it ends an entity
     */
    SEMICOLON,
    /**
     * space, tab or non-breaking space (U+00A0)
     */
    SPACE,
    /**
     * {@code <} marks start of a tag
     */
    START_TAG;

    /**
     * Categorise one character.
     *
     * @param theChar character to categorise
     *
     * @return category code, e.g. PLAIN QUOTE. Never null.
     */
    static HTMLCharCategory categorise( char theChar )
    {
        // fast path: ASCII letters and digits are by far the most common.
        if ( ( 'a' <= theChar && theChar <= 'z' )
             || ( 'A' <= theChar && theChar <= 'Z' )
             || ( '0' <= theChar && theChar <= '9' ) )
        {
            return PLAIN;
        }
        switch ( theChar )
        {
            case '!':
            case '#':
            case '$':
            case '%':
            case '(':
            case ')':
            case '*':
            case '+':
            case ',':
            case '.':
            case '/':
            case ':':// colon has no special significance
            case '?':
            case '@':
            case '[':
            case '\'':
            case '\\':// backslash has no special significance
            case '^':
            case '_':
            case '`':// grave
            case '{':
            case '|':
            case '}':
            case '~':// tilde
                return PLAIN;
            case '&':
                return AMP;
            case '-':
                return DASH;
            case '\n':
                return EOL;
            case '>':
                return END_TAG;
            case '=':
                return EQUALS;
            case '\r':
            case 127: /* DEL */
            case 0xfeff: /* bom */
            case 0xfffd: /* replacement character, e.g. a replaced bom */
                return IGNORE;
            case '\"':
                return QUOTE;
            case ']':
                return RIGHT_BRACKET;
            case ';':
                return SEMICOLON;
            case ' ':
            case '\t':
            case 0xa0:// non-breaking space
                return SPACE;
            case '<':
                return START_TAG;
            default:
                // char is unsigned, so only the upper bound needs testing.
                // Remaining control chars are ignored; everything else is OTHER.
                return theChar <= 31 ? IGNORE : OTHER;
        } // end switch
    } // end categorise
} // end HTMLCharCategory