package com.mindprod.jprep;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import com.mindprod.jdisplay.CharLiteral;
import com.mindprod.jdisplay.CommentJavaDoc;
import com.mindprod.jdisplay.CommentSlashSlash;
import com.mindprod.jdisplay.CommentSlashStar;
import com.mindprod.jdisplay.Fence;
import com.mindprod.jdisplay.NL;
import com.mindprod.jdisplay.NumericLiteral;
import com.mindprod.jdisplay.Operator;
import com.mindprod.jdisplay.SQLKeyword;
import com.mindprod.jdisplay.SQLVar;
import com.mindprod.jdisplay.Space;
import com.mindprod.jdisplay.Start;
import com.mindprod.jdisplay.Stop;
import com.mindprod.jdisplay.Token;

/**
 * Breaks a SQL program or program fragment into rough tokens.
 * Token.java controls the colours and fonts.
 * <p>
 * The whole program is parsed by the constructor; the resulting tokens
 * are then fetched with {@code getTokens()}.
 *
 * @author Roedy Green
 * @version 1.1
 * @since 2004-06-13
 *        version 1.0 2004-06-13 based on JavaTokenizer.
 */
public class SQLTokenizer {

    /**
     * Constructor. Remembers the program text and parses it immediately.
     *
     * @param program String representing program, newlines represented by
     *                \n only
     */
    public SQLTokenizer( String program ) {
        this.program = program;
        parse();
    }

    /**
     * The program or program fragment we are parsing, exactly as handed
     * to the constructor.
     */
    private String program;

    /**
     * Current state of the finite state automaton; it changes as
     * characters are processed.
     */
    private State state;

    /**
     * How far we are through parsing the program: index of the current
     * character. Also moved forward by {@code skip(int)}.
     */
    private int charIndex;

    /**
     * How many chars there are in the entire program we are parsing;
     * set from {@code program.length()} at the start of {@code parse()}.
     */
    private int size;

    /**
     * true if the state dealt with the current char, e.g. added it to a
     * string buffer. false if it postponed processing of it to the next
     * state.
     */
    boolean handled;

    /**
     * State: in white space, including at end of line.
     */
    private final State IN_WHITESPACE = new InWhitespace();

    /**
     * State: in name, keyword or identifier.
     */
    private final State IN_NAME = new InName();

    /**
     * State: in operator, or string of operators.
     */
    private final State IN_OPERATOR = new InOperator();

    /**
     * State: in { } [] () or string of such.
     */
    private final State IN_FENCE = new InFence();

    /** * seen inside slash star ... 
star slash or slash star star t... star slash */ private final State IN_REMSLASHSTAR = new InRemSlashStar(); /** * seen inside // ... eol */ private final State IN_ONE_LINE_REM = new InOneLineRem(); /** * seen ' */ private final State IN_TICK = new InTick(); // possible categories of character /** * End of line character */ private static final int EOL = 0; /** * bracketting character e.g. ()[]{} */ private static final int FENCE = 1; /** * high ascii ` and chars not used in Java */ private static final int OTHER = 2; /** * chars used in indentifiers */ private static final int PLAIN = 3; /** * punctuation */ private static final int PUNCTUATION = 4; /** * / */ private static final int SLASH = 5; /** * space */ private static final int SPACE = 6; /** * star */ private static final int STAR = 7; /** * ' */ private static final int TICK = 8; /** * - */ private static final int DASH = 9; /** * ignore control chars */ private static final int IGNORE = 10; /** * list of tokens we have parsed out. */ ArrayList tokens = new ArrayList(); /** * buffer to accumulate name. */ StringBuilder name; /** * count of how many new lines encountered */ int nlCount; /** * count of how many spaces encountered. */ int spaceCount; /** * buffer to accumulate the string of operators. */ StringBuilder operators; /** * buffer to accumulate the string of bracketting fence characters. 
*/ StringBuilder fences; /** * buffer to accumulate the material inside the quotes */ StringBuilder quotation; /** * buffer to accumulate a // or slash star remart */ StringBuilder comment; /** * complete list of SQL keywors, aka reserved words */ static private HashSet keywords = new HashSet( Arrays.asList( new String[]{ "ABS", "ACOS", "ADD", "ADMIN", "AFTER", "ALIAS", "ALL", "ALTER", "ANALYZE", "AND", "ANY", "AS", "ASC", "ASCII", "ASENSITIVE", "ASIN", "ATAN", "ATAN2", "AUTO_INCREMENT", "AUTOCOMMIT", "AUTOINCREMENT", "AVA", "AVG", "BACKUP", "BDB", "BEFORE", "BERKELEYDB", "BETWEEN", "BIGINT", "BINARY", "BINLOG", "BIT", "BITAND", "BITOR", "BLOB", "BOTH", "BY", "CACHE", "CACHED", "CALL", "CASCADE", "CASE", "CASEWHEN", "CAST", "CEILING", "CHANGE", "CHAR", "CHARACTER", "CHECK", "CHECKPOINT", "CHECKSUM", "CLASS", "COLLATE", "COLLATION", "COLUMN", "COLUMNS", "COMMIT", "COMPACT", "CONCAT", "CONDITION", "CONNECT", "CONNECTION", "CONSTRAINT", "CONTINUE", "CONVERT", "COS", "COT", "COUNT", "COUNTER", "CREATE", "CROSS", "CURDATE", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURSOR", "CURTIME", "DATA", "DATABASE", "DATABASES", "DATE", "DATETIME", "DAY_HOUR", "DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DAYNAME", "DAYOFMONTH", "DAYOFWEEK", "DAYOFYEAR", "DEC", "DECIMAL", "DECLARE", "DEFAULT", "DEGREES", "DELAYED", "DELETE", "DESC", "DESCRIBE", "DETERMINISTIC", "DIFFERENCE", "DISALLOW", "DISCONNECT", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ENGINES", "ERRORS", "ESCAPE", "ESCAPED", "EVENTS", "EXIST", "EXISTS", "EXIT", "EXP", "EXPLAIN", "FALSE", "FETCH", "FIELDS", "FIRST", "FLOAT", "FLOOR", "FLUSH", "FOR", "FORCE", "FOREIGN", "FOUND", "FRAC_SECOND", "FROM", "FULLTEXT", "GENERAL", "GLOBAL", "GRANT", "GRANTS", "GREATEST", "GROUP", "GUEST", "GUID", "HAVING", "HIGH_PRIORITY", "HOSTS", "HOUR", "HOUR_MICROSECOND", "HOUR_MINUTE", "HOUR_SECOND", "HSQLDB", "IDENTITY", "IF", "IFNULL", "IGNORE", "IGNORECASE", "IMMEDIATELY", "IMP", 
"IN", "INDEX", "INFILE", "INNER", "INNODB", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT", "INTERVAL", "INTO", "IO_THREAD", "IS", "ITERATE", "JDBC", "JOIN", "JVM", "KEY", "KEYS", "KILL", "LAST_INSERT_ID", "LCASE", "LEADING", "LEAST", "LEAVE", "LEFT", "LENGTH", "LEVEL", "LIKE", "LIMIT", "LINES", "LOAD", "LOCALTIME", "LOCALTIMESTAMP", "LOCATE", "LOCK", "LOG", "LOG10", "LOGS", "LOGSIZE", "LONG", "LONGBLOB", "LONGTEXT", "LONGVARBINARY", "LONGVARCHAR", "LOOP", "LOW_PRIORITY", "LOWER", "LTRIM", "MASTER_SERVER_ID", "MATCH", "MAX", "MEDIUMBLOB", "MEDIUMINT", "MEDIUMTEXT", "MEMO", "MEMORY", "MIDDLEINT", "MIN", "MINUTE", "MINUTE_MICROSECOND", "MINUTE_SECOND", "MOD", "MONTH", "MONTHNAME", "NATURAL", "NO_WRITE_TO_BINLOG", "NOT", "NOTE", "NOW", "NOWAIT", "NULL", "NUMERIC", "OBJECT", "OLEOBJECT", "ON", "OPTIMIZE", "OPTION", "OPTIONALLY", "OR", "ORDER", "OTHER", "OUT", "OUTER", "OUTFILE", "OWNERACCESS", "PARAMETERS", "PASSWORD", "PI", "PIVOT", "POWER", "PRECISION", "PRIMARY", "PRIVILEGES", "PROCEDURE", "PROCESSLIST", "PUBLIC", "PURGE", "QUARTER", "QUEUE", "RADIANS", "RAND", "READ", "READONLY", "REAL", "REFERENCES", "REFERENTIAL_INTEGRITY", "REFINT", "REGEXP", "RENAME", "REPAIR", "REPEAT", "REPLACE", "REQUIRE", "RESET", "RESTORE", "RESTRICT", "RETURN", "REVOKE", "RIGHT", "RLIKE", "ROLLBACK", "ROUND", "ROUNDMAGIC", "ROW", "RTRIM", "SCRIPT", "SECOND", "SECOND_MICROSECOND", "SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SHUTDOWN", "SIGN", "SIN", "SLAVE", "SMALLINT", "SOME", "SONAME", "SOUNDEX", "SPACE", "SPATIAL", "SPECIFIC", "SQL", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", "SQL_SMALL_RESULT", "SQL_TSI_DAY", "SQL_TSI_FRAC_SECOND", "SQL_TSI_HOUR", "SQL_TSI_MINUTE", "SQL_TSI_MONTH", "SQL_TSI_QUARTER", "SQL_TSI_SECOND", "SQL_TSI_WEEK", "SQL_TSI_YEAR", "SQLEXCEPTION", "SQLSTATE", "SQLWARNING", "SQRT", "SSL", "START", "STARTING", "STATUS", "STDEV", "STDEVP", "STOP", "STRAIGHT_JOIN", "STRIPED", "SUBSTRING", "SUM", "TABLE", "TABLEID", "TABLES", "TAN", "TEMP", 
"TERMINATED", "THEN", "TIME", "TIMESTAMP", "TIMESTAMPADD", "TIMESTAMPDIFF", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", "TOP", "TRAILING", "TRANSFORM", "TRIGGER", "TRUE", "TRUNCATE", "TS", "UCASE", "UNDO", "UNION", "UNIQUE", "UNLOCK", "UNSIGNED", "UPDATE", "UPPER", "UPPERCASE", "URL", "USAGE", "USE", "USER", "USER_RESOURCES", "USING", "UTC_DATE", "UTC_TIME", "UTC_TIMESTAMP", "VALUE", "VALUES", "VAR", "VARBINARY", "VARCHAR", "VARCHAR_IGNORECASE", "VARCHARACTER", "VARIABLES", "VARP", "VARYING", "WARNINGS", "WEEK", "WHEN", "WHERE", "WHILE", "WITH", "WORK", "WRITE", "WRITE_DELAY", "XOR", "YEAR", "YEAR_MONTH", "ZEROFILL", })); /** * list of java keywords used to define primitive * variables. */ static private HashSet types = new HashSet( Arrays.asList( new String[]{ "BIGINT", "BINARY", "BIT", "BLOB", "BOOLEAN", "BYTE", "CHAR", "CHARACTER", "CLOB", "CURRENCY", "DATE", "DOUBLE", "FLOAT", "FLOAT4", "FLOAT8", "INT", "INTEGER", "INTEGER1", "INTEGER2", "INTEGER4", "LOGICAL", "LONG", "LONGBINARY", "LONGTEXT", "MONEY", "NUMBER", "NUMERIC", "PERCENT", "REAL", "SHORT", "SINGLE", "SMALLINT", "STRING", "TEXT", "TIME", "TIMESTAMP", "VARBINARY", "VARCHAR", "YESNO", })); /** * Get the results of the parsing, a array of Tokens * * @return array of Tokens */ public Token[] getTokens() { int size = tokens.size(); return(Token[])tokens.toArray( new Token[ size ] ); } /** * Parse progam and leave a list of Tokens in * tokens ArrayList. */ void parse () { size = program.length(); state = IN_WHITESPACE; handled = true; // charIndex is an instance variable. for ( charIndex=0; charIndex= size ) { return false; } return program.substring( charIndex, charIndex + s.length() ).equalsIgnoreCase( s ); } /** * skip ahead this many chars in the stream. * * @param jumpOver How many chars to skip ahead. */ void skip( int jumpOver ) { charIndex += jumpOver; } /** * Categorise one character * * @param theChar character to categorise * @return category code, e.g. 
PLAIN TICK */ private static int categorise ( char theChar ) { if ( 'a' <= theChar && theChar <= 'z' ) return PLAIN; if ( 'A' <= theChar && theChar <= 'Z' ) return PLAIN; if ( '0' <= theChar && theChar <= '9' ) return PLAIN; switch ( theChar ) { case ' ': case '\t': case 0xa0: //   return SPACE; case '\n': return EOL; case '\r': case 127: return IGNORE; case '-': return DASH; case '$': case '_': return PLAIN; case '*': return STAR; case '/': return SLASH; case '\'' : return TICK; case '(': case ')': case '[': case ']': case '{': case '}': return FENCE; case '!': case '#': // ok in comments case '%': case '&': case '+': case ',': case '.': case ':': case ';': case '<': case '=': case '>': case '?': case '@': // ok in comments case '\"': // quote case '\\': // backslash case '^': case '`': // ok in comments case '|': case '~': return PUNCTUATION; default: if ( 0 <= theChar && theChar <= 31 ) { return IGNORE; } else if ( 128 <= theChar ) { // treat high bit on as ordinary alpha return PLAIN; } else { return OTHER; } } // end switch } // end categorise /** * get rid of leading and trailing NLs */ private void trimNLs() { // remove leading NLs. while ( tokens.size() > 0 && ( tokens.get( 0 ) instanceof NL ) ) { tokens.remove( 0 ); } // remove trailing NLs int size; while ( (size = tokens.size()) > 0 && ( tokens.get( size-1 ) instanceof NL ) ) { tokens.remove( size-1 ); } // put one at the end tokens.add( new NL() ); tokens.add( 0, new Start( "
" ) );
      tokens.add( new Stop( "
" ) ); } // end trimNLs /** * Basic class upon the states of our finite state * automaton are based. Inner class to SQLTokenizer. * * @author Roedy Green * @version 1.0 * @since 2004-05-02 */ abstract class State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { /* default way to recognise next state */ handled = false; switch ( category ) { case OTHER: case PLAIN: return IN_NAME; case DASH: if ( coming ( "--" ) ) { return IN_ONE_LINE_REM; } else { return IN_OPERATOR; } case PUNCTUATION: return IN_OPERATOR; case FENCE: return IN_FENCE; case SLASH: if ( coming( "//" ) ) { return IN_ONE_LINE_REM; } else if ( coming ( "/*" ) ) { return IN_REMSLASHSTAR; } else { return IN_OPERATOR; } case STAR: return IN_OPERATOR; case TICK: return IN_TICK; case EOL: case SPACE: return IN_WHITESPACE; default: throw new IllegalStateException ( "bad state: " + state + " " + category + " " + nextChar ); } } // end next } // end State /** * in white space */ class InWhitespace extends State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { switch ( category ) { case DASH: case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case STAR: case TICK: default: // strip trailing spaces, // no more than 3 NLs in a row. // collapse multiple NLs into a single token. 
// collapse mulitple spaces into a single token if ( nlCount > 3 ) { nlCount = 3; } if ( nlCount > 0 ) tokens.add( new NL ( nlCount ) ); if ( spaceCount > 0 ) { tokens.add( new Space( spaceCount ) ); } nlCount = 0; spaceCount = 0; handled = false; return super.next( category, nextChar ); case SPACE: // stay in spaces spaceCount ++; handled = true; return IN_WHITESPACE; case EOL: // stay in space nlCount++; spaceCount = 0; handled = true; return IN_WHITESPACE; } // end switch } } // end InWhiteSpace /** * in name, keyword or identifier */ class InName extends State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { if ( SQLTokenizer.this.name == null ) { SQLTokenizer.this.name = new StringBuilder( 50 ); } switch ( category ) { case DASH: case EOL: case FENCE: case OTHER: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: default: String name = SQLTokenizer.this.name.toString(); SQLTokenizer.this.name = null; if ( name.length() > 0 ) { // make a first stab at what sort of identifier it is. 
if ( keywords.contains( name.toUpperCase() ) ) { // keyword tokens.add( new SQLKeyword( name ) ); } else if ( types.contains( name.toUpperCase() ) ) { // type tokens.add( new SQLKeyword( name ) ); } else if ( Character.isDigit( name.charAt( 0 ) ) ) { // number tokens.add( new NumericLiteral( name ) ); } else { // could be var, function tokens.add( new SQLVar( name ) ); } } handled = false; return super.next( category, nextChar ); case PLAIN: // stay SQLTokenizer.this.name.append( nextChar ); handled = true; return IN_NAME; } // end switch } // end next } // end InName /** * in operator, or string of operators */ class InOperator extends State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { if ( SQLTokenizer.this.operators == null ) { SQLTokenizer.this.operators = new StringBuilder( 10 ); } switch ( category ) { case EOL: case FENCE: case OTHER: case PLAIN: case SPACE: case TICK: default: String operators = SQLTokenizer.this.operators.toString(); SQLTokenizer.this.operators = null; if ( operators.length() != 0 ) { // treat ++ as two distinct operators. // operators if ( operators.length() > 1 && ( operators.indexOf( ';' ) >= 0 || operators.indexOf( '.' ) >=0 || operators.indexOf( ',' ) >=0 ) ) { // split up into separate tokens for detailed analysis if contain ; . 
or , // this will also split ++ into + + for ( int i=0; i= 0 ) { addComment( comments.substring( 0, place ), javaDoc ); tokens.add( new NL( ) ); comments = comments.substring( place + 1 ); } if ( comments.length() > 0 ) { addComment( comments, javaDoc ); } /* we are out of the comment, treat as if had been reading white space */ handled = true; return IN_WHITESPACE; } else { // just an ordinary *, not comment terminator SQLTokenizer.this.comment.append( '*' ); handled = true; return IN_REMSLASHSTAR; } } // end switch } } // end InRemSlashStar /** * add an appropriate Comment token * * @param comment comment to make into a token * @param javaDoc true if make JavaDoc comment * false for star comment */ private void addComment ( String comment, boolean javaDoc ) { if ( comment.length() == 0 ) { return; } if ( javaDoc ) { tokens.add( new CommentJavaDoc( comment ) ); } else { tokens.add( new CommentSlashStar( comment ) ); } } /** * seen slash slash, or -- in one line comment. */ class InOneLineRem extends State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { if ( SQLTokenizer.this.comment == null ) { SQLTokenizer.this.comment = new StringBuilder( 80 ); if ( nextChar == '/' ) { SQLTokenizer.this.comment.append( "//" ); } else if ( nextChar == '-' ) { SQLTokenizer.this.comment.append( "--" ); } skip(1); handled = true; return IN_ONE_LINE_REM; } switch ( category ) { case DASH: case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: default: SQLTokenizer.this.comment.append( nextChar ); handled = true; return IN_ONE_LINE_REM; case EOL: String comment = SQLTokenizer.this.comment.toString(); SQLTokenizer.this.comment = null; tokens.add ( new CommentSlashSlash( comment ) ); handled = false; return IN_WHITESPACE; } // end switch } } // end 
InRemSlashSlash /** * seen ' */ class InTick extends State { /** * determines the next state based on current state, and next char * * @param category category of next character * @param nextChar next character to process * @return next State */ public State next ( int category, char nextChar ) { if ( SQLTokenizer.this.quotation == null ) { SQLTokenizer.this.quotation = new StringBuilder( 8 ); // don't insert ' in buffer handled = true; return IN_TICK; } String quotation; switch ( category ) { case DASH: case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: default: SQLTokenizer.this.quotation.append( nextChar ); handled = true; return IN_TICK; case EOL: // treat eol like missing ' // done. quotation = SQLTokenizer.this.quotation.toString(); SQLTokenizer.this.quotation = null; tokens.add ( new CharLiteral( quotation ) ); handled = false; return IN_WHITESPACE; case TICK: // done. // don't append trailing ' to buffer. quotation = SQLTokenizer.this.quotation.toString(); SQLTokenizer.this.quotation = null; tokens.add ( new CharLiteral( quotation ) ); handled = true; return IN_WHITESPACE; } // end switch } } // end InTick /*** * calculate depth of {} () and [] */ private void calcNestingDepths() { int braceDepth = 0; int parenDepth = 0; int bracketDepth = 0; int size = tokens.size(); for ( int i=0; i