/* * [SQLState.java] * * Summary: parser for SQL for colourising. * * Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 3.1 2009-04-12 shorter style names, improved highlighting. */ /** * State machine for SQLTokenizer, borrows heavily from JavaTokenizer tokens. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2009-04-12 shorter style names, improved highlighting. * @since 2004-05-15 */ package com.mindprod.jprep; import com.mindprod.jtokens.CharLiteral; import com.mindprod.jtokens.Gibberish; import com.mindprod.jtokens.Keyword; import com.mindprod.jtokens.NL; import com.mindprod.jtokens.NumericLiteral; import com.mindprod.jtokens.Operator; import com.mindprod.jtokens.Semicolon; import com.mindprod.jtokens.Space; import com.mindprod.jtokens.Start; import com.mindprod.jtokens.Stop; import com.mindprod.jtokens.Token; import com.mindprod.jtokens.java.CommentJavadoc; import com.mindprod.jtokens.java.CommentSlashSlash; import com.mindprod.jtokens.java.CommentSlashStar; import com.mindprod.jtokens.java.Fence; import com.mindprod.jtokens.sql.SQLVar; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import static java.lang.System.*; /** * parser for SQL for colourising. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2009-04-12 shorter style names, improved highlighting. * @since 2004 */ @SuppressWarnings( { "NestedAssignment", "ValueOfIncrementOrDecrementUsed", "UnnecessaryContinue", "EnumeratedConstantNamingConvention" } ) public enum SQLState { /** * Someone has fowarded us an EOL. We deal with it and any subsequent EOLs. When we hit something interesting we * let the default next deal with it. */ @SuppressWarnings( { "WeakerAccess" } ) AT_END_OF_LINE { void consume( char c ) { nlCount++; // end consume AT_END_OF_LINE } void leaving() { // no more than 3 NLs in a row. if ( nlCount > 3 ) { nlCount = 3; } // collapse multiple NLs into a single token. if ( nlCount > 0 ) { // ignore trailing spaces spaceCount = 0; addToken( new NL( nlCount ) ); nlCount = 0; } // end leaving AT_END_OF_LINE } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: how = HowToProcess.FORWARD; return super.next( category, nextChar, first ); case EOL: how = HowToProcess.CONSUME; return AT_END_OF_LINE; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next AT_END_OF_LINE } // end of enum constant AT_END_OF_LINE }, /** * in the middle of processing a string of {}()[] chars each one gets it own token. */ @SuppressWarnings( { "WeakerAccess" } ) IN_FENCE { void consume( char c ) { accumulated.append( c ); // end consume IN_FENCE } void leaving() { String fences = accumulated.toString(); accumulated.setLength( 0 ); // split (( up into separate tokens so can be rendered different // sizes. for ( int i = 0; i < fences.length(); i++ ) { char fence = fences.charAt( i ); // depth not necessarily correct yet addToken( new Fence( fence, 0/* depth */ ) ); } // end leaving IN_FENCE } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case EOL: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: how = HowToProcess.FORWARD; return super.next( category, nextChar, first ); case FENCE: // stay how = HowToProcess.CONSUME; return IN_FENCE; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_FENCE } // end of enum constant IN_FENCE }, /** * in name, keyword or identifier. We are overly strict on what constitutes an identifier. */ @SuppressWarnings( { "WeakerAccess" } ) IN_NAME { void consume( char c ) { accumulated.append( c ); // end consume IN_NAME } void leaving() { String name = accumulated.toString(); accumulated.setLength( 0 ); if ( name.length() > 0 ) { // make a first stab at what sort of identifier it is. if ( SQL_KEYWORDS.contains( name.toUpperCase().trim() ) ) { // keyword tokens.add( new Keyword( name ) ); } else if ( SQL_DATA_TYPES.contains( name.toUpperCase().trim() ) ) { // type tokens.add( new Keyword( name ) ); } else if ( Character.isDigit( name.charAt( 0 ) ) ) { // number tokens.add( new NumericLiteral( name ) ); } else { // could be var, function tokens.add( new SQLVar( name ) ); } } // end leaving IN_NAME } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case EOL: case FENCE: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: how = HowToProcess.FORWARD; return super.next( category, nextChar, first ); case OTHER:/* allow unicode names */ case PLAIN: // stay how = HowToProcess.CONSUME; return IN_NAME; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_NAME } // end of enum constant IN_NAME }, /** * in string or arithmetic operators, including ;, but not fences {}() [] */ @SuppressWarnings( { "WeakerAccess" } ) IN_OPERATOR { void consume( char c ) { accumulated.append( c ); // end consume IN_TAG } void leaving() { String operators = accumulated.toString(); accumulated.setLength( 0 ); // treat ; specially. int place; while ( ( place = operators.indexOf( ';' ) ) >= 0 ) { // possibly empty String group = operators.substring( 0, place ); addToken( new Operator( group ) ); addToken( new Semicolon() ); operators = operators.substring( place + 1 ); } // end while // deal with whatever is left over in operators addToken( new Operator( operators ) ); // end leaving IN_TAG } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case EOL: case FENCE: case OTHER: case PLAIN: case SPACE: case TICK: how = HowToProcess.FORWARD; return super.next( category, nextChar, first ); case PUNCTUATION:// includes semicolon case SLASH: case STAR: // stay how = HowToProcess.CONSUME; return IN_OPERATOR; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_TAG } // end of enum constant IN_TAG }, /** * seen slash slash, in one line comment... eol Will be terminated by eol slash slash goes in the token */ @SuppressWarnings( { "WeakerAccess" } ) IN_REMSLASHSLASH { void consume( char c ) { accumulated.append( c ); // end consume IN_REM_SLASH_SLASH } void leaving() { // simpler than other comments. Can't contain embedded \nl String comment = accumulated.toString(); accumulated.setLength( 0 ); if ( comment.length() > 0 ) { addToken( new CommentSlashSlash( comment ) ); } // end leaving IN_REM_SLASH_SLASH } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: case TICK: how = HowToProcess.CONSUME; return IN_REMSLASHSLASH; case EOL: how = HowToProcess.FORWARD; return AT_END_OF_LINE; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_REM_SLASH_SLASH } // end of enum constant IN_REM_SLASH_SLASH }, /** * in slash star or slash star star. We don't leave this state on hitting EOL but just keep on trucking and deal * with the eol tokens later. Finally ended by star slash. We don't deal with running of the end of the program * unbalanced. That will show ups as an error with stuff left un tokenized. */ @SuppressWarnings( { "WeakerAccess" } ) IN_REMSLASHSTAR { /** * keep track of whether / was preceded by star to mark end of * comment. */ private boolean prevWasStar = false; void consume( char c ) { accumulated.append( c ); // end consume IN_REM_SLASH_STAR } void leaving() { // break into several tokens if contains \n String comments = accumulated.toString(); accumulated.setLength( 0 ); boolean javaDoc = comments.startsWith( "/**" ); int place; Token token; while ( ( place = comments.indexOf( '\n' ) ) >= 0 ) { String comment = comments.substring( 0, place ); if ( javaDoc ) { token = new CommentJavadoc( comment ); } else { token = new CommentSlashStar( comment ); } addToken( token ); addToken( new NL() ); comments = comments.substring( place + 1 ); } // end while // deal with whatever is left over in comments if ( comments.length() > 0 ) { if ( javaDoc ) { token = new CommentJavadoc( comments ); } else { token = new CommentSlashStar( comments ); } addToken( token ); } // end leaving IN_REM_SLASH_STAR } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case EOL: case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SPACE: case TICK: prevWasStar = false; how = HowToProcess.CONSUME; return IN_REMSLASHSTAR; case STAR: prevWasStar = true; how = HowToProcess.CONSUME; return IN_REMSLASHSTAR; case SLASH: if ( first ) { // leading slash at begin of slash star or slash star star prevWasStar = false; how = HowToProcess.CONSUME; return IN_REMSLASHSTAR; } else if ( prevWasStar ) { // hit star slash end marker of comment how = HowToProcess.CONSUME; return IN_WHITESPACE;// super would just send us back // here } else { // just incidental / how = HowToProcess.CONSUME; return IN_REMSLASHSTAR; } // break; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_REM_SLASH_STAR } // end of enum constant IN_REM_SLASH_STAR }, /** * SQL uses quite different string literal conventions from Java. Strings are surrounded in (') not ("). * Embedded (') are written ('') [two single quotes in a row] not ") not ( \') and embedded (") are left plain * as ("). We have just seen the starting or ending tick or we are inside ticks. */ @SuppressWarnings( { "WeakerAccess" } ) IN_TICKS { /** * true if 'x' are balanced, false if hit EOL too soon. */ private boolean balanced = false; void consume( char c ) { accumulated.append( c ); // end consume } void leaving() { String quotation = accumulated.toString(); accumulated.setLength( 0 ); if ( balanced ) { // add even if 0 length // surrounding ticks generated as needed. not part of quotation addToken( new CharLiteral( quotation ) ); } else { // document had unbalanced " ...", missing trailing ". // We treat not as literal, but as an error. addToken( new Gibberish( "\'" + quotation ) ); // note that we have handled this anomaly balanced = true; } // end leaving IN_TICKS } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case SPACE: case STAR: how = HowToProcess.CONSUME; return IN_TICKS; case TICK: if ( first ) { balanced = false; how = HowToProcess.DISCARD; return IN_TICKS; } else { // was the final one balanced = true; how = HowToProcess.DISCARD; return IN_WHITESPACE;// super would just send us back // here } // break; case EOL: // we had an unbalanced '\ then eol // balanced will be false how = HowToProcess.FORWARD; return AT_END_OF_LINE; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_TICKS } // end of enum constant IN_TICKS }, /** * in white space, but not EOL. */ @SuppressWarnings( { "WeakerAccess" } ) IN_WHITESPACE { void consume( char c ) { spaceCount++; // end consume IN_WHITESPACE } void leaving() { // collapse multiple spaces into a single token if ( spaceCount > 0 ) { addToken( new Space( spaceCount ) ); spaceCount = 0; } // end leaving IN_WHITESPACE } SQLState next( SQLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case EOL: case FENCE: case OTHER: case PLAIN: case PUNCTUATION: case SLASH: case STAR: case TICK: how = HowToProcess.FORWARD; return super.next( category, nextChar, first ); case SPACE: how = HowToProcess.CONSUME; return IN_WHITESPACE; default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch // end next IN_WHITESPACE } // end of enum constant IN_WHITESPACE }; // ////////////////////////////////////////////////////////////////// // common to all enum constants /** * true if want extra debugging checks and output */ private static final boolean DEBUGGING = false; /** * list of tokens we have parsed out. */ private static final ArrayList tokens = new ArrayList<>( 50000 ); /** * list of java SQL_KEYWORDS used to define primitive variables. */ private static final HashSet SQL_DATA_TYPES = new HashSet<>( Arrays .asList( "BIGINT", "BINARY", "BIT", "BLOB", "BOOLEAN", "BYTE", "CHAR", "CHARACTER", "CLOB", "CURRENCY", "DATE", "DOUBLE", "FLOAT", "FLOAT4", "FLOAT8", "INT", "INTEGER", "INTEGER1", "INTEGER2", "INTEGER4", "LOGICAL", "LONG", "LONGBINARY", "LONGTEXT", "MONEY", "NUMBER", "NUMERIC", "PERCENT", "REAL", "SHORT", "SINGLE", "SMALLINT", "STRING", "TEXT", "TIME", "TIMESTAMP", "VARBINARY", "VARCHAR", "YESNO" ) ); // statics are shared common to all enum constants. // others are a separate field in each enum constant. /** * complete list of SQL keywords, aka reserved words */ private static final HashSet SQL_KEYWORDS = new HashSet<>( Arrays .asList( "ABS", "ACOS", "ADD", "ADMIN", "AFTER", "ALIAS", "ALL", "ALTER", "ANALYZE", "AND", "ANY", "AS", "ASC", "ASCII", "ASENSITIVE", "ASIN", "ATAN", "ATAN2", "AUTO_INCREMENT", "AUTOCOMMIT", "AUTOINCREMENT", "AVA", "AVG", "BACKUP", "BDB", "BEFORE", "BERKELEYDB", "BETWEEN", "BIGINT", "BINARY", "BINLOG", "BIT", "BITAND", "BITOR", "BLOB", "BOTH", "BY", "CACHE", "CACHED", "CALL", "CASCADE", "CASE", "CASEWHEN", "CAST", "CEILING", "CHANGE", "CHAR", "CHARACTER", "CHECK", "CHECKPOINT", "CHECKSUM", "CLASS", "COLLATE", "COLLATION", "COLUMN", "COLUMNS", "COMMIT", "COMPACT", "CONCAT", "CONDITION", "CONNECT", "CONNECTION", "CONSTRAINT", "CONTINUE", "CONVERT", "COS", "COT", "COUNT", "COUNTER", "CREATE", "CROSS", "CURDATE", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURSOR", "CURTIME", "DATA", "DATABASE", "DATABASES", "DATE", "DATETIME", "DAY_HOUR", "DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DAYNAME", "DAYOFMONTH", "DAYOFWEEK", "DAYOFYEAR", "DEC", "DECIMAL", "DECLARE", "DEFAULT", "DEGREES", "DELAYED", "DELETE", "DESC", "DESCRIBE", "DETERMINISTIC", "DIFFERENCE", "DISALLOW", "DISCONNECT", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ENGINES", "ERRORS", "ESCAPE", "ESCAPED", "EVENTS", "EXIST", "EXISTS", "EXIT", "EXP", "EXPLAIN", "FALSE", "FETCH", "FIELDS", "FIRST", "FLOAT", "FLOOR", "FLUSH", "FOR", "FORCE", "FOREIGN", "FOUND", "FRAC_SECOND", "FROM", "FULLTEXT", "GENERAL", "GLOBAL", "GRANT", "GRANTS", "GREATEST", "GROUP", "GUEST", "GUID", "HAVING", "HIGH_PRIORITY", "HOSTS", "HOUR", "HOUR_MICROSECOND", "HOUR_MINUTE", "HOUR_SECOND", "HSQLDB", "IDENTITY", "IF", "IFNULL", "IGNORE", "IGNORECASE", "IMMEDIATELY", "IMP", "IN", "INDEX", "INFILE", "INNER", "INNODB", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT", "INTERVAL", "INTO", "IO_THREAD", "IS", "ITERATE", "JDBC", "JOIN", "JVM", "KEY", "KEYS", "KILL", "LAST_INSERT_ID", "LCASE", "LEADING", "LEAST", "LEAVE", "LEFT", "LENGTH", "LEVEL", "LIKE", "LIMIT", "LINES", "LOAD", "LOCALTIME", "LOCALTIMESTAMP", "LOCATE", "LOCK", "LOG", "LOG10", "LOGS", "LOGSIZE", "LONG", "LONGBLOB", "LONGTEXT", "LONGVARBINARY", "LONGVARCHAR", "LOOP", "LOW_PRIORITY", "LOWER", "LTRIM", "MASTER_SERVER_ID", "MATCH", "MAX", "MEDIUMBLOB", "MEDIUMINT", "MEDIUMTEXT", "MEMO", "MEMORY", "MIDDLEINT", "MIN", "MINUTE", "MINUTE_MICROSECOND", "MINUTE_SECOND", "MOD", "MONTH", "MONTHNAME", "NATURAL", "NO_WRITE_TO_BINLOG", "NOT", "NOTE", "NOW", "NOWAIT", "NULL", "NUMERIC", "OBJECT", "OLEOBJECT", "ON", "OPTIMIZE", "OPTION", "OPTIONALLY", "OR", "ORDER", "OTHER", "OUT", "OUTER", "OUTFILE", "OWNERACCESS", "PARAMETERS", "PASSWORD", "PI", "PIVOT", "POWER", "PRECISION", "PRIMARY", "PRIVILEGES", "PROCEDURE", "PROCESSLIST", "PUBLIC", "PURGE", "QUARTER", "QUEUE", "RADIANS", "RAND", "READ", "READONLY", "REAL", "REFERENCES", "REFERENTIAL_INTEGRITY", "REFINT", "REGEXP", "RENAME", "REPAIR", "REPEAT", "REPLACE", "REQUIRE", "RESET", "RESTORE", "RESTRICT", "RETURN", "REVOKE", "RIGHT", "RLIKE", "ROLLBACK", "ROUND", "ROUNDMAGIC", "ROW", "RTRIM", "SCRIPT", "SECOND", "SECOND_MICROSECOND", "SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SHUTDOWN", "SIGN", "SIN", "SLAVE", "SMALLINT", "SOME", "SONAME", "SOUNDEX", "SPACE", "SPATIAL", "SPECIFIC", "SQL", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", "SQL_SMALL_RESULT", "SQL_TSI_DAY", "SQL_TSI_FRAC_SECOND", "SQL_TSI_HOUR", "SQL_TSI_MINUTE", "SQL_TSI_MONTH", "SQL_TSI_QUARTER", "SQL_TSI_SECOND", "SQL_TSI_WEEK", "SQL_TSI_YEAR", "SQLEXCEPTION", "SQLSTATE", "SQLWARNING", "SQRT", "SSL", "START", "STARTING", "STATUS", "STDEV", "STDEVP", "STOP", "STRAIGHT_JOIN", "STRIPED", "SUBSTRING", "SUM", "TABLE", "TABLEID", "TABLES", "TAN", "TEMP", "TERMINATED", "THEN", "TIME", "TIMESTAMP", "TIMESTAMPADD", "TIMESTAMPDIFF", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", "TOP", "TRAILING", "TRANSFORM", "TRIGGER", "TRUE", "TRUNCATE", "TS", "UCASE", "UNDO", "UNION", "UNIQUE", "UNLOCK", "UNSIGNED", "UPDATE", "UPPER", "UPPERCASE", "URL", "USAGE", "USE", "USER", "USER_RESOURCES", "USING", "UTC_DATE", "UTC_TIME", "UTC_TIMESTAMP", "VALUE", "VALUES", "VAR", "VARBINARY", "VARCHAR", "VARCHAR_IGNORECASE", "VARCHARACTER", "VARIABLES", "VARP", "VARYING", "WARNINGS", "WEEK", "WHEN", "WHERE", "WHILE", "WITH", "WORK", "WRITE", "WRITE_DELAY", "XOR", "YEAR", "YEAR_MONTH", "ZEROFILL" ) ); /** * accumulates any of the various types of comment */ private static final StringBuilder accumulated = new StringBuilder( 80 ); /** * how far we are through parsing the program */ private static int charIndex; /** * how the next next character will be treated, usually consumed and stuffed in a buffer, or forwarded to the next * state to deal with. */ private static HowToProcess how; /** * count of how many new lines encountered, shared by several states */ private static int nlCount; /** * the program or program fragment we are parsing */ private static String program; /** * the length of the program fragment we are parsing. */ private static int size; /** * count of how many spaces encountered. */ private static int spaceCount; /** * add a token to the end of the list to be rendered. * * @param t a token. Useless tokens will be not be added. */ private static void addToken( Token t ) { // check out token for validity if ( !t.isUseless() ) { tokens.add( t ); } } /** * calculate depth of {} () and [] */ private static void calcNestingDepths() { int braceDepth = 0; int parenDepth = 0; int bracketDepth = 0; for ( Token t : tokens ) { if ( !( t instanceof Fence ) ) { continue; } Fence b = ( Fence ) t; char c = b.getChar(); switch ( c ) { case '{': b.setNestingDepth( ++braceDepth ); break; case '}': b.setNestingDepth( braceDepth-- ); break; case '(': b.setNestingDepth( ++parenDepth ); break; case ')': b.setNestingDepth( parenDepth-- ); break; case '[': b.setNestingDepth( ++bracketDepth ); break; case ']': b.setNestingDepth( bracketDepth-- ); break; default: throw new IllegalStateException( "SQLTokenizer.calcNestingDepths: invalid fence character" ); } } // end for } // end calcNestingDepths /** * Replace junk chars with something that won't cause trouble * * @param category category of this char * @param c the char * * @return c if char is clean, a replacement if it were dirty e.g. tab */ private static char clean( SQLCharCategory category, char c ) { if ( category == SQLCharCategory.SPACE ) { return ' '; } else { return c; } } /** * crunch multiple tokens into a single token where feasible. */ private static void crunch() { int size; do { size = tokens.size(); for ( int i = size - 1; i >= 1; i-- ) { Token current = tokens.get( i ); Token prev = tokens.get( i - 1 ); if ( current instanceof NL && prev instanceof Space ) { // trim trailing blanks on line tokens.remove( i - 1 ); // don't i--. same NL token will be repeatedly compared // against prev. } else if ( current instanceof Space ) { Space sp = ( Space ) ( current ); if ( sp.length() <= 10 && prev.isCollapsible() ) { // combine space into previous token prev.setText( prev.getText() + sp.getText() ); tokens.remove( i ); // don't i--, combined token will be compared against // its predecessor } } else if ( prev.isCollapsible( current ) ) { // combine two tokens into one prev.setText( prev.getText() + current.getText() ); tokens.remove( i ); // don't i--. combined token will be compared with its // predecessor } } // end for // keep going while it is still finding something to crunch } while ( tokens.size() < size ); } /** * debugging dump system state * * @param theChar char we are processing * @param category category of the char * @param first did we just enter this state * @param oldState old state * @param state current state * @param newState next state * @param how do we consume, forward or discard this character. */ private static void dumpState( char theChar, SQLCharCategory category, boolean first, SQLState oldState, SQLState state, SQLState newState, HowToProcess how ) { if ( how == HowToProcess.CONSUME ) { /* use slightly abbreviated form, black */ out.println( theChar + " " + category + " " + first + " o:" + oldState + " s:" + state + " n:" + newState ); } else {/* in red */ err.println( theChar + " " + category + " " + first + " o:" + oldState + " s:" + state + " n:" + newState + " " + how ); } } /** * Make sure nothing there is left accumulated in buffers from parsing not yet converted to tokens */ private static void ensureNoLeftovers() { assert accumulated.length() == 0 : " residual : " + accumulated.toString(); accumulated.setLength( 0 ); } /** * Is a given string isComing up in the stream starting with this character. Compares ignoring case. * * @param expected string to TEST if isComing in the stream * * @return true if this string isComing up, case-insensitive */ private static boolean isComing( String expected ) { // check if there are enough characters left for a match. return charIndex + expected.length() < size && program.substring( charIndex, charIndex + expected.length() ) .equalsIgnoreCase( expected ); } /** * clear out the state machine ready to parse a new program */ private static void reset() { nlCount = 0; spaceCount = 0; accumulated.setLength( 0 ); tokens.clear(); how = null; charIndex = 0; // keeping certain variables local or global is crucial. // Don't mess with them without thinking carefully and // changing the docs: // local: category, first, oldState, state, theChar // global: charIndex, how // we make these local to discourage accidental snooping or // accidentally picking up the static version instead of the parm. } /** * get rid of leading and trailing NLs tokens. It is easier to handle it later than during parsing. */ private static void trimNLs() { // remove leading NLs. while ( tokens.size() > 0 && ( tokens.get( 0 ) instanceof NL ) ) { tokens.remove( 0 ); } // remove trailing NLs int count; while ( ( count = tokens.size() ) > 0 && ( tokens.get( count - 1 ) instanceof NL ) ) { tokens.remove( count - 1 ); } // we don't need an NL at either beginning or end // insert at the beginning tokens.add( 0, new Start( "
" ) );
        // add to end
        addToken( new Stop( "
" ) ); } // end trimNLs /** * Default Consume one character. It has been predecided that you can and will consume it. * * @param c char to consume */ abstract void consume( char c ); /** * default what to do on leaving state, after last char is consumed. */ abstract void leaving(); // table of important SQL_KEYWORDS lives in Keywords class. /** * default next method determines the next state based on current state, and next char * * @param category class of next character * @param nextChar next character to process * @param first true if this is the first character after we entered this state. * * @return next SQLState * D o n o t m a k e p r i v a t e ! ! ! */ @SuppressWarnings( { "WeakerAccess" } ) SQLState next( SQLCharCategory category, char nextChar, boolean first ) { /* default way to recognise next state */ assert how == HowToProcess.FORWARD : "default next used without forwarding"; switch ( category ) { case PUNCTUATION: if ( isComing( "--" ) ) { // treat -- comment just like // comment return IN_REMSLASHSLASH; } else { return IN_OPERATOR; } case EOL: return AT_END_OF_LINE; case FENCE: return IN_FENCE; case OTHER: case PLAIN: return IN_NAME; case SLASH: if ( isComing( "//" ) ) { return IN_REMSLASHSLASH; } else if ( isComing( "/*" ) ) { return IN_REMSLASHSTAR;// includes /** } else { return IN_OPERATOR; } case STAR: return IN_OPERATOR; case TICK: return IN_TICKS; case SPACE: return IN_WHITESPACE; case IGNORE:// should never get this far default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch } /** * Parse program and leave a list of Tokens in tokens ArrayList. * * @param program the text we are going parse and eventually render. Its loop calls next on oldstate, then * optionally consume on the old state, State whose next made the consume decision consumes it own * char. then optionally leaving on the old state * * @return an array of tokens representing the text and how it will be rendered. */ @SuppressWarnings( { "UnusedAssignment" } ) public static Token[] parse( String program ) { reset(); SQLState.program = program; size = program.length(); /* * keeping certain variables local or global is crucial. Don't mess with * them without thinking carefully and changing the docs: LOCAL: * category, first, oldState, state, theChar. GLOBALl: charIndex, how we * make these local to discourage accidental snooping or accidentally * picking up the static version instead of the parm. */ // where we were SQLState oldState = AT_END_OF_LINE; // where we are SQLState state = AT_END_OF_LINE; // were we will be next SQLState newState; // how is global however, so next can return both a state and // how. how = null; // Note, NO int charIndex !! Don't "repair that". // charIndex is a static variable globally known so "isComing" can // use // it. for ( charIndex = 0; charIndex < size; charIndex++ ) { // next char to process char theChar = program.charAt( charIndex ); // decide which general category the char falls in final SQLCharCategory category = SQLCharCategory .categorise( theChar ); theChar = clean( category, theChar ); if ( category != SQLCharCategory.IGNORE ) { /* * keep going till some state consumes/discards the character. * Allow up to three forwarding attempts to deal with the * character. Usually we should succeed on the first or second * attempt. We always make at least on trip through */ attempts: for ( int times = 0; times < 3; times++ ) { // first is deliberately local final boolean first = state != oldState; /* * crank the state machine one cycle, State should modify * how in addition to returning the new state. A little ugly * but simplest way to return a pair of values: state and * how. */ how = null; /* * setting to null ensures not setting it will be caught. */ /* * This is the guts of the finite state automaton decide the * next state */ // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv newState = state.next( category, theChar, first ); // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ if ( DEBUGGING ) { dumpState( theChar, category, first, oldState, state, newState, how ); } // kick over to the next generation, // we are now in the newState. We make the // transition // here. oldState = state; state = newState; newState = null; if ( how == null ) { throw new NullPointerException( "SQLState bug: how not set. OldState=" + oldState + " " + "newState=" + state + " next() must not be private." ); } switch ( how ) { case CONSUME: oldState.consume( theChar ); if ( state != oldState ) { oldState.leaving(); } // and on to the next char break attempts; case DISCARD: if ( state != oldState ) { oldState.leaving(); } // and on to the next char break attempts; default: // should never get here assert false : "SQLTokenizer state machine failed to set how variable."; break attempts; case FORWARD: assert state != oldState : "SQLTokenizer state machine attempted to forward a char to the same state."; oldState.leaving(); /* * we give that character another try with the new * state */ } // end switch /* * we will only ever get here if we are forwarding. The * other case leave the loop early. */ } // end attempts loop // we fall out the bottom and land here no matter what assert how == HowToProcess.CONSUME || how == HowToProcess .DISCARD : "SQLTokenizer state machine failed to consume char in three state forwarding attempts."; } // end if ignore } // end for each character if ( state == oldState ) { // force a final wrapup -- e.g. slash star comment without // terminator oldState.leaving(); } /* * make sure nothing still sitting in accumulation buffer after we have * finished parsing the entire program. */ ensureNoLeftovers(); crunch(); // tidy up the list of tokens. trimNLs(); calcNestingDepths(); // covert to vanilla array for even more efficient use in the // final // Applet. return tokens.toArray( new Token[ tokens.size() ] ); } // end parse// end default next } // end SQLState