/* * [PropState.java] * * Summary: properties file parser, finite state automaton. * * Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 3.1 2009-04-12 shorter style names, improved highlighting. */ package com.mindprod.jprep; import com.mindprod.common18.ST; import com.mindprod.jtokens.Gibberish; import com.mindprod.jtokens.NL; import com.mindprod.jtokens.Operator; import com.mindprod.jtokens.Start; import com.mindprod.jtokens.Stop; import com.mindprod.jtokens.Token; import com.mindprod.jtokens.Value; import com.mindprod.jtokens.prop.PropComment; import com.mindprod.jtokens.prop.PropKey; import com.mindprod.jtokens.prop.UnicodeLiteral; import java.util.ArrayList; import static java.lang.System.*; /** * properties file parser, finite state automaton. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2009-04-12 shorter style names, improved highlighting. * @since 2005-12-22 */ @SuppressWarnings( { "NestedAssignment", "UnnecessaryContinue", "EnumeratedConstantNamingConvention" } ) public enum PropState { /** * somebody noticed an EOL and forwarded it to us. The EOL will be the first character we consume. */ @SuppressWarnings( { "WeakerAccess" } ) AT_END_OF_LINE { /** * Consume one character. It has been predecided that you can * and will consume it. * @param c char to consume */ void consume( char c ) { nlCount++; } /** * what to do on leaving state, after last char is consumed. */ void leaving() { // emit pending nls if ( nlCount > 3 ) { nlCount = 3; } if ( nlCount > 0 ) { addToken( new NL( nlCount ) ); nlCount = 0; } } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @return next state to go in */ PropState next( PropCharCategory category, char nextChar ) { switch ( category ) { case BACKSLASH: case EQUALS: case ORDINARY: case OTHER: how = HowToProcess.FORWARD; return IN_KEYWORD; case SHARP: how = HowToProcess.FORWARD; return IN_COMMENT; case EOL: how = HowToProcess.CONSUME; return AT_END_OF_LINE; case IGNORE: default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch } // end next }, // end AT_END_OF_LINE /** * in comment # ... */ @SuppressWarnings( { "WeakerAccess" } ) IN_COMMENT { /** * Consume one character. It has been predecided that you can * and will consume it. * @param c char to consume */ void consume( char c ) { // includes lead # accumulatedComment.append( c ); } /** * what to do on leaving state, after last char is consumed. */ void leaving() { String comment = accumulatedComment.toString(); accumulatedComment.setLength( 0 ); if ( comment.length() != 0 ) { addToken( new PropComment( comment ) ); } } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @return next state to go in */ PropState next( PropCharCategory category, char nextChar ) { switch ( category ) { case BACKSLASH: if ( isUnicodeEscapeComing() ) { leaving(); processUnicodeEscape(); how = HowToProcess.DISCARD; return IN_COMMENT; } else { how = HowToProcess.CONSUME; return IN_COMMENT; } case OTHER: leaving(); addToken( new Gibberish( nextChar ) ); how = HowToProcess.DISCARD; return IN_COMMENT; case EQUALS: case ORDINARY: case SHARP: // Just treat as an ordinary char inside a comment. // stay in comment how = HowToProcess.CONSUME; return IN_COMMENT; case EOL: how = HowToProcess.FORWARD; return AT_END_OF_LINE; case IGNORE: default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch } // end next }, // end IN_KEYWORD /** * keyword=value */ @SuppressWarnings( { "WeakerAccess" } ) IN_KEYWORD { /** * Consume one character. It has been predecided that you can * and will consume it. * @param c char to consume */ void consume( char c ) { accumulatedKeyword.append( c ); } /** * what to do on leaving state, after last char is consumed. */ void leaving() { String name = accumulatedKeyword.toString(); accumulatedKeyword.setLength( 0 ); if ( name.length() > 0 ) { addToken( new PropKey( name ) ); } // end if } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @return next state to go in */ PropState next( PropCharCategory category, char nextChar ) { switch ( category ) { case ORDINARY: how = HowToProcess.CONSUME; return IN_KEYWORD; case BACKSLASH: if ( isUnicodeEscapeComing() ) { leaving(); processUnicodeEscape(); how = HowToProcess.DISCARD; return IN_KEYWORD; } else { how = HowToProcess.CONSUME; return IN_KEYWORD; } case EQUALS: leaving(); addToken( new Operator( '=' ) ); how = HowToProcess.DISCARD; return IN_VALUE; case OTHER: leaving(); addToken( new Gibberish( nextChar ) ); how = HowToProcess.DISCARD; return IN_KEYWORD; case SHARP: how = HowToProcess.FORWARD; return IN_COMMENT; case EOL: how = HowToProcess.FORWARD; return AT_END_OF_LINE; case IGNORE: default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch } // end next // end IN_KEYWORD }, /** * keyword=value */ @SuppressWarnings( { "WeakerAccess" } ) IN_VALUE { /** * Consume one character. It has been predecided that you can * and will consume it. * @param c char to consume */ void consume( char c ) { accumulatedValue.append( c ); } /** * what to do on leaving state, after last char is consumed. */ void leaving() { String name = accumulatedValue.toString(); accumulatedValue.setLength( 0 ); if ( name.length() > 0 ) { addToken( new Value( name ) ); } // end if } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @return next state to go in */ PropState next( PropCharCategory category, char nextChar ) { switch ( category ) { case ORDINARY: case EQUALS:// second equals is just ordinary char how = HowToProcess.CONSUME; return IN_VALUE; case BACKSLASH: if ( isUnicodeEscapeComing() ) { leaving(); processUnicodeEscape(); how = HowToProcess.DISCARD; return IN_VALUE; } else { how = HowToProcess.CONSUME; return IN_VALUE; } case OTHER: leaving(); addToken( new Gibberish( nextChar ) ); how = HowToProcess.DISCARD; return IN_VALUE; case SHARP: how = HowToProcess.FORWARD; return IN_COMMENT; case EOL: how = HowToProcess.FORWARD; return AT_END_OF_LINE; case IGNORE: default: assert false : "bad state " + category + " " + nextChar; return null; } // end switch } // end next // end IN_VALUE }; /** * true if want voluminous debugging output */ private static final boolean DEBUGGING = false; /** * list of tokens we have parsed out. */ private static final ArrayList tokens = new ArrayList<>( 10000 ); // statics are shared common to all enum constants. // others are a separate field in each enum constant. /** * used to accumulate comments */ private static final StringBuilder accumulatedComment = new StringBuilder( 100 ); /** * accumulates left hand side of equal sign */ private static final StringBuilder accumulatedKeyword = new StringBuilder( 80 ); /** * accumulates right hand side of equal sign */ private static final StringBuilder accumulatedValue = new StringBuilder( 100 ); /** * how far we are through parsing the program */ private static int charIndex; /** * how we plan to process this character, consume, procrastinate to next state, or discard */ private static HowToProcess how; /** * count of how many new lines encountered, shared by several states */ private static int nlCount; /** * the program or program fragment we are parsing */ private static String program; /** * the length of the program fragment we are parsing. */ private static int size; /** * add a token to the end of the list to be rendered. * * @param t a token. Useless tokens will be not be added. */ private static void addToken( Token t ) { // check out token for validity, if pointless, don't bother adding it. if ( !t.isUseless() ) { tokens.add( t ); } } /** * crunch multiple tokens into a single token where feasible. */ private static void crunch() { int size; do { size = tokens.size(); for ( int i = size - 1; i >= 1; i-- ) { // this version does not deal with Space tokens. Token current = tokens.get( i ); Token prev = tokens.get( i - 1 ); if ( prev.isCollapsible( current ) ) { // combine two tokens into one prev.setText( prev.getText() + current.getText() ); tokens.remove( i ); // don't i--. combined token will be compared with its // predecessor } } // end for // keep going while it is still finding something to crunch } while ( tokens.size() < size ); } /** * debugging dump system state * * @param theChar char we are processing * @param category category of the char * @param first did we just enter this state * @param oldState old state * @param state current state * @param newState next state * @param how do we consume, forward or discard this character. */ private static void dumpState( char theChar, PropCharCategory category, boolean first, PropState oldState, PropState state, PropState newState, HowToProcess how ) { if ( how == HowToProcess.CONSUME ) { /* use slightly abbreviated form, black */ out.println( theChar + " " + category + " " + first + " o:" + oldState + " s:" + state + " n:" + newState ); } else { /* in red */ err.println( theChar + " " + category + " " + first + " o:" + oldState + " s:" + state + " n:" + newState + " " + how ); } } /** * Make sure nothing there is left accumulated in buffers from parsing not yet converted to tokens */ private static void ensureNoLeftovers() { assert accumulatedComment.length() == 0 : "comment residual : " + accumulatedComment.toString(); accumulatedComment.setLength( 0 ); assert accumulatedKeyword.length() == 0 : "keyword residual : " + accumulatedKeyword.toString(); accumulatedKeyword.setLength( 0 ); assert accumulatedValue.length() == 0 : "value residual : " + accumulatedValue.toString(); accumulatedValue.setLength( 0 ); } /** * Is there /uxxxx coming? * * @return true if valid hex unicode char in stream */ private static boolean isUnicodeEscapeComing() { if ( charIndex + 6 >= size ) { return false; } String suxxxx = program.substring( charIndex, charIndex + 6 ); // we know first char is backslash already. return suxxxx.charAt( 1 ) == 'u' && ST.isLegal( suxxxx.substring( 2, 6 ), "0123456789abcdefABCDEF" ); } /** * create token for /uxxxx Unicode Escape in stream. */ private static void processUnicodeEscape() { // we know the Unicode Escape is well formed addToken( new UnicodeLiteral( program.substring( charIndex, charIndex + 6 ) ) ); // we have just processed 5 more chars than usual with lookahead. // should refactor this with code such as used for CDATA charIndex += 5; } /** * clear out the state machine ready to parse a new program */ private static void reset() { nlCount = 0; accumulatedComment.setLength( 0 ); accumulatedKeyword.setLength( 0 ); accumulatedValue.setLength( 0 ); tokens.clear(); how = null; charIndex = 0; // keeping certain variables local or global is crucial. // Don't mess with them without thinking carefully and // changing the docs: // local: category, first, oldState, state, theChar // global: charIndex, how // we make these local to discourage accidental snooping or // accidentally picking up the static version instead of the parm. } /** * get rid of leading and trailing NLs tokens. It is easier to handle it later than during parsing. */ private static void trimNLs() { // remove leading NLs. while ( tokens.size() > 0 && ( tokens.get( 0 ) instanceof NL ) ) { tokens.remove( 0 ); } // remove trailing NLs int count; while ( ( count = tokens.size() ) > 0 && ( tokens.get( count - 1 ) instanceof NL ) ) { tokens.remove( count - 1 ); } // We don't need an NL at either beginning or end. // insert at the beginning tokens.add( 0, new Start( "
" ) );
        // add to end
        addToken( new Stop( "
" ) ); } /** * Default Consume one character. It has been predecided that you can and will consume it. * * @param c char to consume */ abstract void consume( char c ); /** * default what to do on leaving state, after last char is consumed. */ abstract void leaving(); /** * default next method determines the next state based on current state, and next char * * @param category class of next character * @param nextChar next character to process * * @return next PropCharCategoryState */ abstract PropState next( PropCharCategory category, char nextChar ); /** * Parse program and leave a list of Tokens in tokens ArrayList. * * @param program the text we are going parse and eventually render. * * @return an array of tokens representing the text and how it will be rendered. */ @SuppressWarnings( { "UnusedAssignment" } ) public static Token[] parse( String program ) { reset(); PropState.program = program; size = program.length(); // where we were PropState oldState = AT_END_OF_LINE; // where we are PropState state = AT_END_OF_LINE; // were we will be next PropState newState; // how is global however, so next can return both a state and how. how = null; // Note, NO int charIndex !! Don't "repair that". // charIndex is a static variable globally known so "coming" can use it. for ( charIndex = 0; charIndex < size; charIndex++ ) { // next char to process char theChar = program.charAt( charIndex ); // decide which general category the char falls in final PropCharCategory category = PropCharCategory .categorise( theChar ); if ( category != PropCharCategory.IGNORE ) { /* * keep going till some state consumes/discards the character. * Allow up to three forwarding attempts to deal with the * character. Usually we should succeed on the first or second * attempt. We always make at least on trip through */ attempts: for ( int times = 0; times < 3; times++ ) { // first is deliberately local final boolean first = state != oldState; /* * crank the state machine one cycle, State should modify * how in addition to returning the new state. A little ugly * but simplest way to return a pair of values: state and * how */ how = null;// setting to null ensures not setting it will // be caught. /* * This is the guts of the finite state automaton decide the * next state */ // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv newState = state.next( category, theChar ); // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ if ( DEBUGGING ) { dumpState( theChar, category, first, oldState, state, newState, how ); } // kick over to the next generation, // we are now in the newState. We make the transition here. oldState = state; state = newState; newState = null; if ( how == null ) { throw new NullPointerException( "PropState bug: how not set. OldState=" + oldState + " " + "newState=" + state + " next() must not be private." ); } switch ( how ) { case CONSUME: oldState.consume( theChar ); if ( state != oldState ) { oldState.leaving(); } // and on to the next char break attempts; case DISCARD: if ( state != oldState ) { oldState.leaving(); } // and on to the next char break attempts; default: // should never get here assert false : "PropTokenizer state machine failed to set how variable."; break attempts; case FORWARD: assert state != oldState : "PropTokenizer state machine attempted to forward a char to the same state."; oldState.leaving(); // we give that character another try with the new // state } // end switch /* * we will only ever get here if we are forwarding. The * other cases leave the loop early. */ } // end attempts loop // we fall out the bottom and land here no matter what assert how == HowToProcess.CONSUME || how == HowToProcess .DISCARD : "PropTokenizer state machine failed to consume char in three state forwarding attempts."; } // end if ignore } // end for each character // we must leave the last state if we have not already: if ( state == oldState ) { oldState.leaving(); } // make sure nothing still sitting in accumulation buffer after we have // finished parsing the entire program. ensureNoLeftovers(); // collapse tokens into fewer if possible crunch(); trimNLs(); // covert to vanilla array for even more efficient use in the final // Applet. return tokens.toArray( new Token[ tokens.size() ] ); } // end parse ; } // end PropState