/* * [HTMLState.java] * * Summary: State machine for HTMLTokenizer to parse HTML for colourising. * * Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 3.1 2009-04-12 shorter style names, improved highlighting. * 3.2 2010-02-08 handle attributes and CDATA with special attributes. * add markHandled method. */ /** * State machine for HTMLTokenizer to parse HTML for colourising. * * @author Roedy Green, Canadian Mind Products * @version 3.2 2010-02-08 handle attributes and CDATA with special attributes. * add markHandled method. mark start and end of comments specially. * Use simpler look-ahead logic with mark handled. * @since 2004-05-15 */ package com.mindprod.jprep; import com.mindprod.common18.ST; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.jtokens.Gibberish; import com.mindprod.jtokens.NL; import com.mindprod.jtokens.Operator; import com.mindprod.jtokens.Space; import com.mindprod.jtokens.Start; import com.mindprod.jtokens.Stop; import com.mindprod.jtokens.Token; import com.mindprod.jtokens.html.CDATAContents; import com.mindprod.jtokens.html.CDATATag; import com.mindprod.jtokens.html.HTMLAttribute; import com.mindprod.jtokens.html.HTMLCommentContents; import com.mindprod.jtokens.html.HTMLCommentTag; import com.mindprod.jtokens.html.HTMLEntity; import com.mindprod.jtokens.html.HTMLStringLiteral; import com.mindprod.jtokens.html.HTMLTag; import com.mindprod.jtokens.html.HTMLText; import java.util.ArrayList; import static java.lang.System.*; /** * State machine for HTMLTokenizer to parse HTML for colourising. *

* This parser is complicated by the nesting of HTML. You can have a tag inside body text, a string inside a tag and an entity inside a string. You can also have entities inside body text. Comments may be embedded in body text or tags, but not strings or entities. So, for example, an entity state must know what state to return to when the entity is over.
NL is not as significant as it is in other languages. You can have a NL in the middle of a tag or a string, but not an entity. The tag and string tokens don't have embedded NLs, though they do during accumulation.
String tokens may be missing lead or trail quotes. We don't have END_OF_LINE or WHITESPACE states. We split out SPACE and NL tokens later.
The notion of kosher = false means just that one char is not kosher but we let it slide and don't necessarily change state. However, we split that naughty character off into a Gibberish token by itself for special highlighting.
The notion of balanced is similar. In that case the entire unbalanced entity, string etc. is put into a
 * Gibberish token.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 3.2 2010-02-08 handle attributes and CDATA with special attributes.
 * add markHandled method. mark start and end of comments specially.
 * Use simpler look-ahead logic with mark handled.
 * @see com.mindprod.compactor.HTMLState
 * @see com.mindprod.htmlreflow.HTMLState
 * @since 2004-07-17
 */
@SuppressWarnings( { "NestedAssignment", "UnnecessaryContinue", "EnumeratedConstantNamingConvention" } )
public enum HTMLState {
    /**
     * in an entity &...; inside quotes.
     * Every exit from this state returns to IN_QUOTES; the lead & and the trailing ; are
     * not accumulated, only the characters between them.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    IN_QUOTE_ENTITY {
        /**
         * true when the entity was closed by a semicolon; false when it was cut short by
         * some character that may not appear inside an entity. Read by leaving().
         */
        private boolean balanced;

        /**
         * Consume one character. It has been predecided that you can
         * and will consume it. Here that just means appending it to the
         * accumulated buffer (declared outside this part of the file).
         *
         * @param c char to consume
         */
        void consume( char c ) {
            accumulated.append( c );
        }

        /**
         * what to do on leaving state, after last char is consumed.
         * Empties the accumulated buffer and adds either an HTMLEntity token
         * (balanced) or a Gibberish token holding "&" plus the accumulated text
         * (unbalanced) to tokens.
         */
        void leaving() {
            String entity = accumulated.toString();
            accumulated.setLength( 0 );
            // no embedded spaces or nls to deal with.
            if ( balanced ) {
                // does not have lead & or trailing ;
                // Known includes hex and numeric forms.
                // case-sensitive
                // a non-zero result from the lookup is treated as "known entity".
                boolean known = DeEntifyStrings.bareHTMLEntityToChar( entity, ' ' ) != 0;
                // add even if chars 0 long. Could erroneously have written &;
                tokens.add( new HTMLEntity( entity /* &; stripped */, known ) );
            } else {
                // screwed up entity that was not really one.
                // should treat it really part of a quotation. It is an error..
                // the lead & was discarded on entry, so it is put back here.
                tokens.add( new Gibberish( "&" + entity ) );
                // note that we have handled this anomaly
                balanced = true;
            }
        } // end leaving

        /**
         * Figure out what state we should go in after we process this
         * character and whether we can consume it. The decision about the
         * character itself is recorded in how (CONSUME, DISCARD or FORWARD);
         * FORWARD is used when this state cannot take the character and,
         * presumably, processing of it is postponed to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         * @return next state to go in; null only on the "cannot happen" path after the
         * assertion, i.e. when assertions are disabled.
         */
        HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) {
            switch ( category ) {
                case AMP:
                    // & either opens this entity (first char) or is a stray second &.
                    if ( first ) {
                        // not balanced until we see the closing ;
                        balanced = false;
                        // discard lead &
                        how = HowToProcess.DISCARD;
                        return IN_QUOTE_ENTITY;
                    } else {
                        // malformed entity
                        balanced = false;
                        how = HowToProcess.FORWARD;
                        return IN_QUOTES;
                    }
                case DASH:
                case END_TAG:
                case EOL:
                case OTHER:
                case QUOTE:
                case SPACE:
                case START_TAG:
                    // oops. Should not see these inside an entity.
                    balanced = false;
                    how = HowToProcess.FORWARD;
                    return IN_QUOTES;
                case EQUALS:
                case PLAIN:
                case RIGHT_BRACKET:
                    // only chars listed in SAFEENTITYCHARS may continue the entity.
                    if ( SAFEENTITYCHARS.indexOf( nextChar ) >= 0 ) {
                        how = HowToProcess.CONSUME;
                        return IN_QUOTE_ENTITY;
                    } else {
                        balanced = false;
                        how = HowToProcess.FORWARD;
                        return IN_QUOTES;
                    }
                case SEMICOLON:
                    // properly terminated entity; the ; itself is not accumulated.
                    balanced = true;
                    how = HowToProcess.DISCARD;
                    return IN_QUOTES;
                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
        } // end next
        // end IN_QUOTE_ENTITY
    },
    /**
     * inside " ... " Use entities inside quotes to embed quote. Quotes only exist inside tags. We don't allow
     * quotes to span an EOL.
     * NOTE(review): next() below consumes EOL as part of the quote ("split it out later"),
     * which appears to contradict the sentence above -- confirm which is intended.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    IN_QUOTES {
        /**
         * Consume one character. It has been predecided that you can
         * and will consume it. Appends it to the accumulated buffer.
         *
         * @param c char to consume
         */
        void consume( char c ) {
            accumulated.append( c );
        }

        /**
         * what to do on leaving state, after last char is consumed.
         * Passes the accumulated text to addMultilineTokens with an HTMLStringLiteral
         * as the token argument, then empties the buffer.
         */
        void leaving() {
            // either lead or trail quotes could be missing.. May even have embedded
            // nl. or strings of spaces
            addMultilineTokens( new HTMLStringLiteral( "" ), accumulated.toString() );
            accumulated.setLength( 0 );
            // end leaving
        }

        /**
         * Figure out what state we should go in after we process this
         * character and whether we can consume it. The decision about the
         * character itself is recorded in how (CONSUME or FORWARD here);
         * FORWARD is used when this state cannot take the character and,
         * presumably, processing of it is postponed to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         * @return next state to go in; null only on the "cannot happen" path after the
         * assertion, i.e. when assertions are disabled.
         */
        HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) {
            switch ( category ) {
                case AMP:
                    // entity inside quotes
                    how = HowToProcess.FORWARD;
                    return IN_QUOTE_ENTITY;
                case DASH:
                case EOL:// treat as part of quote, split it out later
                case END_TAG:
                case EQUALS:
                case OTHER:
                case PLAIN:
                case RIGHT_BRACKET:
                case SEMICOLON:
                case SPACE:
                case START_TAG:
                    // everything else is ordinary string content.
                    how = HowToProcess.CONSUME;
                    return IN_QUOTES;
                case QUOTE:
                    // handle the first " differently
                    if ( first ) {
                        // opening quote: keep it and stay in the string.
                        how = HowToProcess.CONSUME;
                        return IN_QUOTES;
                    } else {
                        // last
                        // closing quote: keep it and go back to the tag.
                        how = HowToProcess.CONSUME;
                        return IN_TAG;
                    }
                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
            // end next
        }
        // end IN_QUOTES
    },
    /**
     * inside tag <xxx something="abc"... > inside attribute name, not value.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    IN_ATTRIBUTE {
        /**
         * a single char was detected that was erroneous. We mark it
         * with a special token, but otherwise carry on as if it were a
         * correct character. Set by next(), cleared by consume().
         */
        private boolean gibberish;

        /**
         * Consume one character. It has been predecided that you can
         * and will consume it. A char flagged as gibberish is emitted as its own
         * Gibberish token after flushing whatever was accumulated so far; any other
         * char is appended to the accumulated buffer.
         *
         * @param c char to consume
         */
        void consume( char c ) {
            if ( gibberish ) {
                // we must split off this single char into its own token, may be consolidated later.
                addMultilineTokens( new HTMLAttribute( "" ), accumulated.toString() );
                accumulated.setLength( 0 );
                addToken( new Gibberish( c ) );
                gibberish = false;
            } else {
                accumulated.append( c );
            }
        }

        /**
         * what to do on leaving state, after last char is consumed.
         * Passes the accumulated text to addMultilineTokens with an HTMLAttribute
         * as the token argument, then empties the buffer.
         */
        void leaving() {
            addMultilineTokens( new HTMLAttribute( "" ), accumulated.toString() );
            accumulated.setLength( 0 );
        } // end leaving

        /**
         * Figure out what state we should go in after we process this
         * character and whether we can consume it. The decision about the
         * character itself is recorded in how (CONSUME or FORWARD here);
         * FORWARD is used when this state cannot take the character and,
         * presumably, processing of it is postponed to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         * @return next state to go in; null only on the "cannot happen" path after the
         * assertion, i.e. when assertions are disabled.
         */
        HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) {
            switch ( category ) {
                case AMP:// entity inside tag, oops
                case OTHER:
                case QUOTE:
                case START_TAG:
                    // NOTE(review): this sets IN_ATTRIBUTE's own gibberish flag while the
                    // returned state is IN_TAG, which has a separate flag. Which state's
                    // consume() receives the char is decided by the caller, not visible here -- confirm.
                    gibberish = true;
                    how = HowToProcess.CONSUME;
                    return IN_TAG;
                case END_TAG:
                case EQUALS:
                case EOL:
                case SPACE:
                    // attribute is complete
                    how = HowToProcess.FORWARD;
                    return IN_TAG;
                case DASH:
                case PLAIN:
                case RIGHT_BRACKET:
                case SEMICOLON:
                    // another char in attribute name.
                    how = HowToProcess.CONSUME;
                    return IN_ATTRIBUTE;
                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
            // end next
        }
        // end IN_ATTRIBUTE
    },
    /**
     * inside tag <xxx something="abc"... > inside space inside a tag not inside quote
     */
    @SuppressWarnings( { "WeakerAccess" } )
    IN_TAG_SPACE {
        /**
         * a single char was detected that was erroneous. We mark it
         * with a special token, but otherwise carry on as if it were a
         * correct character. Set by next(), cleared by consume().
         */
        private boolean gibberish;

        /**
         * Consume one character. It has been predecided that you can
         * and will consume it. A char flagged as gibberish is emitted as its own
         * Gibberish token after flushing whatever was accumulated so far; any other
         * char is appended to the accumulated buffer.
         *
         * @param c char to consume
         */
        void consume( char c ) {
            if ( gibberish ) {
                // we must split off this single char into its own token, may be consolidated later.
                // NOTE(review): the pending text is flushed with an HTMLAttribute here, whereas
                // leaving() in this same state uses a Space -- possibly copied from IN_ATTRIBUTE; confirm.
                addMultilineTokens( new HTMLAttribute( "" ), accumulated.toString() );
                accumulated.setLength( 0 );
                addToken( new Gibberish( c ) );
                gibberish = false;
            } else {
                accumulated.append( c );
            }
        }

        /**
         * what to do on leaving state, after last char is consumed.
         * Passes the accumulated text to addMultilineTokens with a Space
         * as the token argument, then empties the buffer.
         */
        void leaving() {
            addMultilineTokens( new Space(), accumulated.toString() );
            accumulated.setLength( 0 );
        } // end leaving

        /**
         * Figure out what state we should go in after we process this
         * character and whether we can consume it. The decision about the
         * character itself is recorded in how (CONSUME or FORWARD here);
         * FORWARD is used when this state cannot take the character and,
         * presumably, processing of it is postponed to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         * @return next state to go in; null only on the "cannot happen" path after the
         * assertion, i.e. when assertions are disabled.
         */
        HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) {
            switch ( category ) {
                case AMP:
                case EQUALS:
                case OTHER:
                case QUOTE:
                case SEMICOLON:
                case START_TAG:
                    // not expected between attributes: flag it, but stay in this state.
                    gibberish = true;
                    how = HowToProcess.CONSUME;
                    return IN_TAG_SPACE;
                case END_TAG:
                    // the closing char of the tag is not taken here; IN_TAG is the next state.
                    how = HowToProcess.FORWARD;
                    return IN_TAG;
                case EOL:
                case SPACE:
                    // still is space between attributes
                    how = HowToProcess.CONSUME;
                    return IN_TAG_SPACE;
                case DASH:
                case PLAIN:
                case RIGHT_BRACKET:
                    // a char of an attribute name: not taken here, IN_ATTRIBUTE is the next state.
                    how = HowToProcess.FORWARD;
                    return IN_ATTRIBUTE;
                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
            // end next
        }
        // end IN_TAG_SPACE
    },
    /**
     * inside tag < ... > not inside tagged body text.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    IN_TAG {
        /**
         * a single char was detected that was erroneous. We mark it
         * with a special token, but otherwise carry on as if it were a
         * correct character.
         */
        private boolean gibberish;

        /**
         * Consume one character. It has been predecided that you can
         * and will consume it.
* @param c char to consume */ void consume( char c ) { if ( gibberish ) { // we must split off this single char into its own token addMultilineTokens( new HTMLTag( "" ), accumulated.toString() ); accumulated.setLength( 0 ); addToken( new Gibberish( c ) ); gibberish = false; } else { accumulated.append( c ); } } /** * what to do on leaving state, after last char is consumed. */ void leaving() { // each chunk of the is a separate token. // We have no mechanism to nest styles or tokens. addMultilineTokens( new HTMLTag( "" ), accumulated.toString() ); accumulated.setLength( 0 ); } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @param first true if we just entered this state. * @return next state to go in */ HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case AMP:// entity inside tag, oops case OTHER: case SEMICOLON: gibberish = true; how = HowToProcess.CONSUME; return IN_TAG; case START_TAG: if ( first ) { how = HowToProcess.CONSUME; return IN_TAG; } else { // starting a new tag without finishing the previous one gibberish = true; how = HowToProcess.CONSUME; return IN_TAG; } case END_TAG: how = HowToProcess.CONSUME; return IN_TEXT; case DASH: case PLAIN: case RIGHT_BRACKET: how = HowToProcess.CONSUME; return IN_TAG; case EQUALS: addMultilineTokens( new HTMLTag( "" ), accumulated.toString() ); accumulated.setLength( 0 ); addToken( new Operator( '=' ) ); how = HowToProcess.DISCARD; return IN_TAG; case SPACE: case EOL:// consume EOLs in side tags as part of tag, split them // out // later how = HowToProcess.FORWARD; return IN_TAG_SPACE; case QUOTE: how = HowToProcess.FORWARD; return IN_QUOTES; case IGNORE: default: assert false : 
"bad state " + category + " " + nextChar; return null; } // end switch // end next } // end IN_TAG }, /** * in comment  in text, inside tag, same as IN_TEXT_COMMENT but returns to IN_TAG */ @SuppressWarnings( { "WeakerAccess" } ) IN_TAG_COMMENT { /** * Consume one character. It has been predecided that you can * and will consume it. * @param c char to consume */ void consume( char c ) { accumulated.append( c ); } /** * what to do on leaving state, after last char is consumed. */ void leaving() { // token added includes the begin and end markers, with possibly // either end marker missing. addToken( new HTMLCommentTag( "" ) ); } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @param first true if we just entered this state. * @return next state to go in */ HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case AMP:// entities not magic in comments case EOL:// just accumulate them for now case END_TAG: // without lead --, just an ordinary char case EQUALS: case OTHER:// don't sweat high ascii in comments. case PLAIN: case QUOTE:// quotes are nothing special in comments case RIGHT_BRACKET: case SEMICOLON: case SPACE:// just accumulate them for now how = HowToProcess.CONSUME; return IN_TAG_COMMENT; case DASH: if ( isComing( "-->" ) ) { markHandled( 3 ); // don't accumulate how = HowToProcess.DISCARD; return IN_TAG; } else { // just an ordinary char how = HowToProcess.CONSUME; return IN_TAG_COMMENT; } case START_TAG: if ( first && isComing( "" ) ); } // end leaving /** * Figure out what state we should go in after we process this * character and whether we can consume it. 
We must set consume * = false if we cannot consume the character and have to * postpone processing to the next state. * @param category category of character * @param nextChar character in the stream we are processing * @param first true if we just entered this state. * @return next state to go in */ HTMLState next( HTMLCharCategory category, char nextChar, boolean first ) { switch ( category ) { case AMP:// entities not magic in comments case EOL:// just accumulate them for now case END_TAG: // without lead --, just an ordinary char case EQUALS: case OTHER:// don't sweat high ascii in comments. case PLAIN: case QUOTE:// quotes are nothing special in comments case RIGHT_BRACKET: case SEMICOLON: case SPACE:// just accumulate them for now how = HowToProcess.CONSUME; return IN_TEXT_COMMENT; case DASH: if ( isComing( "-->" ) ) { markHandled( 3 ); // don't accumulate how = HowToProcess.DISCARD; return IN_TEXT; } else { // just an ordinary char how = HowToProcess.CONSUME; return IN_TEXT_COMMENT; } case START_TAG: if ( first && isComing( "