/*
* [HTMLState.java]
*
* Summary: State machine for HTMLTokenizer to parse HTML for colourising.
*
* Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 3.1 2009-04-12 shorter style names, improved highlighting.
* 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method.
*/
/**
* State machine for HTMLTokenizer to parse HTML for colourising.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method. mark start and end of comments specially.
* Use simpler look-ahead logic with mark handled.
* @since 2004-05-15
*/
package com.mindprod.jprep;
import com.mindprod.common18.ST;
import com.mindprod.entities.DeEntifyStrings;
import com.mindprod.jtokens.Gibberish;
import com.mindprod.jtokens.NL;
import com.mindprod.jtokens.Operator;
import com.mindprod.jtokens.Space;
import com.mindprod.jtokens.Start;
import com.mindprod.jtokens.Stop;
import com.mindprod.jtokens.Token;
import com.mindprod.jtokens.html.CDATAContents;
import com.mindprod.jtokens.html.CDATATag;
import com.mindprod.jtokens.html.HTMLAttribute;
import com.mindprod.jtokens.html.HTMLCommentContents;
import com.mindprod.jtokens.html.HTMLCommentTag;
import com.mindprod.jtokens.html.HTMLEntity;
import com.mindprod.jtokens.html.HTMLStringLiteral;
import com.mindprod.jtokens.html.HTMLTag;
import com.mindprod.jtokens.html.HTMLText;
import java.util.ArrayList;
import static java.lang.System.*;
/**
* State machine for HTMLTokenizer to parse HTML for colourising.
*
* This parser is complicated by the nesting of HTML. You can have a tag
* inside body text, a string inside a tag and an entity inside a string. You can also have entities inside body text.
* Comments may be embedded in body text or tags, but not strings or entities. So, for example, an entity state must
* know what state to return to when the entity is over.
*
* NL is not as significant as it is in other languages. You
* can have a NL in the middle of a tag or a string, but not an entity. The tag and string tokens don't have embedded
* NLs, though they do during accumulation.
*
* String tokens may be missing lead or trail quotes. We don't have
* END_OF_LINE or WHITESPACE states. We split out SPACE and NL tokens later.
*
* The notion of kosher = false means just
* that one char is not kosher but we let it slide and don't necessarily change state. However, we split that naughty
* character off into a Gibberish token by itself for special highlighting.
*
* The notion of balanced is similar. In
* that case the entire unbalanced entity, string etc. is put into a Gibberish token.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method. mark start and end of comments specially.
* Use simpler look-ahead logic with mark handled.
* @see com.mindprod.compactor.HTMLState
* @see com.mindprod.htmlreflow.HTMLState
* @since 2004-07-17
*/
@SuppressWarnings( { "NestedAssignment", "UnnecessaryContinue", "EnumeratedConstantNamingConvention" } )
public enum HTMLState
{
/**
 * Inside an entity &...; that appears within a quoted string.
 * The entity text is accumulated without its lead & and trailing ; and is emitted
 * on leaving, either as an HTMLEntity (terminated by ;) or as Gibberish (malformed).
 */
@SuppressWarnings( { "WeakerAccess" } ) IN_QUOTE_ENTITY
{
    /**
     * Set true when the terminating ; is seen, set false on the lead & and whenever
     * a character that cannot belong to an entity shows up. leaving() sets it back
     * to true once a malformed entity has been reported.
     */
    private boolean balanced;

    /**
     * Accept one character of the entity body. The decision to consume it
     * was already made by next().
     *
     * @param c char to consume
     */
    void consume( char c )
    {
        accumulated.append( c );
    }

    /**
     * Emit whatever was accumulated, once the last char of the entity has been dealt with.
     */
    void leaving()
    {
        // entity text carries neither the lead & nor the trailing ;
        // and has no embedded spaces or newlines to worry about.
        final String bareEntity = accumulated.toString();
        accumulated.setLength( 0 );
        if ( !balanced )
        {
            // never was a real entity: report it, lead & restored, as an error
            // sitting inside the quotation.
            tokens.add( new Gibberish( "&" + bareEntity ) );
            // anomaly has now been dealt with
            balanced = true;
            return;
        }
        // Lookup is case-sensitive and also covers hex and numeric forms.
        // NOTE(review): the fallback char passed is ' ' yet the result is compared with 0;
        // confirm against DeEntifyStrings that an unknown entity really yields 0 here.
        final boolean recognised =
                DeEntifyStrings.bareHTMLEntityToChar( bareEntity, ' ' ) != 0;
        // added even when the text is empty, e.g. somebody wrote &;
        tokens.add( new HTMLEntity( bareEntity /* &; stripped */, recognised ) );
    } // end leaving

    /**
     * Decide which state follows this character and how the character itself is
     * to be handled (recorded in how): consumed here, discarded, or forwarded
     * untouched to the next state.
     *
     * @param category category of character
     * @param nextChar character in the stream we are processing
     * @param first    true if we just entered this state.
     *
     * @return next state to go in
     */
    HTMLState next( HTMLCharCategory category,
                    char nextChar,
                    boolean first )
    {
        switch ( category )
        {
            case SEMICOLON:
                // proper end of entity, the ; itself is not kept
                balanced = true;
                how = HowToProcess.DISCARD;
                return IN_QUOTES;

            case AMP:
                if ( !first )
                {
                    // a further & before any ; : malformed entity
                    return giveUpOnEntity();
                }
                // the & that opened the entity: discard it
                balanced = false;
                how = HowToProcess.DISCARD;
                return IN_QUOTE_ENTITY;

            case EQUALS:
            case PLAIN:
            case RIGHT_BRACKET:
                if ( SAFEENTITYCHARS.indexOf( nextChar ) < 0 )
                {
                    return giveUpOnEntity();
                }
                how = HowToProcess.CONSUME;
                return IN_QUOTE_ENTITY;

            case DASH:
            case END_TAG:
            case EOL:
            case OTHER:
            case QUOTE:
            case SPACE:
            case START_TAG:
                // oops. Should not see these inside an entity.
                return giveUpOnEntity();

            case IGNORE:
            default:
                assert false : "bad state " + category + " " + nextChar;
                return null;
        } // end switch
    } // end next

    /**
     * Common exit for a malformed entity: flag it unbalanced and hand the
     * offending character, unconsumed, back to the enclosing quotation.
     *
     * @return IN_QUOTES
     */
    private HTMLState giveUpOnEntity()
    {
        balanced = false;
        how = HowToProcess.FORWARD;
        return IN_QUOTES;
    }
    // end IN_QUOTE_ENTITY
},
/**
* inside " ... " Use entities inside quotes to embed quote. Quotes only exist inside tags. We don't allow
* quotes to span an EOL.
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_QUOTES
{
/**
* Consume one character. It has been predecided that you can
* and will consume it.
* @param c char to consume
*/
void consume( char c )
{
accumulated.append( c );
}
/**
* what to do on leaving state, after last char is consumed.
*/
void leaving()
{
// either lead or trail quotes could be missing.. May even have embedded
// nl. or strings of spaces
addMultilineTokens( new HTMLStringLiteral( "" ),
accumulated.toString() );
accumulated.setLength( 0 );
// end leaving
}
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:
// entity inside quotes
how = HowToProcess.FORWARD;
return IN_QUOTE_ENTITY;
case DASH:
case EOL:// treat as part of quote, split it out later
case END_TAG:
case EQUALS:
case OTHER:
case PLAIN:
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:
case START_TAG:
how = HowToProcess.CONSUME;
return IN_QUOTES;
case QUOTE:
// handle the first " differently
if ( first )
{
how = HowToProcess.CONSUME;
return IN_QUOTES;
}
else
{
// last
how = HowToProcess.CONSUME;
return IN_TAG;
}
case IGNORE:
default:
assert false :
"bad state "
+ category
+ " "
+ nextChar;
return null;
} // end switch
// end next
}
// end IN_QUOTES
},
/**
 * Inside tag <xxx something="abc"... > while reading an attribute name, not its value.
 */
@SuppressWarnings( { "WeakerAccess" } ) IN_ATTRIBUTE
{
    /**
     * Set by next() when the current single char is erroneous. consume() then
     * gives that char a special token of its own, and otherwise we carry on as
     * if it had been a correct character.
     */
    private boolean gibberish;

    /**
     * Accept one character. The decision to consume it was already made by next().
     *
     * @param c char to consume
     */
    void consume( char c )
    {
        if ( !gibberish )
        {
            accumulated.append( c );
            return;
        }
        // the bad char gets a token to itself (may be consolidated later),
        // so whatever came before it has to be emitted first.
        emitPendingAttributeText();
        addToken( new Gibberish( c ) );
        gibberish = false;
    }

    /**
     * Emit the accumulated attribute name once its last char has been consumed.
     */
    void leaving()
    {
        emitPendingAttributeText();
    } // end leaving

    /**
     * Decide which state follows this character and how the character itself is
     * to be handled (recorded in how): consumed here or forwarded untouched
     * to the next state.
     *
     * @param category category of character
     * @param nextChar character in the stream we are processing
     * @param first    true if we just entered this state.
     *
     * @return next state to go in
     */
    HTMLState next( HTMLCharCategory category,
                    char nextChar,
                    boolean first )
    {
        switch ( category )
        {
            case DASH:
            case PLAIN:
            case RIGHT_BRACKET:
            case SEMICOLON:
                // another char in attribute name.
                how = HowToProcess.CONSUME;
                return IN_ATTRIBUTE;

            case END_TAG:
            case EOL:
            case EQUALS:
            case SPACE:
                // attribute is complete, the tag state deals with this char
                how = HowToProcess.FORWARD;
                return IN_TAG;

            case AMP: // entity inside tag, oops
            case OTHER:
            case QUOTE:
            case START_TAG:
                // does not belong in an attribute name: flag it for consume()
                gibberish = true;
                how = HowToProcess.CONSUME;
                return IN_TAG;

            case IGNORE:
            default:
                assert false : "bad state " + category + " " + nextChar;
                return null;
        } // end switch
    } // end next

    /**
     * Hand the text accumulated so far over as HTMLAttribute token(s) and empty the buffer.
     */
    private void emitPendingAttributeText()
    {
        addMultilineTokens( new HTMLAttribute( "" ), accumulated.toString() );
        accumulated.setLength( 0 );
    }
    // end IN_ATTRIBUTE
},
/**
 * Inside tag <xxx something="abc"... > in the white space between parts of the tag,
 * not inside a quote.
 */
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG_SPACE
{
    /**
     * Set by next() when the current single char is erroneous. consume() then
     * gives that char a special token of its own, and otherwise we carry on as
     * if it had been a correct character.
     */
    private boolean gibberish;

    /**
     * Accept one character. The decision to consume it was already made by next().
     *
     * @param c char to consume
     */
    void consume( char c )
    {
        if ( gibberish )
        {
            // we must split off this single char into its own token, may be consolidated later.
            // What has been accumulated in this state is the same white space that leaving()
            // emits, so it is flushed with the same Space prototype (it used to be
            // flushed as an HTMLAttribute, copied from IN_ATTRIBUTE).
            addMultilineTokens( new Space(), accumulated.toString() );
            accumulated.setLength( 0 );
            addToken( new Gibberish( c ) );
            gibberish = false;
        }
        else
        {
            accumulated.append( c );
        }
    }

    /**
     * Emit the accumulated white space once its last char has been consumed.
     */
    void leaving()
    {
        addMultilineTokens( new Space(), accumulated.toString() );
        accumulated.setLength( 0 );
    } // end leaving

    /**
     * Decide which state follows this character and how the character itself is
     * to be handled (recorded in how): consumed here or forwarded untouched
     * to the next state.
     *
     * @param category category of character
     * @param nextChar character in the stream we are processing
     * @param first    true if we just entered this state.
     *
     * @return next state to go in
     */
    HTMLState next( HTMLCharCategory category,
                    char nextChar,
                    boolean first )
    {
        switch ( category )
        {
            case AMP:
            case EQUALS:
            case OTHER:
            case QUOTE:
            case SEMICOLON:
            case START_TAG:
                // not expected in the gap between attributes: flag it for consume()
                gibberish = true;
                how = HowToProcess.CONSUME;
                return IN_TAG_SPACE;
            case END_TAG:
                // the tag state handles the closing char
                how = HowToProcess.FORWARD;
                return IN_TAG;
            case EOL:
            case SPACE:
                // still is space between attributes
                how = HowToProcess.CONSUME;
                return IN_TAG_SPACE;
            case DASH:
            case PLAIN:
            case RIGHT_BRACKET:
                // first char of an attribute name, the attribute state gets to see it.
                how = HowToProcess.FORWARD;
                return IN_ATTRIBUTE;
            case IGNORE:
            default:
                assert false :
                        "bad state "
                        + category
                        + " "
                        + nextChar;
                return null;
        } // end switch
    } // end next
    // end IN_TAG_SPACE
},
/**
 * Inside tag < ... >, not inside tagged body text.
 */
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG
{
    /**
     * Set by next() when the current single char is erroneous. consume() then
     * gives that char a special token of its own, and otherwise we carry on as
     * if it had been a correct character.
     */
    private boolean gibberish;

    /**
     * Accept one character. The decision to consume it was already made by next().
     *
     * @param c char to consume
     */
    void consume( char c )
    {
        if ( !gibberish )
        {
            accumulated.append( c );
            return;
        }
        // the bad char gets a token to itself, so whatever came
        // before it has to be emitted first.
        emitPendingTagText();
        addToken( new Gibberish( c ) );
        gibberish = false;
    }

    /**
     * Emit the accumulated tag text once its last char has been consumed.
     */
    void leaving()
    {
        // each chunk of the tag is a separate token.
        // We have no mechanism to nest styles or tokens.
        emitPendingTagText();
    } // end leaving

    /**
     * Decide which state follows this character and how the character itself is
     * to be handled (recorded in how): consumed here, discarded, or forwarded
     * untouched to the next state.
     *
     * @param category category of character
     * @param nextChar character in the stream we are processing
     * @param first    true if we just entered this state.
     *
     * @return next state to go in
     */
    HTMLState next( HTMLCharCategory category,
                    char nextChar,
                    boolean first )
    {
        switch ( category )
        {
            case DASH:
            case PLAIN:
            case RIGHT_BRACKET:
                how = HowToProcess.CONSUME;
                return IN_TAG;

            case END_TAG:
                // tag is closed, back to body text
                how = HowToProcess.CONSUME;
                return IN_TEXT;

            case START_TAG:
                if ( !first )
                {
                    // starting a new tag without finishing the previous one
                    gibberish = true;
                }
                how = HowToProcess.CONSUME;
                return IN_TAG;

            case EQUALS:
                // the = becomes an Operator token of its own, emitted right here,
                // so the char itself is thrown away afterwards.
                emitPendingTagText();
                addToken( new Operator( '=' ) );
                how = HowToProcess.DISCARD;
                return IN_TAG;

            case EOL: // EOLs inside tags are accumulated and split out later
            case SPACE:
                how = HowToProcess.FORWARD;
                return IN_TAG_SPACE;

            case QUOTE:
                how = HowToProcess.FORWARD;
                return IN_QUOTES;

            case AMP: // entity inside tag, oops
            case OTHER:
            case SEMICOLON:
                gibberish = true;
                how = HowToProcess.CONSUME;
                return IN_TAG;

            case IGNORE:
            default:
                assert false : "bad state " + category + " " + nextChar;
                return null;
        } // end switch
    } // end next

    /**
     * Hand the text accumulated so far over as HTMLTag token(s) and empty the buffer.
     */
    private void emitPendingTagText()
    {
        addMultilineTokens( new HTMLTag( "" ), accumulated.toString() );
        accumulated.setLength( 0 );
    }
    // end IN_TAG
},
/**
* in comment <!-- ... --> in text, inside tag, same as IN_TEXT_COMMENT but returns to IN_TAG
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG_COMMENT
{
/**
* Consume one character of the comment by appending it to the shared
* accumulated buffer. It has been predecided by next() that you can
* and will consume it.
* @param c char to consume
*/
void consume( char c )
{
accumulated.append( c );
}
/**
* What to do on leaving state, after last char is consumed:
* adds a single HTMLCommentTag token built from an empty string.
*/
void leaving()
{
// NOTE(review): as written, the token is created from "" and the accumulated
// buffer is neither emitted nor cleared here. The text of this file appears to
// be damaged further down in this state, so confirm against the pristine source
// which marker text and which flush of the comment body belong here.
addToken( new HTMLCommentTag( "" ) );
} // end leaving
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:// entities not magic in comments
case EOL:// just accumulate them for now
case END_TAG: // without lead --, just an ordinary char
case EQUALS:
case OTHER:// don't sweat high ascii in comments.
case PLAIN:
case QUOTE:// quotes are nothing special in comments
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:// just accumulate them for now
how = HowToProcess.CONSUME;
return IN_TAG_COMMENT;
case DASH:
if ( isComing( "-->" ) )
{
markHandled( 3 );
// don't accumulate
how = HowToProcess.DISCARD;
return IN_TAG;
}
else
{
// just an ordinary char
how = HowToProcess.CONSUME;
return IN_TAG_COMMENT;
}
case START_TAG:
if ( first && isComing( "" ) );
} // end leaving
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:// entities not magic in comments
case EOL:// just accumulate them for now
case END_TAG: // without lead --, just an ordinary char
case EQUALS:
case OTHER:// don't sweat high ascii in comments.
case PLAIN:
case QUOTE:// quotes are nothing special in comments
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:// just accumulate them for now
how = HowToProcess.CONSUME;
return IN_TEXT_COMMENT;
case DASH:
if ( isComing( "-->" ) )
{
markHandled( 3 );
// don't accumulate
how = HowToProcess.DISCARD;
return IN_TEXT;
}
else
{
// just an ordinary char
how = HowToProcess.CONSUME;
return IN_TEXT_COMMENT;
}
case START_TAG:
if ( first && isComing( "