/*
* [HTMLState.java]
*
* Summary: HTML file parser, finite state automaton.
*
* Copyright: (c) 2004-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 3.1 2009-04-12 shorter style names, improved highlighting.
* 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method.
*/
/**
* State machine for HTMLTokenizer to parse HTML for colourising.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method. mark start and end of comments specially.
* Use simpler look-ahead logic with mark handled.
* @since 2004-05-15
*/
package com.mindprod.htmlreflow;
import com.mindprod.jprep.HowToProcess;
import static java.lang.System.*;
/**
* HTML file parser, finite state automaton.
*
* This parser is complicated by the nesting of HTML. You can have a tag
* inside body text, a string inside a tag and an entity inside a string. You can also have entities inside body text.
* Comments may be embedded in body text or tags, but not strings or entities. So, for example, an entity state must
* know what state to return to when the entity is over.
NL is not as significant as it is in other languages. You
* can have a NL in the middle of a tag or a string, but not an entity. The tags and strings token don't have embedded
* NLs, though they do during accumulation.
String tokens may be missing lead or trail quotes. We don't have
* END_OF_LINE or WHITESPACE states. We split out SPACE and NL tokens LATER
The notion of kosher = false means just
* that one char is not kosher but we let it slide and don't necessarily change state. However, we split that naughty
* character off into a Gibberish token by itself for special highlighting.
The notion of balanced is similar. In
* that case the entire unbalanced entity, string etc is put into a Gibberish token.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.2 2010-02-08 handle attributes and CDATA with special attributes.
* add markHandled method. mark start and end of comments specially.
* Use simpler look-ahead logic with mark handled.
* @see com.mindprod.compactor.HTMLState
* @see com.mindprod.jprep.HTMLState
* @since 2004-07-17
*/
@SuppressWarnings( { "NestedAssignment", "UnnecessaryContinue", "EnumeratedConstantNamingConvention" } )
public enum HTMLState
{
/**
* in an entity &...; inside quotes
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_QUOTE_ENTITY
    {
        /**
         * true when the entity was closed by a semicolon (set by next() on SEMICOLON),
         * false when next() saw some other character cut the entity short.
         */
        private boolean balanced;

        /**
         * Consume one character. It has been predecided that you can
         * and will consume it.
         *
         * @param c char to consume; it is appended to the shared accumulator.
         */
        void consume( char c )
        {
            accumulated.append( c );
        }

        /**
         * What to do on leaving state, after last char is consumed.
         * Writes the accumulated entity body to the output, putting back the
         * delimiters that next() discarded: always the lead &amp;, and the
         * trailing ; only when the entity was properly terminated.
         */
        void leaving()
        {
            final String entity = accumulated.toString();
            accumulated.setLength( 0 );
            assert !entity.startsWith( "&" ) : "entity should have & stripped";
            assert !entity.endsWith( ";" ) : "entity should have ; stripped";
            // no embedded spaces or nls to deal with.
            // Accumulated text has neither the lead & nor the trailing ;
            // Emit even if the body is 0 chars long. Could erroneously have written &;
            sb.append( '&' );
            sb.append( entity );
            if ( balanced )
            {
                // next() discarded the terminating semicolon, so it must be
                // restored here or the well-formed entity would lose it.
                sb.append( ';' );
            }
            else
            {
                // screwed up entity that was not really one.
                // It is an error; it is emitted as ordinary quoted text, without a ;
                // note that we have handled this anomaly
                balanced = true;
            }
        } // end leaving

        /**
         * Figure out what state we should go in after we process this
         * character and whether we can consume it. Sets how to say whether
         * the character is consumed here, discarded, or forwarded for
         * processing by the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         *
         * @return next state to go in; null (after a failed assert) for an unexpected category.
         */
        HTMLState next( HTMLCharCategory category,
                        char nextChar,
                        boolean first )
        {
            switch ( category )
            {
                case AMP:
                    // Either way the entity is not (yet) terminated.
                    balanced = false;
                    if ( first )
                    {
                        // the & that opens this entity: discard it, leaving() puts it back.
                        how = HowToProcess.DISCARD;
                        return IN_QUOTE_ENTITY;
                    }
                    else
                    {
                        // a second & before any ; -- malformed entity
                        how = HowToProcess.FORWARD;
                        return IN_QUOTES;
                    }

                case DASH:
                case END_TAG:
                case EOL:
                case OTHER:
                case QUOTE:
                case SPACE:
                case START_TAG:
                    // oops. Should not see these inside an entity.
                    balanced = false;
                    how = HowToProcess.FORWARD;
                    return IN_QUOTES;

                case EQUALS:
                case PLAIN:
                case RIGHT_BRACKET:
                    // NOTE(review): SAFEENTITYCHARS is defined outside this block; presumably
                    // the characters permitted in an entity body -- confirm.
                    if ( SAFEENTITYCHARS.indexOf( nextChar ) >= 0 )
                    {
                        how = HowToProcess.CONSUME;
                        return IN_QUOTE_ENTITY;
                    }
                    else
                    {
                        balanced = false;
                        how = HowToProcess.FORWARD;
                        return IN_QUOTES;
                    }

                case SEMICOLON:
                    // proper end of entity: drop the ; here, leaving() restores it.
                    balanced = true;
                    how = HowToProcess.DISCARD;
                    return IN_QUOTES;

                case IGNORE:
                default:
                    assert false :
                            "bad state "
                            + category
                            + " "
                            + nextChar;
                    return null;
            } // end switch
        } // end next
        // end IN_QUOTE_ENTITY
    },
/**
* inside " ... " Use entities inside quotes to embed quote. Quotes only exist inside tags. We don't allow
* quotes to span an EOL.
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_QUOTES
{
/**
* Consume one character. It has been predecided that you can
* and will consume it.
* @param c char to consume
*/
void consume( char c )
{
accumulated.append( c );
}
/**
* what to do on leaving state, after last char is consumed.
*/
void leaving()
{
// either lead or trail quotes could be missing.. May even have embedded
// nl. or strings of spaces
sb.append( accumulated.toString() );
accumulated.setLength( 0 );
// end leaving
}
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:
// entity inside quotes
how = HowToProcess.FORWARD;
return IN_QUOTE_ENTITY;
case DASH:
case EOL:// treat as part of quote, split it out later
case END_TAG:
case EQUALS:
case OTHER:
case PLAIN:
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:
case START_TAG:
how = HowToProcess.CONSUME;
return IN_QUOTES;
case QUOTE:
// handle the first " differently
if ( first )
{
how = HowToProcess.CONSUME;
return IN_QUOTES;
}
else
{
// last
how = HowToProcess.CONSUME;
return IN_TAG;
}
case IGNORE:
default:
assert false :
"bad state "
+ category
+ " "
+ nextChar;
return null;
} // end switch
// end next
}
// end IN_QUOTES
},
/**
* inside tag <xxx something="abc"... > inside attribute name, not value.
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_ATTRIBUTE
    {
        /**
         * Set when a single erroneous char was detected. We otherwise carry on
         * as if it were a correct character. Within this constant body the flag
         * is only ever written, never read.
         */
        private boolean gibberish;

        /**
         * Accept a character that next() has already decided to consume.
         *
         * @param c char to consume
         */
        void consume( char c )
        {
            accumulated.append( c );
        }

        /**
         * Called on leaving the state, after the last char is consumed:
         * moves the gathered attribute name to the output and empties the accumulator.
         */
        void leaving()
        {
            final String attributeName = accumulated.toString();
            accumulated.setLength( 0 );
            sb.append( attributeName );
        } // end leaving

        /**
         * Decide the state to go to after this character, and record in how
         * whether the character is consumed or forwarded to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         *
         * @return next state to go in; null (after a failed assert) for an unexpected category.
         */
        HTMLState next( HTMLCharCategory category,
                        char nextChar,
                        boolean first )
        {
            final HTMLState successor;
            switch ( category )
            {
                case DASH:
                case PLAIN:
                case RIGHT_BRACKET:
                case SEMICOLON:
                    // another char in attribute name.
                    how = HowToProcess.CONSUME;
                    successor = IN_ATTRIBUTE;
                    break;

                case END_TAG:
                case EQUALS:
                case EOL:
                case SPACE:
                    // attribute name is complete; the tag state handles this char.
                    how = HowToProcess.FORWARD;
                    successor = IN_TAG;
                    break;

                case AMP: // entity inside tag, oops
                case OTHER:
                case QUOTE:
                case START_TAG:
                    // erroneous char: flag it, consume it and fall back to the tag state.
                    gibberish = true;
                    how = HowToProcess.CONSUME;
                    successor = IN_TAG;
                    break;

                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    successor = null;
                    break;
            } // end switch
            return successor;
        } // end next
        // end IN_ATTRIBUTE
    },
/**
* inside tag <xxx something="abc"... > inside space inside a tag not inside quote
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG_SPACE
    {
        /**
         * Set when a single erroneous char was detected. We otherwise carry on
         * as if it were a correct character. Within this constant body the flag
         * is only ever written, never read.
         */
        private boolean gibberish;

        /**
         * Accept a character that next() has already decided to consume.
         *
         * @param c char to consume
         */
        void consume( char c )
        {
            accumulated.append( c );
        }

        /**
         * Called on leaving the state, after the last char is consumed:
         * moves the gathered text to the output and empties the accumulator.
         */
        void leaving()
        {
            final String spacing = accumulated.toString();
            accumulated.setLength( 0 );
            sb.append( spacing );
        } // end leaving

        /**
         * Decide the state to go to after this character, and record in how
         * whether the character is consumed or forwarded to the next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         *
         * @return next state to go in; null (after a failed assert) for an unexpected category.
         */
        HTMLState next( HTMLCharCategory category,
                        char nextChar,
                        boolean first )
        {
            switch ( category )
            {
                case EOL:
                case SPACE:
                    // still is space between attributes
                    how = HowToProcess.CONSUME;
                    return IN_TAG_SPACE;

                case DASH:
                case PLAIN:
                case RIGHT_BRACKET:
                    // an attribute name is starting; hand the char to that state.
                    how = HowToProcess.FORWARD;
                    return IN_ATTRIBUTE;

                case END_TAG:
                    // let the tag state deal with the closing bracket.
                    how = HowToProcess.FORWARD;
                    return IN_TAG;

                case AMP:
                case EQUALS:
                case OTHER:
                case QUOTE:
                case SEMICOLON:
                case START_TAG:
                    // erroneous char: flag it, consume it and stay put.
                    gibberish = true;
                    how = HowToProcess.CONSUME;
                    return IN_TAG_SPACE;

                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
        } // end next
        // end IN_TAG_SPACE
    },
/**
* inside tag < ... > not inside tagged body text.
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG
    {
        /**
         * Set when a single erroneous char was detected. We otherwise carry on
         * as if it were a correct character. Within this constant body the flag
         * is only ever written, never read.
         */
        private boolean gibberish;

        /**
         * Accept a character that next() has already decided to consume.
         *
         * @param c char to consume
         */
        void consume( char c )
        {
            accumulated.append( c );
        }

        /**
         * Called on leaving the state, after the last char is consumed.
         * Each chunk of the tag is emitted separately; we have no mechanism
         * to nest styles or tokens.
         */
        void leaving()
        {
            flushAccumulated();
        } // end leaving

        /**
         * Move whatever has been accumulated so far to the output and empty the accumulator.
         */
        private void flushAccumulated()
        {
            sb.append( accumulated.toString() );
            accumulated.setLength( 0 );
        }

        /**
         * Decide the state to go to after this character, and record in how
         * whether the character is consumed, discarded or forwarded to the
         * next state.
         *
         * @param category category of character
         * @param nextChar character in the stream we are processing
         * @param first    true if we just entered this state.
         *
         * @return next state to go in; null (after a failed assert) for an unexpected category.
         */
        HTMLState next( HTMLCharCategory category,
                        char nextChar,
                        boolean first )
        {
            switch ( category )
            {
                case START_TAG:
                    if ( !first )
                    {
                        // starting a new tag without finishing the previous one
                        gibberish = true;
                    }
                    how = HowToProcess.CONSUME;
                    return IN_TAG;

                case END_TAG:
                    // closing bracket belongs to the tag; body text follows.
                    how = HowToProcess.CONSUME;
                    return IN_TEXT;

                case EQUALS:
                    // Emit what precedes the = and then the = itself, directly,
                    // so the char is discarded rather than accumulated.
                    flushAccumulated();
                    sb.append( "=" );
                    how = HowToProcess.DISCARD;
                    return IN_TAG;

                case QUOTE:
                    // the quote state wants to see the opening quote itself.
                    how = HowToProcess.FORWARD;
                    return IN_QUOTES;

                case SPACE:
                case EOL:
                    // spaces and EOLs inside a tag are handled by the tag-space state
                    how = HowToProcess.FORWARD;
                    return IN_TAG_SPACE;

                case DASH:
                case PLAIN:
                case RIGHT_BRACKET:
                    // ordinary tag content
                    how = HowToProcess.CONSUME;
                    return IN_TAG;

                case AMP: // entity inside tag, oops
                case OTHER:
                case SEMICOLON:
                    // erroneous char: flag it, but consume it and carry on.
                    gibberish = true;
                    how = HowToProcess.CONSUME;
                    return IN_TAG;

                case IGNORE:
                default:
                    assert false : "bad state " + category + " " + nextChar;
                    return null;
            } // end switch
        } // end next
        // end IN_TAG
    },
/**
* in comment <!-- ... --> in text, inside tag, same as IN_TEXT_COMMENT but returns to IN_TAG
*/
@SuppressWarnings( { "WeakerAccess" } ) IN_TAG_COMMENT
{
/**
* Consume one character. It has been predecided that you can
* and will consume it. The character is appended to the shared
* accumulator; nothing is written to the output here.
*
* @param c char to consume
*/
void consume( char c )
{
accumulated.append( c );
}
/**
* What to do on leaving state, after last char is consumed.
*/
void leaving()
{
// token added includes the begin and end markers, with possibly
// either end marker missing.
// NOTE(review): as this line reads here, only an empty string is appended and
// the accumulator is neither emitted nor cleared, which contradicts the comment
// above. Possibly the comment delimiters and the accumulated text were lost when
// this file was extracted -- confirm against the original source before relying on it.
sb.append( "" );
} // end leaving
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:// entities not magic in comments
case EOL:// just accumulate them for now
case END_TAG: // without lead --, just an ordinary char
case EQUALS:
case OTHER:// don't sweat high ascii in comments.
case PLAIN:
case QUOTE:// quotes are nothing special in comments
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:// just accumulate them for now
how = HowToProcess.CONSUME;
return IN_TAG_COMMENT;
case DASH:
if ( isComing( "-->" ) )
{
markHandled( 3 );
// don't accumulate
how = HowToProcess.DISCARD;
return IN_TAG;
}
else
{
// just an ordinary char
how = HowToProcess.CONSUME;
return IN_TAG_COMMENT;
}
case START_TAG:
if ( first && isComing( "" );
} // end leaving
/**
* Figure out what state we should go in after we process this
* character and whether we can consume it. We must set consume
* = false if we cannot consume the character and have to
* postpone processing to the next state.
* @param category category of character
* @param nextChar character in the stream we are processing
* @param first true if we just entered this state.
* @return next state to go in
*/
HTMLState next( HTMLCharCategory category,
char nextChar,
boolean first )
{
switch ( category )
{
case AMP:// entities not magic in comments
case EOL:// just accumulate them for now
case END_TAG: // without lead --, just an ordinary char
case EQUALS:
case OTHER:// don't sweat high ascii in comments.
case PLAIN:
case QUOTE:// quotes are nothing special in comments
case RIGHT_BRACKET:
case SEMICOLON:
case SPACE:// just accumulate them for now
how = HowToProcess.CONSUME;
return IN_TEXT_COMMENT;
case DASH:
if ( isComing( "-->" ) )
{
markHandled( 3 );
// don't accumulate
how = HowToProcess.DISCARD;
return IN_TEXT;
}
else
{
// just an ordinary char
how = HowToProcess.CONSUME;
return IN_TEXT_COMMENT;
}
case START_TAG:
if ( first && isComing( "