/*
* [HTMLState.java]
*
* Summary: Finite state automaton parser to analyse HTML to remove excess whitespace.
*
* Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 2.8 2009-04-04 no longer correct missing entities. Just issue warning messages.
* 2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings.
* 3.0 2010-02-12 trim space inside
..
.
* 3.1 2010-12-21 avoid touching JavaScript and other scripts.
* 3.2 2010-12-24 handle
* Collapses multiple spaces in HTML text, tags and comments to one.
*
* Trims space from start and end of line.
*
* Removes whitespace after <dt...><h?...><li..><td...>
*
* Removes whitespace before </dt></h?></li></td>
*
* Leaves whitespace alone in <pre>...</pre>
*
* Leaves whitespace alone inside "..." in tags.
*
* Normalises newlines to \n.
*
* If there is whitespace before or after a comment, or between multiple comments, it will be collapsed to a single
* space or NL. Whitespace before or after a macro comment will not be removed entirely. Macro comments expand to
* text, so that whitespace is significant.
*
* We emit NLs when we first see one, and avoid emitting subsequent NLs. However, we procrastinate emitting space until
* we find the end of the space string. That way we can often eliminate the spaces altogether, replacing it with an NL.
*
* @author Roedy Green, Canadian Mind Products
* @version 3.6 2013-03-01 no longer complain about unescaped " in text.
* @see com.mindprod.htmlreflow.HTMLState
* @see com.mindprod.jprep.HTMLState
* @since 2009
*/
enum HTMLState
{
IN_COMMENT
{
    /**
     * Inside the body of a comment whose text is being copied to the output,
     * immediately after a character that was not whitespace.
     *
     * @param category classification of the character being processed
     * @param nextChar the character being processed
     *
     * @return the state to use for the following character
     */
    HTMLState next( HTMLCharCategory category, char nextChar )
    {
        switch ( category )
        {
            case IGNORE:
                // contributes nothing to the output and does not change state
                return IN_COMMENT;

            case SPACE:
                // emit nothing yet; the space-run states decide what, if anything, is emitted
                return IN_COMMENT_COMPACTIBLE_SPACE;

            case NL:
                lineNumber++;
                emit.append( '\n' );
                // whitespace following the newline is dropped by the next state
                return IN_COMMENT_REMOVABLE_SPACE;

            case DASH:
                if ( "->".equals( lookAhead( 2 ) ) )
                {
                    // End of comment. Step over the "->" directly rather than feeding it
                    // through the state machine.
                    charIndex += 2;
                    emit.append( "-->" );
                    // Always resume as ordinary text; the state in force before the
                    // comment is not restored.
                    return IN_TEXT;
                }
                // an ordinary dash inside the comment
                emit.append( '-' );
                return IN_COMMENT;

            case BEGIN_TAG:
            case END_TAG:
            case QUOTE:
            case TEXT:
                // nothing is special inside a comment: copy the char through
                emit.append( nextChar );
                return IN_COMMENT;

            default:
                throw new IllegalArgumentException( "program bug: invalid category" );
        }
    }
},
IN_COMMENT_COMPACTIBLE_SPACE
{
    /**
     * Inside a run of spaces within a comment. Nothing is emitted while the run
     * lasts; when it ends, the whole run is emitted as a single space, or as
     * nothing at all if it is ended by a newline.
     *
     * @param category classification of the character being processed
     * @param nextChar the character being processed
     *
     * @return the state to use for the following character
     */
    HTMLState next( HTMLCharCategory category, char nextChar )
    {
        switch ( category )
        {
            case SPACE:
            case IGNORE:
                // still inside the run; keep holding the pending space back
                return IN_COMMENT_COMPACTIBLE_SPACE;

            case NL:
                // we don't suppress NLs inside comments, but the pending space
                // before the NL is discarded.
                lineNumber++;
                emit.append( '\n' );
                return IN_COMMENT_REMOVABLE_SPACE;

            case DASH:
                if ( "->".equals( lookAhead( 2 ) ) )
                {
                    // End of comment. Step over the "->" directly rather than feeding it
                    // through the state machine; the pending space goes out first.
                    charIndex += 2;
                    emit.append( " -->" );
                    // Always resume as ordinary text; the state in force before the
                    // comment is not restored.
                    return IN_TEXT;
                }
                // pending space followed by an ordinary dash
                emit.append( " -" );
                return IN_COMMENT;

            case BEGIN_TAG:
            case END_TAG:
            case QUOTE:
            case TEXT:
                // the run collapses to one space, then the char that ended it
                emit.append( ' ' ).append( nextChar );
                return IN_COMMENT;

            default:
                throw new IllegalArgumentException( "program bug: invalid category" );
        }
    }
},
IN_COMMENT_REMOVABLE_SPACE
{
    /**
     * Inside whitespace within a comment that follows a newline, i.e. leading
     * whitespace on a line. It is deleted entirely, as are any further newlines.
     *
     * @param category classification of the character being processed
     * @param nextChar the character being processed
     *
     * @return the state to use for the following character
     */
    HTMLState next( HTMLCharCategory category, char nextChar )
    {
        switch ( category )
        {
            case NL:
                // a newline was already emitted on entry to this state;
                // count this one but do not emit it.
                lineNumber++;
                return IN_COMMENT_REMOVABLE_SPACE;

            case SPACE:
            case IGNORE:
                // dropped without trace
                return IN_COMMENT_REMOVABLE_SPACE;

            case DASH:
                if ( "->".equals( lookAhead( 2 ) ) )
                {
                    // End of comment. Step over the "->" directly rather than feeding it
                    // through the state machine.
                    charIndex += 2;
                    emit.append( "-->" );
                    // Always resume as ordinary text; the state in force before the
                    // comment is not restored.
                    return IN_TEXT;
                }
                // an ordinary dash; no space is emitted ahead of it
                emit.append( '-' );
                return IN_COMMENT;

            case BEGIN_TAG:
            case END_TAG:
            case QUOTE:
            case TEXT:
                // first real char on the line: emit it with no lead space
                emit.append( nextChar );
                return IN_COMMENT;

            default:
                throw new IllegalArgumentException( "program bug: invalid category" );
        }
    }
},
IN_REMOVABLE_SPACE
{
/* inside lead spaces on line of ordinary text, or after ....
Whitespace that will disappear entirely */
HTMLState next( HTMLCharCategory category, char nextChar )
{
switch ( category )
{
case BEGIN_TAG:
tagCategory =
// allow for / and >, < already parsed.
TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG +
2 ) ) );
switch ( tagCategory )
{
case COMMENT:
charIndex += "!--".length();
// Will be further later incremented by 1 by charIndexLoop
// Record what we were doing so we can pick up where we left off after comment.
if ( isKeeperComment() )
{
// we used to avoid compacting whitespace in comments
// emit.append( " we jump ahead when hit . We are removing this comment
entirely */
/**
 * Transition while discarding a comment: nothing is ever appended to the
 * output, and the only thing watched for is the closing "-->".
 *
 * @param category classification of the character being processed
 * @param nextChar the character being processed (its value is not used here)
 *
 * @return the state to use for the following character
 */
HTMLState next( HTMLCharCategory category, char nextChar )
{
    switch ( category )
    {
        case NL:
            // discarded, but still counted so line numbers stay in step
            lineNumber++;
            return STRIPPING_COMMENT;

        case DASH:
            if ( !"->".equals( lookAhead( 2 ) ) )
            {
                // just a dash inside the comment; discard it
                return STRIPPING_COMMENT;
            }
            // End of comment. Step over the "->" directly rather than feeding it
            // through the state machine.
            charIndex += 2;
            // Always resume as ordinary text; the state in force before the
            // comment is not restored.
            return IN_TEXT;

        case TEXT:
        case SPACE:
        case QUOTE:
        case IGNORE:
        case END_TAG:
        case BEGIN_TAG:
            // ignore everything.
            return STRIPPING_COMMENT;

        default:
            throw new IllegalArgumentException( "program bug: invalid category" );
    }
}
};
// declarations
/**
 * true if want debugging output
 */
private static final boolean DEBUGGING = false;
/**
 * Length of the longest tag that will compress spaces either side of it,
 * namely "blockquote". Used to size the lookahead when recognising a tag.
 */
private static final int LONGEST_COMPRESSIBLE_TAG = "blockquote".length();
/**
 * big input string we are parsing
 */
private static String big;
/**
 * Offset in big of the char we are processing. lookAhead starts at charIndex + 1.
 * States advance this directly when they consume extra chars without going
 * through the state machine.
 */
private static int charIndex;
/**
 * where we accumulate the compacted output. Leave as StringBuilder.
 */
private static StringBuilder emit;
/**
 * line number we are processing in the output file. 1-based.
 * NOTE(review): the states increment this for every NL consumed, including NLs that are
 * not emitted, so it appears to track the input line rather than the output line -- confirm.
 */
private static int lineNumber;
/**
 * lookingAt pattern to recognise SSI-style comments that expand to text, e.g. SSI that must be kept no matter what.
 * null means every comment is kept.
 */
private static Pattern keepPattern;
// --Commented out by Inspection START (2014-07-26 6:04 AM):
// /**
// * lets us remember what we were doing before the comment so we can pick up where we left off
// */
// private static HTMLState previousTextState;
// --Commented out by Inspection STOP (2014-07-26 6:04 AM)
/**
 * category of the most recently encountered tag
 */
private static TagCategory tagCategory;
/**
 * used in error messages to indicate where the error occurred, usually the name of the file being compacted.
 */
private static String where;
// /declarations
// methods
/**
 * Check configuration and regex patterns to decide if this is a macro style comment that must be preserved even if
 * other comments are stripped and whose lead and trail whitespace cannot be totally deleted.
 * <p>
 * The pattern is applied with lookingAt, so it must match starting exactly at charIndex + 1;
 * it need not consume the whole comment.
 *
 * @return true if charIndex is pointing to a comment we want to keep
 */
private static boolean isKeeperComment()
{
    if ( keepPattern == null )
    {
        return true; // we keep everything
    }
    // We have only incremented past 3 of the 4 lead chars yet, hence the + 1.
    // Match in place over a region instead of copying the rest of the document with
    // substring for every comment. With the default bounds settings a region behaves
    // like the equivalent substring for anchors, lookaround and boundary matching.
    return keepPattern.matcher( big )
            .region( charIndex + 1, big.length() )
            .lookingAt();
} // /method
/**
 * Peek at chars in the stream that have not been processed yet, without consuming them.
 * The peek starts at charIndex + 1, the char just after the one being processed.
 *
 * @param howFar maximum number of chars wanted
 *
 * @return 0 to howFar chars; fewer than howFar when the end of the input is near,
 * and the empty string when nothing remains.
 */
private static String lookAhead( int howFar )
{
    final int from = charIndex + 1;
    // never read past the end of the input
    final int to = Math.min( from + howFar, big.length() );
    return from < to ? big.substring( from, to ) : "";
} // /method
/**
* parse candidate tag.
*
* @param partialTag first LONGEST_COMPRESSIBLE_TAG+2 chars of the tag, possibly including trailing space or > and
* other junk, without lead <.
*
* @return tag with < > and trailing white space stripped e.g. dt, /dt ,!--, /blockquote
*/
private static String parsePartialTag( final String partialTag )
{
// | this stuff " +
"\n" +
" | \n" +
" sit still very still | " +
"\n XXXXX" +
"
\n" +
"4.4" +
" contents \n" +
" \n" +
" stuff2 \n" +
" x stuff3 \n" +
" x stuff \n" +
"" +
"xx yyzz" +
"> stray gt; ";
out.println( "--------RAW---------- keep comments:\n [" + test + "]" );
out.println( "--------COOKED ------ keep comments:\n [" + compactString( test, "in RAM test", null ) + "]" );
out.println( "--------RAW---------- keep macros:\n [" + test + "]" );
out.println( "--------COOKED ------ keep macros:\n [" + compactString( test, "in RAM test", Compactor.MACRO_PATTERN ) + "]" );
out.println( "--------RAW---------- keep all but ssi:\n [" + test + "]" );
out.println( "--------COOKED ------ keep all but ssi:\n [" + compactString( test, "in RAM test", Compactor.JUST_SSI_PATTERN ) + "]" );
}
} // /method
// /method
// /methods
}