/* * [HTMLState.java] * * Summary: Finite state automaton parser to analyse HTML to remove excess whitespace. * * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.8 2009-04-04 no longer correct missing entities. Just issue warning messages. * 2.9 2010-01-18 refactor so you first allocate a Compactor object, permitting simultaneous compactings. * 3.0 2010-02-12 trim space inside

..

. * 3.1 2010-12-21 avoid touching JavaScript and other scripts. * 3.2 2010-12-24 handle , < already parsed tagCategory = TagCategory.categorise( parsePartialTag( lookAhead( LONGEST_COMPRESSIBLE_TAG + 2 ) ) ); emit.append( nextChar ); if ( tagCategory == TagCategory.SLASH_SCRIPT ) { // treat rest of like ordinary tag return IN_TAG; } else { // tags in script treated a part of script return IN_SCRIPT; } case IGNORE: return IN_SCRIPT; case NL: lineNumber++; emit.append( '\n' ); return IN_SCRIPT; default: throw new IllegalArgumentException( "program bug: invalid category" ); } } }, IN_TAG { /* inside or or , * but not inside "..."*/ HTMLState next( HTMLCharCategory category, char nextChar ) { switch ( category ) { case BEGIN_TAG: if ( lookAhead( 3 ).equals( "!--" ) ) { err.println( "Compactor Error: Can't have we jump ahead when hit . We are removing this comment entirely */ HTMLState next( HTMLCharCategory category, char nextChar ) { switch ( category ) { case BEGIN_TAG: case END_TAG: case IGNORE: case QUOTE: case SPACE: case TEXT: // ignore everything. return STRIPPING_COMMENT; case DASH: if ( lookAhead( 2 ).equals( "->" ) ) { // cheat, process 2 extra chars without using state machine charIndex += 2; return IN_TEXT; // pick up where we left off as if the comment never happened. // return previousTextState; } else { return STRIPPING_COMMENT; } case NL: lineNumber++; return STRIPPING_COMMENT; default: throw new IllegalArgumentException( "program bug: invalid category" ); } } }; // declarations /** * true if want debugging output */ private static final boolean DEBUGGING = false; /** * longest tag that will compress spaces either side of */ private static final int LONGEST_COMPRESSIBLE_TAG = "blockquote".length(); /** * big input string we are parsing */ private static String big; /** * offset in big where we are processing */ private static int charIndex; /** * where we accumulate the compacted output. Leave as StringBuilder. */ private static StringBuilder emit; /** * line number we are processing in the output file. 1-based. */ private static int lineNumber; /** * lookingAt pattern to recognise SSI-style comments that expand to text, e.g. SSI that must be kept to matter what. */ private static Pattern keepPattern; // --Commented out by Inspection START (2014-07-26 6:04 AM): // /** // * lets us remember what we were doing before the comment so we can pick up where we left off // */ // private static HTMLState previousTextState; // --Commented out by Inspection STOP (2014-07-26 6:04 AM) /** * category of the most recently encountered tag */ private static TagCategory tagCategory; /** * used in error messages to indicate where the error occurred, usually the name of the file being compacted. */ private static String where; // /declarations // methods /** * Check configuration and regex patterns to decide if this is a macro style comment that must be preserved even if * other comments are stripped and whose lead and trail whitespace cannot be totally deleted. * * @return true pointing to a comment we want to keep */ private static boolean isKeeperComment() { if ( keepPattern == null ) { return true; // we keep everything } // we have have only incremented past 3 of the 4 lead chars yet. return ( keepPattern.matcher( big.substring( charIndex + 1 ) ).lookingAt() ); } // /method /** * look at chars ahead in the stream yet to be processed, starting at charIndex+1 * * @param howFar how many chars you want * * @return 0 to howFar chars. */ private static String lookAhead( int howFar ) { final int start = charIndex + 1; final int end = Math.min( start + howFar, big.length() ); if ( start >= end ) { return ""; } else { return big.substring( start, end ); } } // /method /** * parse candidate tag. * * @param partialTag first LONGEST_COMPRESSIBLE_TAG+2 chars of the tag, possibly including trailing space or > and * other junk, without lead <. * * @return tag with < > and trailing white space stripped e.g. dt, /dt ,!--, /blockquote */ private static String parsePartialTag( final String partialTag ) { // this stuff " + "\n" + " \n" + " sit still very still " + "\n XXXXX" + "
\n" + "4.4" + " contents \n" + "
\n" + " stuff2 \n" + " x stuff3 \n" + " x stuff \n" + "" + "xx yyzz" + "> stray gt; "; out.println( "--------RAW---------- keep comments:\n [" + test + "]" ); out.println( "--------COOKED ------ keep comments:\n [" + compactString( test, "in RAM test", null ) + "]" ); out.println( "--------RAW---------- keep macros:\n [" + test + "]" ); out.println( "--------COOKED ------ keep macros:\n [" + compactString( test, "in RAM test", Compactor.MACRO_PATTERN ) + "]" ); out.println( "--------RAW---------- keep all but ssi:\n [" + test + "]" ); out.println( "--------COOKED ------ keep all but ssi:\n [" + compactString( test, "in RAM test", Compactor.JUST_SSI_PATTERN ) + "]" ); } } // /method // /method // /methods }