/* * [Compactor.java] * * Summary: Compacts HTML by removing unnecessary white space. * * Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.2 2006-03-15 Suppress IntelliJ Code Analyse that wants to make this default scope. * 2.3 2008-02-15 complete rewrite, mainly to handle removing space around
..
. * 3.1 2010-12-21 avoid touching JavaScript and other scripts. * 3.2 2010-12-24 handle * We always compact whitespace inside and outside comments. * * We don't consolidate tags. e.g. * <span class="x">this </span><span class="x">and that</span> can be * collapsed * to <span class="x">this and that</span>. * * We don't convert tags to lower case e.g. <BR> to <br> * * We leave all comments in place. If ever such a feature is implemented, it must * not strip SSI comments. It may or may not leave macro comments. * * We do not remove macro generations. You can do that with StripGenerated. * * We do not remove the macro comments. * * We remove space and NLs on the right of <div><dt><li><h?><ol><table>< * tbody><td><th><thead><tr><ul> tags. * * We remove space and NLs on the lift of </div></dt></li></h?></ol></table>< * /tbody></td></th></thead></tr></ul> tags. * * We always remove lead and trailing spaces from lines. * * We compact spaces in side HTML text, tags and comments. * * We leave spaces as is inside <pre>...</pre> and inside quoted tag parameters. * * We convert " to " > to > when used in raw text. * * We don't tokenize to convert to CBF, compact binary format. The catch here is web * browsers can't read the result without a plug-in. This would result in a major * compaction. Perhaps the XML folk will eventually get disgusted with their obese * format and XHTML can inherit a now compact form. * * We don't do any LZW compression. the catch is, browsers can't read this without a * special plug-in. * * @author Roedy Green, Canadian Mind Products * @version 3.9 2014-07-26 internal simplication. Now preserves space both before and after comment which may be significant. * @see com.mindprod.compactor.HTMLState * @since 2006 */ public class Compactor { /** * keep macrcos and SSI and PAD comments */ static final Pattern MACRO_PATTERN = Pattern.compile( "(?:\\s*(?:macro|generated|/generated|alias|cross|PAD|/PAD))|#" ); /** * 4.4 */ static final Pattern JUST_SSI_PATTERN = Pattern.compile( "(?:\\s*(?:PAD|/PAD))|#" ); // todo: sometimes leaves \n between comments. Sometimes does not. Should be consistent. private static final int FIRST_COPYRIGHT_YEAR = 2006; /** * undisplayed copyright notice */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; /** * date this version was released. */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String RELEASE_DATE = "2014-07-26"; /** * how to use the command line */ private static final String USAGE = "\nCompactor needs a filename.html or a space-separated list of filenames, " + "with optional -s -q -v switches."; /** * embedded version string. */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String VERSION_STRING = "3.9"; /** * constructor */ public Compactor() { } /** * compact and tidy one file. * * @param quiet true if want progress messages suppressed * @param fileBeingProcessed the file currently being processed. * * @throws IOException Suppress IntelliJ Code Analyse that wants to make this private. * @noinspection WeakerAccess, SameParameterValue, StringEquality */ public static void compactFile( boolean quiet, File fileBeingProcessed ) throws IOException { if ( !quiet ) { out.print( " compacting " + fileBeingProcessed.getName() + " " ); } switch ( EIO.getExtension( fileBeingProcessed ) ) { // this is just a double check on ExtensionListFilter in main case "html": case "htm": case "htmlfrag": break; default: err.println( "Cannot compact: " + fileBeingProcessed.getName() + "not .html .htm .htmlfrag file" ); return; } String big = HunkIO.readEntireFile( fileBeingProcessed ); // we don't allow stripping macros and comments. Doing it to original is dangerous without StripGenerated // balance checking String result = compactStringKeepingMacrosAndComments( big, fileBeingProcessed.getPath() ); // use == not equals() because compare already done in compactStringKeepingMacrosAndComments. if ( result == big ) { // nothing changed. No need to write results. if ( !quiet ) { out.println( "-" ); } return; } // generate output into a temporary file until we are sure all is ok. // create a temp file in the same directory as filename if ( !quiet ) { out.println( "*" ); } final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed ); FileWriter emit = new FileWriter( tempFile ); emit.write( result ); emit.close(); HunkIO.deleteAndRename( tempFile, fileBeingProcessed ); } // end compactFile /** * compact a String as needed * * @param uncompacted uncompacted string * @param where where this string came from, used in error messages to help you track down source * @param how *=compactStringStrippingMacrosAndComments, including * +=compactStringKeepingMacrosAndComments * -=does nothing * Q=Quick If first 400 chars contain a double space, compactStringKeepingMacrosAndComments, * otherwise do nothing. * * @return compacted String */ public static String compactStringAsNeeded( final String uncompacted, final String where, final char how ) { switch ( how ) { case '*': return Compactor.compactStringStrippingMacrosAndComments( uncompacted, where ); case '+': return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where ); case '-': return uncompacted; case 'Q': case 'q': final String test = ( uncompacted.length() < 400 ) ? uncompacted : uncompacted.substring( 0, 400 ); if ( test.contains( " " + " " ) ) { return uncompacted; } else { return Compactor.compactStringKeepingMacrosAndComments( uncompacted, where ); } default: assert false : "invalid Compactor.compactStringAsNeeded.how " + how + " It must be one of * + - Q"; return uncompacted; } } /** * Remove excess whitespace from HTML represented by string. * * @param big the String to compact. * @param where used in error messages to indicate where the error occurred, usually the name of the file being * compacted. * * @return the compacted String, big itself if nothing changed. */ public static String compactStringKeepingMacrosAndComments( final String big, final String where ) { return HTMLState.compactString( big, where, null ); } /** * Remove excess whitespace from HTML represented by string. * * @param big the String to compact. * @param where used in error messages to indicate where the error occurred, usually the name of the file being * compacted. * * @return the compacted String, big itself if nothing changed. */ public static String compactStringKeepingMacrosStrippingComments( final String big, final String where ) { return HTMLState.compactString( big, where, MACRO_PATTERN ); } /** * Remove excess whitespace from HTML represented by string, strip all macros and comments. * * @param big the String to compact. * @param where used in error messages to indicate where the error occurred, usually the name of the file being * compacted. * * @return the compacted String, big itself if nothing changed. */ public static String compactStringStrippingMacrosAndComments( final String big, final String where ) { return HTMLState.compactString( big, where, JUST_SSI_PATTERN ); } /** * compacts HTML files. * * @param args names of files to process, dirs, files, -s, *.*, no wildcards. */ public static void main( String[] args ) { // gather all the files mentioned on the command line. // either directories, files, *.*, with -s and subdirs option. out.println( "Gathering html files to compact..." ); CommandLine commandLine = new CommandLine( args, new AllButSVNDirectoriesFilter(), new ExtensionListFilter( ExtensionListFilter.COMMON_HTML_EXTENSIONS ) ); // There is a a double check on ExtensionListFilter in compactFile final boolean quiet = commandLine.isQuiet(); if ( commandLine.size() == 0 ) { throw new IllegalArgumentException( "No files found to process\n" + USAGE ); } final Compactor compactor = new Compactor(); for ( File file : commandLine ) { try { compactFile( quiet, file ); } catch ( FileNotFoundException e ) { err.println( "Error: " + EIO.getCanOrAbsPath( file ) + " not found." ); } catch ( Exception e ) { err.println(); e.printStackTrace( err ); err.println( " in file " + EIO.getCanOrAbsPath( file ) ); err.println(); } } // end for } // end main } // end Compactor