/*
 * [MarkAcronyms.java]
 *
 * Summary: Replace known acronyms with invocations of the Acronym macro.
 *
 * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2011-10-25 initial version
 */
package com.mindprod.acronym;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.common18.Twirler;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.htmlmacros.macro.Global;
import com.mindprod.htmlmacros.support.AcronymItem;
import com.mindprod.htmlmacros.support.ConfigurationForMindprod;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Replace known acronyms with invocations of the Acronym macro.
 * <p>
 * Command-line utility: {@link #main} installs the configuration, initialises {@code AcronymItem},
 * then rewrites in place every selected html file whose text changed.
 * <p>
 * NOTE(review): several string literals and part of one method in this copy of the file appear to
 * have been lost to a markup-stripping step; each affected spot is flagged below. Restore them from
 * a pristine copy before compiling.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2011-10-25 initial version
 * @since 2011-10-25
 */
public class MarkAcronyms {
    /**
     * When true, main prints CHUNK_BOUNDARY_PATTERN before processing any files.
     */
    private static final boolean DEBUGGING = false;

    /**
     * estimated max chunks on page each tag counts as three.
     * Passed to the FastCat constructor in repairAcronymsForPage.
     */
    private static final int EST_NUMBER_OF_CHUNKS = 500000;

    // Regex fragments that are OR-ed together to form CHUNK_BOUNDARY_PATTERN.
    // NOTE(review): most of these are empty strings (and A_SPAN is only a reluctant wildcard) as
    // they stand here; presumably each held an HTML tag/comment regex that was stripped -- confirm
    // against the original source.
    private static final String A_SPAN = ".*?";
    private static final String CLOSE_BODY = "";
    private static final String CLOSE_GENERATED = "";
    private static final String CLOSE_SEE = "";
    private static final String COMMENT_SPAN = "";
    private static final String DT_SPAN = "";
    // regex: a '<', one or more characters from the listed set, then a '>'.
    private static final String GENERIC_TAG = "<[a-zA-Z0-9 \\n!\"#\\$%&'\\(\\)\\*\\+," + "=\\-\\./;:=\\?@\\[\\\\\\]\\^_\\{\\|\\}~]+>";
    private static final String OPEN_BODY = "";
    private static final String OPEN_GENERATED = "";
    // NOTE(review): the next two literals contain a raw line break, which is not legal inside a
    // Java string literal; the original text appears to be missing.
    private static final String OPEN_SEE1 = "
";
    private static final String OPEN_SEE2 = "
";

    /**
     * Pattern to divide page into chunks, some of which we replace Acronyms and some we do not.
     * repairAcronymsForPage reads each match through capture group 1.
     */
    private static final Pattern CHUNK_BOUNDARY_PATTERN = Pattern.compile( "("
            + A_SPAN
            + "|"
            + CLOSE_BODY
            + "|"
            + CLOSE_GENERATED
            + "|"
            + CLOSE_SEE
            + "|"
            + DT_SPAN
            + "|"
            + OPEN_BODY
            + "|"
            + OPEN_GENERATED
            + "|"
            + OPEN_SEE1
            + "|"
            + OPEN_SEE2
            + "|"
            + COMMENT_SPAN + // must come after // more specific
    // NOTE(review): source is damaged from here. The remainder of the Pattern.compile( ... ) call
    // and the Javadoc, signature and opening statements of handleAccumulatedWord are not visible.
    // What follows is the surviving tail of handleAccumulatedWord, kept verbatim.
    " );
                out.println( " >>> " + EIO.getCanOrAbsPath( fileBeingProcessed ) + " adding Acronym " + word );
                return;
            } else if ( handleAlternativeAccumulatedWord( "s", word, sb, fileBeingProcessed ) ) {
                return;
            } else if ( handleAlternativeAccumulatedWord( "es", word, sb, fileBeingProcessed ) ) {
                return;
            } else if ( handleAlternativeAccumulatedWord( "’s", word, sb, fileBeingProcessed ) ) {
                return;
            } else if ( handleAlternativeAccumulatedWord( ".", word, sb, fileBeingProcessed ) ) {
                return;
            } else if ( handleAlternativeAccumulatedWord( "'/", word, sb, fileBeingProcessed ) ) {
                return;
            }
            }
            // the word is not an acronym, just leave it as is.
            sb.append( word );
        }
    }

    /**
     * Test whether a word is a known acronym followed by a given suffix, and if so emit the
     * replacement followed by that suffix.
     *
     * @param tail               string to ignore on tail end of word, to see if its an acronym
     * @param word               word from html text that might be an acronym
     * @param sb                 Where we put the processed word
     * @param fileBeingProcessed file we are processing. Used only in the progress message.
     *
     * @return true if successfully installed an acronym; false if word does not end with tail or
     *         the remainder has no AcronymItem, in which case nothing is appended to sb.
     */
    private static boolean handleAlternativeAccumulatedWord( final String tail,
                                                             final String word,
                                                             final StringBuilder sb,
                                                             final File fileBeingProcessed ) {
        assert word.length() > 1 : "word length must be > 1";
        if ( word.endsWith( tail ) ) {
            // the word with the trailing tail removed, e.g. the singular when tail is "s"
            final String singular = ST.chopTrailingString( word, tail );
            if ( AcronymItem.getAcronymItem( singular ) != null ) {
                // word is an acronym followed by the tail, convert to a macro. Will have to manually cleanup up
                // "meaning" nearby.
                // NOTE(review): the literal appended below is empty as it stands, so only tail reaches sb;
                // presumably the Acronym macro text built from singular was lost -- confirm.
                sb.append( "" );
                sb.append( tail );
                out.println( " >>> " + EIO.getCanOrAbsPath( fileBeingProcessed ) + " adding Acronym " + singular );
                return true;
            }
        }
        return false;
    }

    /**
     * Scan every selected file for words that are known acronyms and rewrite the files that change.
     * <p>
     * Files come from the command line, filtered by NoSnippetsFilter and restricted to the html
     * extension. Each file is read whole, passed through repairAcronymsForPage, and written back
     * only when the result differs from what was read. An IOException on one file is reported on
     * err and processing continues with the next file.
     *
     * @param args command line args, the files to process
     */
    private static void repairAcronymsForAllFiles( final String[] args ) {
        // get files to process from command line.
        out.println( "Gathering files to scan for missing Acronym macros..." );
        // do not process htmlfrag includes.
        CommandLine wantedFiles = new CommandLine( args, new NoSnippetsFilter(), new ExtensionListFilter( "html" ) );
        out.println( "Checking all words to see if they are known acronyms. This will take a while..." );
        final Twirler twirler = new Twirler();
        for ( File fileBeingProcessed : wantedFiles ) {
            try {
                twirler.twirl( out ); // show progress
                final String originalPage = HunkIO.readEntireFile( fileBeingProcessed, HunkIO.UTF8 );
                String modifiedPage = repairAcronymsForPage( originalPage, fileBeingProcessed );
                // leave untouched files alone so they are not needlessly rewritten
                if ( !modifiedPage.equals( originalPage ) ) {
                    HunkIO.writeEntireFile( fileBeingProcessed, modifiedPage, HunkIO.UTF8 );
                    // HunkIO.writeEntireFile( EIO.getCanOrAbsPath( fileBeingProcessed ) + ".bak",
                    // modifiedPage, HunkIO.UTF8 );
                }
            } catch ( IOException e ) {
                // report the failure and carry on with the remaining files
                e.printStackTrace( err );
                err.println();
            }
        } // end for to process each file
    }

    /**
     * Repair all macros on a single chunk.
     * <p>
     * Walks the chunk one char at a time. Separator chars are copied straight to the output;
     * the non-separator chars between them are gathered into a word that is handed to
     * handleAccumulatedWord whenever a separator, or the end of the chunk, is reached.
     *
     * @param originalChunk      page of HTML for the chunk
     * @param fileBeingProcessed file we are repairing
     *
     * @return modified chunk.
     */
    private static String repairAcronymsForChunk( String originalChunk, File fileBeingProcessed ) {
        // our chunk may contain entities.
        // out.println( " [" + originalChunk + "]" );
        // for char by char work, StringBuilder is better than FastCat.
        // used to accumulate a word.
        final StringBuilder wb = new StringBuilder( 80 );
        // used to accumulate the entire chunk
        final StringBuilder sb = new StringBuilder( originalChunk.length() + 200 );
        for ( int i = 0; i < originalChunk.length(); i++ ) {
            char c = originalChunk.charAt( i );
            // . / _ - not considered separators. They appear in the middle of acronyms.
            // & # ; are not considered separators. They appear in entities
            // A separator is any char at or below space, or one ST.isLegal accepts for the listed
            // punctuation (presumably a membership test -- verify against ST).
            if ( c <= ' ' || ST.isLegal( c, "!\"$%^\'()*+,:<>?@[\\]^`{|}~" ) ) {
                handleAccumulatedWord( wb, sb, fileBeingProcessed );
                // whether there was an accumulated word or not, we append the separator chars
                sb.append( c );
            } else {
                // non-separator word char.
                wb.append( c );
            }
        } // end for
        // might be something left over in word buffer.
        handleAccumulatedWord( wb, sb, fileBeingProcessed );
        return sb.toString();
    }

    /**
     * Repair all macros on a single page.
     * <p>
     * Splits the page at each CHUNK_BOUNDARY_PATTERN match. The text before each boundary is passed
     * through repairAcronymsForChunk only when the state flags allow it and it is longer than one
     * char; otherwise it is copied unchanged. Every boundary is copied unchanged. If a boundary
     * matches none of the recognised forms, an error is printed and the original page is returned.
     *
     * @param originalPage       page of HTML
     * @param fileBeingProcessed file we are repairing
     *
     * @return modified page.
     */
    private static String repairAcronymsForPage( String originalPage, File fileBeingProcessed ) {
        // we have to be inside the body of the page (NOTE(review): tag names were lost from this comment)
        boolean insideBody = false;
        // we must not be inside a generated section (NOTE(review): delimiters were lost from this comment)
        boolean insideGenerated = false;
        // we must not be in the SEE section
        boolean insideSee = false;
        // if we are processing included text there will be no body markers,
        // so a path containing "include/" stands in for insideBody.
        final boolean including = EIO.getCanOrAbsPath( fileBeingProcessed ).contains( "include/" );
        final FastCat sb = new FastCat( EST_NUMBER_OF_CHUNKS );
        // offset in originalPage just past the previous boundary
        int prevEnd = 0;
        final Matcher m = CHUNK_BOUNDARY_PATTERN.matcher( originalPage );
        while ( m.find() ) {
            // process the chunk before the match.
            final String chunk = originalPage.substring( prevEnd, m.start() );
            prevEnd = m.end();
            // if you change this expression, change matching one below
            if ( ( including || insideBody ) && !insideGenerated && !insideSee && chunk.length() > 1 ) {
                sb.append( repairAcronymsForChunk( chunk, fileBeingProcessed ) );
            } else {
                // leave text as is
                sb.append( chunk );
            }
            final String boundary = m.group( 1 );
            // out.println( "{" + boundary + "}" );
            sb.append( boundary );
            // find out characteristics of next chunk
            // NOTE(review): the chain below is damaged. The string literals being tested, and
            // apparently some whole branches (nothing visible ever sets insideBody or insideSee to
            // true), are missing. The surviving text is kept verbatim.
            if ( boundary.startsWith( " so status does not change
            } else if ( boundary.equals( "" ) ) {
                insideBody = false;
            } else if ( boundary.startsWith( "" ) ) {
                insideGenerated = true;
            } else if ( boundary.startsWith( " . no status change.
            } else if ( boundary.startsWith( "<" ) ) {
                // we have passed the whole tag, but not a possible matching end tag. No status change.
            } else {
                // give up on the whole page rather than risk corrupting it
                err.println( "Strange HTML syntax: unrecognised boundary " + boundary );
                return originalPage;
            }
        } // end while
        // we may have a final chunk at the end without a terminating boundary.
        final String chunk = originalPage.substring( prevEnd, originalPage.length() );
        // if you change this expression, change matching one above
        if ( ( including || insideBody ) && !insideGenerated && !insideSee && chunk.length() > 1 ) {
            sb.append( repairAcronymsForChunk( chunk, fileBeingProcessed ) );
        } else {
            // leave text as is
            sb.append( chunk );
        }
        // out.println( "chunks: " + sb.used() );
        return sb.toString();
    }

    /**
     * Command-line entry point. Installs the mindprod configuration, initialises AcronymItem,
     * optionally prints the chunk boundary pattern, processes the files, prints "done" and exits
     * with status 0.
     *
     * @param args command line args, the files to process
     *
     * @throws IOException declared by the signature
     */
    @SuppressWarnings( { "ResultOfMethodCallIgnored" } )
    public static void main( String[] args ) throws IOException {
        Global.installConfiguration( new ConfigurationForMindprod() );
        AcronymItem.fireup();
        if ( DEBUGGING ) {
            out.println( CHUNK_BOUNDARY_PATTERN.toString() );
        }
        repairAcronymsForAllFiles( args );
        out.println( "done" );
        System.exit( 0 );
    }
}