/*
 * [QEV.java]
 *
 * Summary: Quote Entity Validator ensures single and double entities are balanced.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2009-11-17 initial version
 *  1.1 2011-01-12 add -guess, -ignore, -strict -British command line options.
 */
package com.mindprod.qev;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Quote Entity Validator ensures single and double entities are balanced.
 *
 * The class is a command-line utility: {@link #main} gathers html files and calls
 * {@link #validateFile} on each one. validateFile scans every line for the four quote
 * entities (ldquo, rdquo, lsquo, rsquo), tracks whether an outer and/or inner quotation
 * is currently open, and reports imbalances on System.err.
 *
 * All state (error counters, "file name already shown" flag) is held in static fields.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.1 2011-01-12 add -guess, -ignore, -strict -British command line options.
 * @noinspection WeakerAccess
 * @since 2009-11-17
 */
public class QEV {
    private static final int FIRST_COPYRIGHT_YEAR = 2009;

    /**
     * undisplayed copyright notice.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /**
     * date this version released.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String RELEASE_DATE = "2011-01-12";

    /**
     * how to use the command line. Appended to the exception message thrown by main when no files are found.
     */
    private static final String USAGE = "\nQEV needs optional switches -guess, -ignore, " + "-strict -British then a filename\n" + "or a space-separated list of filenames, with optional -s -q -v switches.";

    /**
     * embedded version string.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String VERSION_STRING = "1.1";

    /**
     * Regex to find first word of string: one or more alphanumeric characters.
     */
    private static final Pattern FIRST_WORD = Pattern.compile( "[\\p{Alnum}]+" );

    /**
     * Regex to test for digits followed by an s, e.g. 1990s
     */
    private static final Pattern NNNS = Pattern.compile( "\\d+s" );

    /**
     * count of how many errors detected, over all files. Incremented by identifyFile.
     */
    private static int errorCount;

    /**
     * has this file name already been displayed. Reset to false at the start of each validateFile call.
     */
    private static boolean filenameAlreadyDisplayed;

    /**
     * count of how many files with errors detected.
     */
    private static int filesWithErrorsCount;

    /**
     * constructor, not used. Private, so the class cannot be instantiated from outside.
     *
     * @noinspection WeakerAccess
     */
    private QEV() {
    }

    /**
     * get first word of this string.
     *
     * Only a match anchored at the very start of s counts (Matcher.lookingAt).
     *
     * @param s string to find first word of
     *
     * @return the leading run of alphanumeric characters of s, or a single space " " when s does
     *         not start with an alphanumeric character.
     */
    private static String firstWord( String s ) {
        final Matcher m = FIRST_WORD.matcher( s );
        if ( m.lookingAt() ) {
            return m.group( 0 );
        } else {
            // sentinel: a one-space string, not the empty string
            return " ";
        }
    }

    /**
     * display the file we are proofreading, but only if there is an error in the file.
     *
     * Called once per detected error: every call increments errorCount. Only the first call for a
     * given file increments filesWithErrorsCount and prints the file banner on System.err.
     *
     * @param fileBeingProcessed file being proofread
     */
    private static void identifyFile( File fileBeingProcessed ) {
        errorCount++;
        if ( !filenameAlreadyDisplayed ) {
            filesWithErrorsCount++;
            err.println( "----------- " + fileBeingProcessed + " -----------" );
            filenameAlreadyDisplayed = true;
        }
    }

    /**
     * is this right-double-quote entity we just hit being used as true right double quote?
     *
     * Answers false only when the character just before the entity is a digit (per ST.isDigit) and
     * the character 7 positions after col is not an unaccented letter (per ST.isUnaccentedLetter),
     * i.e. when the mark looks like a measurement such as 12.01". Positions outside the line are
     * treated as a space.
     *
     * @param col  0-based col where & found
     * @param line text of line where the entity was found
     *
     * @return true if being used as right double quote
     */
    private static boolean isTrueRightDoubleQuote( final int col, final String line ) {
        final char prevChar = ( col > 0 ) ? line.charAt( col - 1 ) : ' '; // start of line
        final char nextChar = ( col + 7 < line.length() ) ? line.charAt( col + 7 ) : ' '; // end of line
        return !( ST.isDigit( prevChar ) && !ST.isUnaccentedLetter( nextChar ) ); // false: 12.01" 12.0"
        // NOTE(review): SOURCE IS DAMAGED FROM HERE TO THE END OF THIS BODY.
        // The closing brace of isTrueRightDoubleQuote, and the Javadoc, signature and first
        // statements of a second method are missing. Judging by the call sites in validateFile
        // this is presumably isTrueRightSingleQuote( col, line, inSingle ) -- TODO confirm against
        // the pristine file. The names prevWord, nextWord and inSingle used below have no visible
        // declaration, prevChar and nextChar are declared a second time, and the start of the
        // return expression is also missing. The surviving tokens are kept verbatim and in their
        // original order; this region does not compile as it stands and must be restored from an
        // undamaged copy, not guessed.
        1 ) ? line.charAt( col - 2 ) : ' '; // start of line
        final char prevChar = ( col > 0 ) ? line.charAt( col - 1 ) : ' '; // start of line
        final char nextChar = ( col + 7 < line.length() ) ? line.charAt( col + 7 ) : ' '; // end of line
        // false pickin' whales' 's Pete's isn't 1.5's 12.01' 12.0''
        // NOTE(review): the head of the expression (presumably "return !( ...") is lost here.
        && nextWord.equals( "s" )
        || ( prevChar == 'd' || prevChar == 'l' || prevChar == 'L' ) && line.startsWith( "é", col + 7 )
        || prevChar == 'n' && !inSingle
        || prevChar == 's' && !inSingle
        || prevWord.equalsIgnoreCase( "Jr." ) && nextWord.equals( "s" )
        || prevWord.equalsIgnoreCase( "Sr." ) && nextWord.equals( "s" )
        || prevWord.endsWith( "é" ) && nextWord.equals( "s" )
        || ST.isDigit( prevChar ) && ( nextChar == 's' || nextChar == ' ' || nextChar == '<' )
        || ST.isUnaccentedLetter( prevChar ) && ST.isUnaccentedLetter( nextChar ) );
    }

    /**
     * Check if word is digits followed by s eg. 1990s
     *
     * The whole word must match (Matcher.matches), not just a prefix.
     *
     * @param word word to test
     *
     * @return true if pattern 99...99s
     */
    private static boolean nnns( String word ) {
        final Matcher m = NNNS.matcher( word );
        return m.matches();
    }

    /**
     * get row and col in 1-based display format
     *
     * @param row 0-based line number
     * @param col 0-based column
     *
     * @return string with 1-based row and col, separated by a colon, e.g. row 0 col 4 gives "1:5"
     */
    private static String rowCol( final int row, final int col ) {
        return ( row + 1 ) + ":" + ( col + 1 );
    }

    /**
     * On System.err display line pointing to the problem column.
     *
     * Prints two lines: the 1-based row number (padded via ST.leftPad to width 8) followed by
     * " :" and the text, then a caret placed under the problem column followed by the 1-based column.
     *
     * @param row  0-based line number
     * @param col  0-based column
     * @param line text of line to display
     */
    private static void showLine( final int row, final int col, final String line ) {
        err.print( ST.leftPad( Integer.toString( row + 1 ), 8, false ) + " :" );
        err.println( line );
        // 10 = the 8-wide row number plus the 2-char " :" prefix printed above
        err.print( ST.spaces( 10 + col ) ); // left indent + space over to problem area
        err.println( "^ :" + ( col + 1 ) );
    }

    /**
     * Quote Entity Validator ensures single and double entities are balanced in HTML files.
     *
     * Scans args for switches, builds a CommandLine restricted to "html" files, validates each
     * file, and finally prints the total error and file counts on System.err.
     *
     * @param args options -guess -ignore -strict -British then names of files to process, dirs, files, -s, *.*, no wildcards.
     *
     * @throws IllegalArgumentException if the command line yields no files to process
     */
    public static void main( String[] args ) {
        // gather all the files mentioned on the command line.
        // either directories, files, with -s and subdirs option.
        // warning. Windows expands any wildcards in a nasty way.
        // do not use wildcards.
        // See http://mindprod.com/jgloss/wildcard.html
        CheckTicks checkTicks = CheckTicks.GUESS;
        boolean british = false;
        // process -guess -ignore -strict -British switches.
        for ( int i = 0; i < args.length; i++ ) {
            if ( args[ i ].startsWith( "-" ) ) {
                if ( args[ i ].toLowerCase().equals( "-british" ) ) {
                    // NOTE(review): unlike the CheckTicks switches below, this argument is not nulled
                    // out, so it is still passed to CommandLine -- confirm CommandLine tolerates it.
                    british = true;
                } else {
                    try {
                        // the switch text after '-' is matched, case-insensitively, against the enum names
                        checkTicks = CheckTicks.valueOf( args[ i ].substring( 1 ).toUpperCase() );
                        args[ i ] = null; // suppress CommandLine seeing it.
                    } catch ( IllegalArgumentException e ) {
                        /* unrecognised options -s -q -v are ok. They will not change the value of checkTicks */
                    }
                }
            }
        }
        out.println( "Gathering html files to validate..." );
        CommandLine commandLine = new CommandLine( args, new AllButSVNDirectoriesFilter(), new ExtensionListFilter( "html" ) );
        if ( commandLine.size() == 0 ) {
            throw new IllegalArgumentException( "No files found to process\n" + USAGE );
        }
        for ( File file : commandLine ) {
            // a failure in one file is reported and does not stop the remaining files
            try {
                validateFile( file, checkTicks, british );
            } catch ( FileNotFoundException e ) {
                final FastCat sb = new FastCat( 3 );
                sb.append( "Error: " );
                sb.append( EIO.getCanOrAbsPath( file ) );
                sb.append( " was deleted." );
                err.println( sb.toString() );
            } catch ( Exception e ) {
                // only the message is reported; the exception type and stack trace are dropped
                final FastCat sb = new FastCat( 3 );
                sb.append( e.getMessage() );
                sb.append( " in file " );
                sb.append( EIO.getCanOrAbsPath( file ) );
                err.println( sb.toString() );
            }
        } // end for
        err.println( errorCount + " balancing errors detected in " + filesWithErrorsCount + " files." );
    } // end main

    /**
     * verify quote entities in one file.
     *
     * Reads the file line by line and, on each line, visits every '&amp;'. Two flags track whether
     * an outer and an inner quotation are open. For the default convention outer is ldquo/rdquo
     * and inner is lsquo/rsquo; for British the roles are swapped. Problems are printed on
     * System.err together with the offending line(s); nothing is returned. Quotations still open
     * at end of file are reported too.
     *
     * Side effects: resets filenameAlreadyDisplayed, and (through identifyFile) updates errorCount
     * and filesWithErrorsCount.
     *
     * @param fileBeingProcessed File to verify
     * @param checkTicks         how to handle the inner entities: IGNORE skips the inner start/end
     *                           branches altogether; see the notes on the two right-quote tests
     *                           below for how STRICT is actually evaluated.
     * @param isBritish          true if use British convention of double quotes nested inside single.
     *
     * @throws IOException if problem reading the file
     * @noinspection SameParameterValue, WeakerAccess
     */
    public static void validateFile( final File fileBeingProcessed, final CheckTicks checkTicks, final boolean isBritish ) throws IOException {
        // entity names are stored without the leading '&'; matching is done at col + 1
        final String innerStartEntity;
        final String innerEndEntity;
        final String outerStartEntity;
        final String outerEndEntity;
        if ( isBritish ) {
            innerStartEntity = "ldquo;";
            innerEndEntity = "rdquo;";
            outerStartEntity = "lsquo;";
            outerEndEntity = "rsquo;";
        } else {
            innerStartEntity = "lsquo;";
            innerEndEntity = "rsquo;";
            outerStartEntity = "ldquo;";
            outerEndEntity = "rdquo;";
        }
        // O P E N
        // NOTE(review): br is closed only by the br.close() at the very end of this method. If
        // readLine (or anything else below) throws, the reader is left open. A try-with-resources
        // block would fix this.
        final BufferedReader br = EIO.getBufferedReader( fileBeingProcessed, 100 * 1024, EIO.UTF8 );
        // current line in file
        String line;
        // are we inside an outer quotation
        boolean inOuter = false;
        // are we inside an inner quotation
        boolean inInner = false;
        // row where last saw the outer start entity
        int outerStartRow = 0;
        // col where last saw the outer start entity
        int outerStartCol = 0;
        // text of line where last saw the outer start entity
        String outerStartLine = null;
        // row where last saw the inner start entity
        int innerStartRow = 0;
        // col where last saw the inner start entity
        int innerStartCol = 0;
        // text of line where last saw the inner start entity
        String innerStartLine = null;
        // row we are processing, 0-based
        int row = 0;
        // col we are processing 0-based
        int col;
        // new file: its banner has not been printed yet
        filenameAlreadyDisplayed = false;
        // we treat comments and text alike.
        while ( ( line = br.readLine() ) != null ) {
            // process all the quote entities on this line
            col = 0;
            while ( col < line.length() ) {
                col = line.indexOf( '&', col );
                if ( col < 0 ) {
                    // no more '&' on this line
                    break;
                } else if ( line.startsWith( outerStartEntity, col + 1 ) ) {
                    // ---- outer START entity ----
                    if ( inOuter ) {
                        // a second outer start while one is already open
                        identifyFile( fileBeingProcessed );
                        final FastCat sb = new FastCat( 10 );
                        sb.append( "Missing &" );
                        sb.append( outerEndEntity );
                        sb.append( " after &" );
                        sb.append( outerStartEntity );
                        // NOTE(review): outerStartEntity already ends in ';', so this prints ";;"
                        sb.append( "; at " );
                        sb.append( rowCol( outerStartRow, outerStartCol ) );
                        sb.append( " before &" );
                        sb.append( outerStartEntity );
                        sb.append( " at " );
                        sb.append( rowCol( row, col ) );
                        err.println( sb.toString() );
                        showLine( outerStartRow, outerStartCol, outerStartLine );
                        showLine( row, col, line );
                    }
                    if ( inInner ) {
                        // an inner quotation was still open when a new outer one starts
                        identifyFile( fileBeingProcessed );
                        final FastCat sb = new FastCat( 11 );
                        // "Or" when the message above was already printed for the same position
                        sb.append( inOuter ? " Or missing" : " Missing" );
                        sb.append( " &" );
                        sb.append( innerEndEntity );
                        // NOTE(review): the sibling messages use " after &"; the '&' is absent here
                        sb.append( " after " );
                        sb.append( innerStartEntity );
                        sb.append( " at " );
                        sb.append( rowCol( innerStartRow, innerStartCol ) );
                        sb.append( " before &" );
                        // NOTE(review): the entity found at row:col in this branch is
                        // outerStartEntity, yet the message names outerEndEntity -- confirm.
                        sb.append( outerEndEntity );
                        sb.append( " at " );
                        sb.append( rowCol( row, col ) );
                        err.println( sb.toString() );
                        showLine( innerStartRow, innerStartCol, innerStartLine );
                        showLine( row, col, line );
                        inInner = false;
                    }
                    // remember where this outer quotation began, for later diagnostics
                    inOuter = true;
                    outerStartRow = row;
                    outerStartCol = col;
                    outerStartLine = line;
                    // skip past the entity name; col now sits on its trailing ';'
                    col += 6;
                } else if ( line.startsWith( outerEndEntity, col + 1 ) ) {
                    // ---- outer END entity ----
                    // NOTE(review): in Java "a || b ? x : y" parses as "( a || b ) ? x : y". So
                    // STRICT does not force this test to true; it selects isTrueRightSingleQuote
                    // even when isBritish is false. Confirm whether
                    // "STRICT || ( isBritish ? ... : ... )" was intended.
                    if ( checkTicks == CheckTicks.STRICT || isBritish ? isTrueRightSingleQuote( col, line, inOuter ) : isTrueRightDoubleQuote( col, line ) ) {
                        if ( !inOuter ) {
                            // closing an outer quotation that was never opened
                            identifyFile( fileBeingProcessed );
                            final FastCat sb = new FastCat( 6 );
                            sb.append( "Missing &" );
                            sb.append( outerStartEntity );
                            sb.append( " before &" );
                            sb.append( outerEndEntity );
                            sb.append( " at " );
                            sb.append( rowCol( row, col ) );
                            err.println( sb.toString() );
                            showLine( row, col, line );
                        }
                        if ( inInner ) {
                            // outer quotation closes while an inner one is still open
                            identifyFile( fileBeingProcessed );
                            final FastCat sb = new FastCat( 11 );
                            sb.append( !inOuter ? " Or missing" : " Missing" );
                            sb.append( " &" );
                            sb.append( innerEndEntity );
                            sb.append( " after &" );
                            sb.append( innerStartEntity );
                            sb.append( " at " );
                            sb.append( rowCol( innerStartRow, innerStartCol ) );
                            sb.append( " before &" );
                            sb.append( outerEndEntity );
                            sb.append( " at " );
                            sb.append( rowCol( row, col ) );
                            err.println( sb.toString() );
                            showLine( innerStartRow, innerStartCol, innerStartLine );
                            showLine( row, col, line );
                            inInner = false;
                        }
                        inOuter = false;
                        col += 6;
                    } else {
                        // sometimes the outer end entity is used as a unit mark (the original comment
                        // says "degree or minutes") rather than as a closing quote. Just ignore it.
                        col += 6;
                    }
                } else if ( checkTicks != CheckTicks.IGNORE && line.startsWith( innerStartEntity, col + 1 ) ) {
                    // ---- inner START entity (skipped entirely under IGNORE) ----
                    if ( !inOuter ) {
                        // inner quotation opened outside any outer quotation
                        identifyFile( fileBeingProcessed );
                        final FastCat sb = new FastCat( 6 );
                        sb.append( "Missing &" );
                        sb.append( outerStartEntity );
                        sb.append( " before &" );
                        sb.append( innerStartEntity );
                        sb.append( " at " );
                        sb.append( rowCol( row, col ) );
                        err.println( sb.toString() );
                        showLine( row, col, line );
                    }
                    if ( inInner ) {
                        // a second inner start while one is already open
                        identifyFile( fileBeingProcessed );
                        final FastCat sb = new FastCat( 11 );
                        sb.append( !inOuter ? " Or missing" : " Missing" );
                        sb.append( " &" );
                        sb.append( innerEndEntity );
                        sb.append( " after &" );
                        sb.append( innerStartEntity );
                        sb.append( " at " );
                        sb.append( rowCol( innerStartRow, innerStartCol ) );
                        sb.append( " before &" );
                        sb.append( innerStartEntity );
                        sb.append( " at " );
                        sb.append( rowCol( row, col ) );
                        err.println( sb.toString() );
                        showLine( innerStartRow, innerStartCol, innerStartLine );
                        showLine( row, col, line );
                    }
                    // remember where this inner quotation began, for later diagnostics
                    inInner = true;
                    innerStartRow = row;
                    innerStartCol = col;
                    innerStartLine = line;
                    col += 6;
                } else if ( checkTicks != CheckTicks.IGNORE && line.startsWith( innerEndEntity, col + 1 ) ) {
                    // ---- inner END entity (skipped entirely under IGNORE) ----
                    // NOTE(review): same precedence caveat as for the outer end entity: this parses
                    // as "( STRICT || isBritish ) ? isTrueRightDoubleQuote : isTrueRightSingleQuote".
                    if ( checkTicks == CheckTicks.STRICT || isBritish ? isTrueRightDoubleQuote( col, line ) : isTrueRightSingleQuote( col, line, inInner ) ) {
                        if ( !inInner ) {
                            // closing an inner quotation that was never opened
                            identifyFile( fileBeingProcessed );
                            final FastCat sb = new FastCat( 6 );
                            sb.append( "Missing &" );
                            sb.append( innerStartEntity );
                            sb.append( " before &" );
                            sb.append( innerEndEntity );
                            sb.append( " at " );
                            sb.append( rowCol( row, col ) );
                            err.println( sb.toString() );
                            showLine( row, col, line );
                        }
                        inInner = false;
                        col += 6;
                    } else {
                        // sometimes the right single quote entity is used as an apostrophe. Just ignore it.
                        // e.g. isn't or people's or whales' or pickin' or 's
                        col += 6;
                    }
                } else {
                    // some other entity, or a bare '&': step over the '&' only
                    col += 1;
                }
            } // end process entities in line loop.
            row++;
        } // end read line loop
        // end of file: anything still open is unbalanced
        if ( inOuter ) {
            identifyFile( fileBeingProcessed );
            final FastCat sb = new FastCat( 6 );
            sb.append( "Missing &" );
            sb.append( outerEndEntity );
            sb.append( " after &" );
            sb.append( outerStartEntity );
            sb.append( " at " );
            sb.append( rowCol( outerStartRow, outerStartCol ) );
            err.println( sb.toString() );
            showLine( outerStartRow, outerStartCol, outerStartLine );
        }
        if ( inInner ) {
            identifyFile( fileBeingProcessed );
            final FastCat sb = new FastCat( 7 );
            sb.append( inOuter ? " Or missing" : " Missing" );
            sb.append( " &" );
            sb.append( innerEndEntity );
            sb.append( " after &" );
            sb.append( innerStartEntity );
            sb.append( " at " );
            sb.append( rowCol( innerStartRow, innerStartCol ) );
            err.println( sb.toString() );
            showLine( innerStartRow, innerStartCol, innerStartLine );
        }
        br.close();
    }

    /**
     * How the single-quote ("tick") entities are to be treated. main selects a constant by matching
     * the command-line switch text, upper-cased, against these names; the default is GUESS.
     */
    public enum CheckTicks {
        GUESS, /* balance ‘ ’ that don't appear to be apostrophes */
        IGNORE, /* ignore all ‘ ’ */
        STRICT /* strictly balance all ‘ ’ */
    }
}