/*
 * [Extract.java]
 *
 * Summary: extract lines from text files that match given strings, case-sensitive.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2009-02-26 initial release
 *  1.1 2009-02-27 all switch means all strings must match
 *                 where switch means display where each line found file/line #
 *  1.2 2009-06-03 fix bug, formerly ignored search strings one char long.
 *  1.3 2014-04-05 fix bug, formerly could not handle more than one regex string.
 */
package com.mindprod.extract;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common18.EIO;
import com.mindprod.csv.CSVWriter;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Extract lines from text files that match given strings, case-sensitive.
 * Command-line search strings are matched literally (not as regexes);
 * regexes are supplied separately, one per line, in files named with a leading {@code @}.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.3 2014-04-05 fix bug, formerly could not handle more than one string.
 * @since 2009-02-26
 */
public class Extract {
    /**
     * whacking huge buffer to read the files to scan.
     * NOTE(review): not referenced anywhere in this file.
     */
    private static final int BUFFSIZE = 100000;

    /**
     * NOTE(review): not referenced anywhere in this file.
     */
    private static final int FIRST_COPYRIGHT_YEAR = 2009;

    /**
     * undisplayed copyright notice
     */
    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String EMBEDDED_COPYRIGHT =
            "Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String RELEASE_DATE = "2014-04-05";

    /**
     * how to use the command line
     */
    private static final String USAGE =
            "\nExtract needs a list of Strings to search for then a dash then a filename or "
            + "a space-separated list of filenames, with optional -s -q -v switches.";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String VERSION_STRING = "1.3";

    /**
     * extensions to search. Others are ignored.
     * NOTE(review): not referenced anywhere in this file; parse() filters with
     * ExtensionListFilter.COMMON_TEXT_EXTENSIONS instead.
     */
    private static final String[] EXTENSIONS_TO_SEARCH = {
            "ans", "asm", "bat", "batfrag", "btm", "btmfrag", "c", "cfrag", "cmd", "cpp", "cppfrag",
            "css", "cssfrag", "csv", "csvfrag", "ctl", "doc", "dtd", "dtdfrag", "e", "h", "hfrag",
            "hpp", "hppfrag", "htm", "html", "htmlfrag", "idx", "ih", "ini", "java", "javafrag",
            "jnlp", "jnlpfrag", "jsp", "jspfrag", "list", "log", "look", "lst", "mac", "mft", "pas",
            "policy", "pom", "pomflag", "prn", "properties", "ps", "raw", "rh", "sh", "site", "sql",
            "sqlfrag", "tab", "txt", "use", "wiki", "xml", "xmlfrag", "xsd", "xsdfrag" };

    /**
     * true if all strings and regexes must match to extract the line
     */
    static boolean allMustMatch = false;

    /**
     * which regexes we search for
     */
    static Pattern[] regexesToSearchFor;

    /**
     * which strings we search for
     */
    static String[] stringsToSearchFor;

    /**
     * which files to process, including subdirs etc.
     */
    private static CommandLine commandLine;

    /**
     * CSV output, created by processFiles only when showWhereLinesFound is true.
     */
    private static CSVWriter csv;

    /**
     * true if should display file and line number in CSV format
     */
    private static boolean showWhereLinesFound = false;

    /**
     * Display exception error, followed by usage help and an echo of the command line, all on stderr.
     *
     * @param e    the error exception
     * @param app  the name of the app
     * @param args arguments from the command line
     */
    static void displayError( final Exception e, String app, String[] args ) {
        e.printStackTrace( err );
        err.println();
        err.println( e.getMessage() );
        err.println( "Usage: " + app
                     + ".jar -all -where \"apple\" \"pear\" @myregex.txt - somefile1.txt -s "
                     + "somedir" );
        err.println( "or: " + app
                     + ".jar -all -where \"apple\" \"pear\" @myregex.txt - somefile1.txt -s somedir" );
        err.println( "Jet: " + app
                     + ".exe -all -where \"apple\" \"pear\" @myregex.txt - somefile1.txt -s somedir" );
        err.println( "-all = all strings must match, otherwise just one." );
        err.println( "-where = display where match found." );
        err.println( "@ = optional file containing regexes." );
        err.println( "Output on console, ready to redirect." );
        err.println( "command line parameters:" );
        for ( String arg : args ) {
            err.println( '[' + arg + ']' );
        }
        err.println();
    }

    /**
     * Parse the command line into the static search strings, regexes, flags and file list.
     *
     * @param args strings to search for, then a -, then names of files to process, no wildcards.
     *             search strings that start with @ are names of files containing regexes.
     *
     * @throws IllegalArgumentException if there is nothing to search for, the - separator is missing,
     *                                  or no files were found to process.
     */
    static void parse( String[] args ) {
        final List<String> strings = new ArrayList<>( args.length );
        final List<Pattern> regexes = new ArrayList<>( args.length );
        final List<String> fileParms = new ArrayList<>( args.length );
        boolean inFiles = false;
        for ( String arg : args ) {
            if ( arg.equals( "-" ) ) {
                // everything after the lone dash is a file/dir parameter.
                inFiles = true;
            } else if ( arg.equals( "-all" ) ) {
                allMustMatch = true;
            } else if ( arg.equals( "-where" ) ) {
                showWhereLinesFound = true;
            } else if ( inFiles ) {
                processFileParm( fileParms, arg );
            } else if ( arg.startsWith( "@" ) ) {
                // we have a regex @-style file name.
                processRegexParm( regexes, arg );
            } else {
                processStringParm( strings, arg );
            }
        } // end loop
        if ( strings.size() + regexes.size() == 0 ) {
            throw new IllegalArgumentException( "no Strings or regexes to search for." );
        }
        // Report the missing separator before looking for files. Formerly this test came after
        // the empty-CommandLine test, which masked it with a less specific message.
        if ( !inFiles ) {
            throw new IllegalArgumentException( "no files specified to search. missing -." );
        }
        Extract.stringsToSearchFor = strings.toArray( new String[ strings.size() ] );
        Extract.regexesToSearchFor = regexes.toArray( new Pattern[ regexes.size() ] );
        Extract.commandLine = new CommandLine(
                fileParms.toArray( new String[ fileParms.size() ] ),
                new AllButSVNDirectoriesFilter(),
                new ExtensionListFilter( ExtensionListFilter.COMMON_TEXT_EXTENSIONS ) );
        if ( commandLine.size() == 0 ) {
            throw new IllegalArgumentException( "No files found to process\n" + USAGE );
        }
    }

    /**
     * process a file name parm on the command line
     *
     * @param fileParmsToScan where we accumulate file/dirs to search
     * @param arg             current file-type parameter
     */
    private static void processFileParm( final List<String> fileParmsToScan, final String arg ) {
        if ( arg.length() == 0 ) {
            err.println( "Warning: ignoring empty filename." );
        } else {
            fileParmsToScan.add( arg );
        }
    }

    /**
     * process a regex file parm on the command line. Each line of the named file, stripped of
     * control characters, is compiled as one regex. Each regex read is echoed on stderr.
     *
     * @param regexesToSearchFor where we accumulate regexes to search for
     * @param arg                current parameter, of the form @filename
     *
     * @throws IllegalArgumentException if the file cannot be read.
     */
    private static void processRegexParm( final List<Pattern> regexesToSearchFor, final String arg ) {
        if ( arg.length() == 0 ) {
            err.println( "Warning: ignoring empty regex." );
            return;
        }
        // drop the lead @ to get the file name.
        final File regexFile = new File( arg.substring( 1 ) );
        // O P E N, with C L O S E guaranteed by try-with-resources, even if a regex fails to compile.
        try ( BufferedReader br = new BufferedReader( new FileReader( regexFile ), 24 * 1024 ) ) {
            // R E A D
            // File being read need not have a terminal \n.
            // File being read may safely use any mixture of \r\n, \r or \n line terminators.
            String regexString;
            // readLine returns null at EOF
            while ( ( regexString = br.readLine() ) != null ) {
                final String cookedRegex = stripControlChars( regexString );
                err.println( EIO.getCanOrAbsPath( regexFile ) + " contains regex: " + cookedRegex );
                if ( cookedRegex.length() == 0 ) {
                    err.println( "Warning: ignoring empty regex." );
                } else {
                    regexesToSearchFor.add( Pattern.compile( cookedRegex ) );
                }
            }
        } catch ( IOException e ) {
            // keep the cause so the stack trace shows why the read failed.
            throw new IllegalArgumentException( "unable to read regex file: " + arg, e );
        }
    }

    /**
     * process a search string parm on the command line
     *
     * @param stringsToSearchFor where we accumulate strings to search for
     * @param arg                current string-type parameter
     */
    private static void processStringParm( final List<String> stringsToSearchFor, final String arg ) {
        if ( arg.length() == 0 ) {
            err.println( "Warning: ignoring empty search string." );
        } else {
            stringsToSearchFor.add( arg );
        }
    }

    /**
     * Remove control characters, i.e. chars 0..31 and 127.
     *
     * @param s string to clean
     *
     * @return s with the control characters removed
     */
    private static String stripControlChars( final String s ) {
        final StringBuilder sb = new StringBuilder( s.length() );
        for ( int i = 0; i < s.length(); i++ ) {
            final char c = s.charAt( i );
            // char is unsigned, so there is no need to test for c >= 0.
            if ( c > 31 && c != 127 ) {
                sb.append( c );
            }
        }
        return sb.toString();
    }

    /**
     * Determine if this line is wanted.
     * With allMustMatch, every search string must be contained in the line and every regex must
     * find a match; otherwise a single string or regex hit suffices.
     *
     * @param line line from the file
     *
     * @return true if this line is wanted, to be printed
     */
    boolean isLineWanted( final String line ) {
        if ( allMustMatch ) {
            for ( String s : stringsToSearchFor ) {
                if ( !line.contains( s ) ) {
                    return false;
                }
            }
            for ( Pattern p : regexesToSearchFor ) {
                final Matcher m = p.matcher( line );
                if ( !m.find() ) {
                    return false;
                }
            }
            return true;
        } else {
            for ( String s : stringsToSearchFor ) {
                if ( line.contains( s ) ) {
                    return true;
                }
            }
            for ( Pattern p : regexesToSearchFor ) {
                final Matcher m = p.matcher( line );
                if ( m.find() ) {
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Scan one file, emitting each wanted line, either raw on stdout or, when showWhereLinesFound,
     * as a CSV record of file name, 1-based line number and line.
     *
     * @param fileBeingProcessed the file currently being processed.
     *
     * @throws java.io.IOException on read problems.
     */
    @SuppressWarnings( { "SameParameterValue" } )
    private void processFile( File fileBeingProcessed ) throws IOException {
        // O P E N, with C L O S E guaranteed by try-with-resources
        try ( BufferedReader br = EIO.getBufferedReader( fileBeingProcessed, 128 * 1024, EIO.UTF8 ) ) {
            // R E A D
            int lineNumber = 0;
            String line;
            // readLine returns null at EOF
            while ( ( line = br.readLine() ) != null ) {
                lineNumber++;
                if ( isLineWanted( line ) ) {
                    if ( showWhereLinesFound ) {
                        csv.put( EIO.getCanOrAbsPath( fileBeingProcessed ) );
                        csv.put( lineNumber );
                        csv.put( line );
                        csv.nl();
                    } else {
                        out.println( line );
                    }
                }
            }
        }
    } // processFile

    /**
     * Extracts lines in files that contain a given string, for every file on the command line.
     *
     * @throws IllegalArgumentException if a file is missing or cannot be processed.
     */
    void processFiles() {
        if ( showWhereLinesFound ) {
            // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim
            // DO NOT CHANGE THIS TO getPrintWriter
            csv = new CSVWriter(
                    new PrintWriter( new BufferedWriter( new OutputStreamWriter( System.out ) ) ),
                    0 /* minimal quoting */,
                    ',',
                    '\"',
                    '#',
                    false /* no trim */ );
        }
        try {
            for ( File file : commandLine ) {
                try {
                    processFile( file );
                } catch ( FileNotFoundException e ) {
                    throw new IllegalArgumentException(
                            "Error: " + EIO.getCanOrAbsPath( file ) + " not found.", e );
                } catch ( Exception e ) {
                    throw new IllegalArgumentException(
                            e.getMessage() + " in file " + EIO.getCanOrAbsPath( file ), e );
                }
            } // end for
        } finally {
            // close even on failure, so the CSV writer is never left open.
            if ( showWhereLinesFound ) {
                csv.close();
            }
        }
    }

    /**
     * Extracts lines in files that contain a given string.
     *
     * @param args strings to search for, then a -, then names of files to process, no wildcards.
     *             strings are case-sensitive, not regexes.
     *             -all switch means all strings must match
     *             -where switch means display where each line found file/line #
     */
    public static void main( String[] args ) {
        try {
            parse( args );
            new Extract().processFiles();
        } catch ( Exception e ) {
            displayError( e, "extract", args );
        }
    } // end main
} // end Extract