/* * [DeDup.java] * * Summary: Removes adjacent duplicate lines from a text file. * * Copyright: (c) 2002-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2002-01-01 initial * 1.1 2002-01-01 allow multiple files on the command line. * trim leading and trailing blank lines. * ensures consistent use of \r\n on Windows, or equivalent for platform. * ensures file ends with exactly one \r\n * 1.2 2005-07-16 add Javadoc * 1.3 2005-07-27 add more bad extensions. * 1.4 2006-03-05 reformat with IntelliJ, add Javadoc. * 1.5 2007-06-24 add pad, icon. */ package com.mindprod.dedup; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.hunkio.HunkIO; import java.awt.Toolkit; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import static java.lang.System.*; /** * Removes adjacent duplicate lines from a text file. *

* Trims trailing blanks on each line. * Trims leading and trailing blank lines. * If nothing changed, file date will not be disturbed. * Case sensitive compare, Only compares adjacent lines. Does not sort the file * first. * converts all Unix, DOS, or Mac line terminators to the platform style. *

* usage: java com.mindprod.dedup.DeDup MySource.txt another.txt * or with JET: * dedup.exe MySource.txt another.txt * * @author Roedy Green, Canadian Mind Products * @version 1.5 2007-06-24 add pad, icon. * @since 2002-01-01 */ public final class DeDup { private static final int FIRST_COPYRIGHT_YEAR = 2002; private static final String RELEASE_DATE = "2007-06-24"; private static final String TITLE_STRING = "DeDup"; private static final String VERSION_STRING = "1.5"; /** * don't need undisplayed copyright notice, since have banner. */ private static final String[] badExtensions = { "ans", "asm", "bat", "batfrag", "blk", "bmp", "bod", "btm", "btmfrag", "c", "cfrag", "class", "cmd", "com", "cpp", "cppfrag", "css", "cssfrag", "csv", "csvfrag", "dat", "dll", "doc", "e", "exe", "gif", "h", "hfrag", "hpp", "hppfrag", "htm", "html", "htmlfrag", "ico", "ih", "ini", "jar", "java", "javafrag", "jnlp", "jnlpfrag", "jpg", "jsp", "jspfrag", "mac", "mbx", "mft", "obj", "p7b", "pas", "png", "policy", "prn", "properties", "ps", "rh", "seq", "ser", "sh", "site", "so", "sql", "sqlfrag", "sym", "tab", "toc", "use", "usg", "wiki", "xml", "xmlfrag", "zip", }; /** * extensions known safe to run DeDup on. */ private static final String[] goodExtensions = { "ctl", "list", "log", "lst", "txt", }; /** * output "after" file name, the temporary, later renamed to match the input */ static String outFilename; /** * which line end convention do we use */ static boolean unix = false; /** * input "before" file */ private static File inFile; /** * input "before" file name */ private static String inFilename; /** * input "before" reader */ private static BufferedReader inReader; /** * output "after" file */ private static File outFile; /** * output "after" file writer */ private static PrintWriter outWriter; /** * display a banner about the author */ private static void banner() { /* Usually not displayed, just embedded. */ out.println( TITLE_STRING + " " + VERSION_STRING + "\n" + "\nFreeware to remove adjacent duplicate lines." + "\nCopyright: (c) 2002-2017 Roedy Green, Canadian Mind Products" + "\n#101 - 2536 Wark Street, Victoria, BC Canada V8T 4G8" + "\nTelephone: (250) 361-9093 Internet:roedyg@mindprod.com" + "\nMay be used freely for non-military use only\n" + "released: " + RELEASE_DATE + "\n\n" + "Do not confuse with com.mindprod.csv.CSVDeDup.\n\n" ); } // end banner /** * Ask user to confirm that some action is ok. * * @param prompt Question to ask the user. * * @return true if the user answers, yes it is ok to proceed. Should redo this with a modal dialog so don't have to * hit Y enter. */ private static boolean confirm( String prompt ) { /* just give a warning */ out.print( prompt ); out.print( " (Y)es (N)o " ); while ( true ) {/* loop forever till user enters Y or N */ honk(); int response = '\033';// default esc try { // read single keystroke, even though user has to hit enter. response = System.in.read();// the console is FileInputReader } catch ( IOException e ) { } response = Character.toUpperCase( ( char ) response ); switch ( response ) { case 'Y': out.println( " Yes" ); return true; case 'N': out.println( " No" ); return false; /* others, keep looping */ } // end switch } // end while } // end confirm /** * Guts of the class. This is the dedup logic. copy inReader to outWriter, processing tabs and line ends Presume * files already open. Does not close them. * * @throws java.io.IOException if cannot reaod/write file */ private static void deDupFile() throws IOException { String prevLine = null; String thisLine; boolean inLeading = true; boolean pendingBlankLine = false; while ( ( thisLine = inReader.readLine() ) != null ) { thisLine = ST.trimTrailing( thisLine ); if ( thisLine.length() == 0 ) { pendingBlankLine = true; } else if ( !thisLine.equals( prevLine ) ) { // deal first with and pending blank lines if ( inLeading ) { // ignore leading blank lines. inLeading = false; pendingBlankLine = false; } else { if ( pendingBlankLine ) { // emit just one embedded blank line, collapse dup blank // lines. outWriter.println(); pendingBlankLine = false; } } // deal with the unique line outWriter.println( thisLine ); prevLine = thisLine; } }/* end while */ // fall out the end with pendingBlankLine we just totally ignore. // that is how we trim trailing blanks. } // end deDupFile /** * abort the run, clean up as best as possible. */ private static void die() { honk(); try { if ( inReader != null ) { inReader.close(); } if ( outWriter != null ) { outWriter.close(); } } catch ( IOException e ) { } System.exit( 1 );/* exit with errorlevel = 1 */ } // end die /** * make sure the filename we are about to process has a safe extension. */ private static void ensureSafeFilename() { /* * Ensure appropriate file name extensions. good =.txt etc - done * without prompt bad =.exe etc. - abort warning =.doc & others - ask */ String extension = ""; int whereDot = inFilename.lastIndexOf( '.' ); if ( whereDot >= 0 && whereDot <= inFilename.length() - 2 ) { extension = inFilename.substring( whereDot + 1 ); } for ( final String goodExtension : goodExtensions ) { if ( extension.equalsIgnoreCase( goodExtension ) ) {/* match, it is Good */ return; } } for ( final String badExtension : badExtensions ) { if ( extension.equalsIgnoreCase( badExtension ) ) {/* match, it is bad */ inFile = null; return; } } /* just give a warning */ if ( !confirm( "\n Warning!\n" + " DeDup is not usually used on files such as " + inFilename + ".\n" + " Do you want to dedup anyway?" ) ) { inFile = null; } } // end ensureSafeFilename /** * make a noise */ private static void honk() { Toolkit.getDefaultToolkit().beep(); } // end honk /** * open the input "before" file */ private static void openInReader() { try { inFile = new File( inFilename ); if ( !inFile.exists() ) { banner(); out.print( "Oops! Cannot find file " ); out.println( inFilename ); die(); } // ignore directories, usually put there by wildcard expansion. if ( inFile.isDirectory() ) { inFile = null; // keep going return; } if ( !inFile.canRead() ) { banner(); out.print( "Oops! no permission to read (i.e. examine) the file " ); out.println( inFilename ); die(); } if ( !inFile.canWrite() ) { // canWrite true implies file exists. banner(); out.print( "Oops! no permission to write (i.e. change) the file " ); out.println( inFilename ); die(); } inReader = new BufferedReader( new FileReader( inFile ), 4 * 1024 /* buffsize */ ); } catch ( FileNotFoundException e ) { banner(); out.print( "Oops! Cannot open file " ); out.println( inFilename ); die(); } } // end openInReader /** * open the output "after" file */ private static void openOutWriter() { try { // get a temporary file in the same directory as inFile. // outFile = getTempFile("DeDup", inFile); outFile = File.createTempFile( "dedup", "tmp", inFile .getParentFile() ); outWriter = EIO.getPrintWriter( outFile, 64 * 1024, EIO.UTF8 ); } catch ( IOException e ) { out.println( "Oops! Cannot create the temporary work file\n" ); die(); } } // end OpenOutWriter /** * Command line utility to remove adjacent duplicate lines. * * @param args list of filenames to dedup. */ public static void main( String[] args ) { try { // process each file on command line, or expanded wild card. for ( final String arg : args ) { inFilename = arg; openInReader();/* Open input "before" file. */ /* Make sure file exists before */ /* song and dance about extension. */ if ( inFile == null ) { /* ignore */ out.println( "- " + inFilename + " : could not open. Directory or unreadable file" ); continue; } ensureSafeFilename();/* make sure filename has sane extension */ if ( inFile == null ) { /* ignore */ out.println( "- " + inFilename + " : bypassed based on extension" ); continue; } openOutWriter();/* open output "after" file */ /* * copy inReader to outWriter removing duplicate lines, trailing * spaces, and lead/trailing blank lines */ deDupFile(); /* * if we trimmed, changed line ends, removed dups, file size * should change. In a pathological case it would not, but then * we do no damage. */ inReader.close(); outWriter.close(); if ( inFile.length() == outFile.length() ) { // nothing changed out.println( "- " + inFilename + " : contained no duplicate lines. Left as is." ); } else { // file really did change. out.println( "* " + inFilename + " : changed!" ); HunkIO.deleteAndRename( outFile, inFile ); } } // end for } catch ( IOException e ) { out.print( "Oops! IO failure. e.g. cannot find file.\n" ); die(); } } // end main }