/*
 * [CSVRecode.java]
 *
 * Summary: Recode a column of a CSV file with a list of from:to pairs.
 *
 * Copyright: (c) 2010-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2010-12-30 initial version
 *  1.1 2011-01-25 allow you to specify encoding
 *  1.2 2011-03-08 eliminate all but first duplicate in patch file. allow comments in patch file,
 *                 allow 2+ cols in patch file.
 *  1.3 2011-10-13 fix bug, would get two records if patched two fields.
 *  1.4 2011-10-16 ignore trailing / when matching
 *  1.5 2016-05-05 rename from CSVPatch to CSVRecode
 *  1.6 2016-07-01 add -miss parameter
 */
package com.mindprod.csv;

import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.hunkio.HunkIO;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.HashMap;

import static java.lang.System.*;

/**
 * Recode a column of a CSV file with a list of from:to pairs.
 * <p>
 * Use: java.exe com.mindprod.csv.CSVRecode somefile.csv fromto.csv [-miss xx] 0 3 (columns to patch)
 * Typically used to update URLs in a column with recently discovered replacements.
 * Both columns and the files must use a consistent &amp; and &amp;amp; plain/entity convention for &amp; in URLs
 * if you are replacing URLs in HTML text.
 * In contrast CSVReplaceURLs modifies HTML text.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.6 2016-07-01 add -miss parameter
 * @see com.mindprod.csv.CSVReplaceURLs
 * @since 2010-12-30
 */
public final class CSVRecode
    {
    /**
     * how to use the command line
     */
    private static final String USAGE =
            "\nCSVRecode needs filename_to_patch.csv file_of_from_to_pairs.csv [-miss xx] 0 3 ... "
            + "(0-based cols to patch)";

    /**
     * true to print extra diagnostic output.
     */
    private static final boolean DEBUGGING = false;

    /**
     * Patch a CSV file, constructor. Just create. There are no methods to call.
     * All the work happens here: the from:to pairs are loaded into RAM, the file to patch is copied
     * field by field to a temporary file with the requested columns recoded, and finally the temporary
     * file is handed to HunkIO.deleteAndRename to take the place of the original.
     *
     * @param fileBeingProcessed CSV file to be patched.
     * @param fileOfPairs        CSV file of pairs from,to (extra cols ignored)
     * @param separatorChar      field separator character, usually ',' in North America,
     *                           ';' in Europe and sometimes '\t' for tab.
     * @param quoteChar          char to use to enclose fields containing a separator, usually '\"'.
     *                           Use (char)0 if you don't want a quote character.
     * @param commentChar        char to use to introduce comments. Use (char) 0 if none.
     *                           Only one character allowed.
     * @param encoding           encoding of input and output.
     * @param miss               value to use when there is no match. null means leave as is.
     * @param colsToPatch        list of columns that should be patched, 0-based.
     *
     * @throws java.io.IOException      if problems reading/writing the files.
     * @throws IllegalArgumentException if a column number is negative, or a line of the pairs file
     *                                  does not have both a from and a to field.
     */
    @SuppressWarnings( { "WeakerAccess" } )
    public CSVRecode( final File fileBeingProcessed,
                      final File fileOfPairs,
                      final char separatorChar,
                      final char quoteChar,
                      final char commentChar,
                      final Charset encoding,
                      final String miss,
                      final int... colsToPatch ) throws IOException
        {
        if ( DEBUGGING )
            {
            out.println( "miss: " + miss );
            }
        // Reject bad column numbers before touching any file. A negative index would otherwise
        // slip past the col < fields.length test and blow up in the middle of the run.
        for ( int col : colsToPatch )
            {
            if ( col < 0 )
                {
                throw new IllegalArgumentException( "column numbers to patch must be 0 or greater: " + col );
                }
            }
        // hold translate from:to pairs in RAM
        final HashMap<String, String> translator = new HashMap<>( 2048 );
        final String commentChars = ( commentChar == 0 ) ? "" : String.valueOf( commentChar );
        final int pairCount =
                loadPairs( fileOfPairs, separatorChar, quoteChar, commentChars, encoding, translator );

        // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, trimUnquoted,
        // allowMultipleLineFields
        final CSVReader r = new CSVReader( EIO.getBufferedReader( fileBeingProcessed, 64 * 1024, encoding ),
                separatorChar,
                quoteChar,
                commentChars,
                false /* process comments */,
                true /* trimQuoted */,
                true /* trimUnquoted */,
                true /* allow multi-line */ );
        final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed );
        final PrintWriter pw = EIO.getPrintWriter( tempFile, 64 * 1024, encoding );
        // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim
        final CSVWriter w = new CSVWriter( pw,
                0 /* minimal */,
                separatorChar,
                quoteChar,
                commentChar,
                true /* trim */ );
        int fieldsPatched = 0;
        boolean complete = false;
        try
            {
            // The reader signals end of file with EOFException; that is the only way out of this loop.
            while ( true )
                {
                final String[] fields = r.getAllFieldsInLine();
                final boolean lastFieldWasComment = r.wasComment();
                fieldsPatched += recodeFields( fields, colsToPatch, translator, miss );
                w.nl( fields, lastFieldWasComment );
                }
            }
        catch ( EOFException e )
            {
            // normal termination. Collect the counts before the reader and writer are closed.
            out.println( r.lineCount()
                         + " lines read, "
                         + pairCount
                         + " pairs read, "
                         + fieldsPatched
                         + " fields patched, "
                         + w.getLineCount()
                         + " lines written." );
            complete = true;
            }
        finally
            {
            // Close both files whether we finished or failed, so nothing is left open.
            try
                {
                r.close();
                }
            finally
                {
                w.close();
                // On failure the original file is left untouched; tidy up the partial output, best effort.
                if ( !complete && tempFile.exists() && !tempFile.delete() )
                    {
                    err.println( "Could not delete temporary file " + tempFile );
                    }
                }
            }
        // only reached on normal completion; any other exception has already propagated.
        HunkIO.deleteAndRename( tempFile, fileBeingProcessed );
        } // /method

    /**
     * Read the from,to pairs file into the translator map.
     * Trailing / characters are stripped from each from value because matching ignores them.
     *
     * @param fileOfPairs   CSV file of pairs from,to (extra cols ignored)
     * @param separatorChar field separator character.
     * @param quoteChar     char used to enclose fields containing a separator.
     * @param commentChars  characters that introduce a comment, "" if none.
     * @param encoding      encoding of the pairs file.
     * @param translator    map to fill with from -> to entries.
     *
     * @return the reader's line count when end of file was reached.
     * @throws java.io.IOException      if problems reading the pairs file.
     * @throws IllegalArgumentException if a line does not have both a from and a to field.
     */
    private static int loadPairs( final File fileOfPairs,
                                  final char separatorChar,
                                  final char quoteChar,
                                  final String commentChars,
                                  final Charset encoding,
                                  final HashMap<String, String> translator ) throws IOException
        {
        // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, trimUnquoted,
        // allowMultipleLineFields
        final CSVReader inPairs = new CSVReader( EIO.getBufferedReader( fileOfPairs, 64 * 1024, encoding ),
                separatorChar,
                quoteChar,
                commentChars,
                true /* hide comments */,
                true /* trimQuoted */,
                true /* trimUnquoted */,
                true /* allow multi-line */ );
        try
            {
            // The reader signals end of file with EOFException; that is the only way out of this loop.
            while ( true )
                {
                String from = inPairs.get();
                // ignore blank line
                if ( from == null )
                    {
                    continue;
                    }
                // we match ignoring trailing /
                from = ST.trimTrailing( from, '/' );
                final String to = inPairs.get();
                if ( to == null || from.length() == 0 )
                    {
                    throw new IllegalArgumentException( "pairs file "
                                                        + fileOfPairs.toString()
                                                        + " must contain at "
                                                        + "least two fields (from,to) on each line" );
                    }
                // extra columns are ignored
                inPairs.skipToNextLine();
                // NOTE(review): put() makes the LAST duplicate win, whereas version history 1.2 says
                // "eliminate all but first duplicate". Behaviour left as is; confirm which is intended.
                final String old = translator.put( from, to );
                if ( old != null )
                    {
                    err.println( "duplicate pair " + from + " -> " + old + " and " + from + " -> " + to );
                    }
                }
            }
        catch ( EOFException e )
            {
            // normal termination
            return inPairs.lineCount();
            }
        finally
            {
            // also runs when a malformed line aborts the load, so the file is never left open.
            inPairs.close();
            }
        } // /method

    /**
     * Recode, in place, the requested columns of one line of fields.
     *
     * @param fields      fields of the current line; entries are overwritten with their replacements.
     * @param colsToPatch 0-based columns to recode. Columns beyond the end of this line are skipped.
     * @param translator  from -> to lookup, keyed by the from value with trailing / removed.
     * @param miss        value to use when there is no match. null means leave the field as is.
     *
     * @return number of fields that were given a new value.
     */
    private static int recodeFields( final String[] fields,
                                     final int[] colsToPatch,
                                     final HashMap<String, String> translator,
                                     final String miss )
        {
        int patched = 0;
        // patch each of the columns requested.
        for ( int col : colsToPatch )
            {
            if ( col < fields.length )
                {
                // we insist on exact match, except for trailing /
                final String from = ST.trimTrailing( fields[ col ], '/' );
                // currently insists that xxx.html, ?xxx, #xxxx also match.
                // todo: should not insist on match.
                // should keep tail if new has none.
                // should replace tail if new has tail.
                final String newValue = translator.get( from );
                if ( newValue != null )
                    {
                    fields[ col ] = newValue;
                    patched++;
                    }
                else if ( miss != null )
                    {
                    // not in lookup table.
                    fields[ col ] = miss;
                    patched++;
                    }
                // else leave old value alone, not in lookup table.
                // most of the time we make no change. The value won't be in the HashMap.
                }
            }
        return patched;
        } // /method

    /**
     * Simple command line interface to CSVRecode. Patches one csv file whose name is on the command line
     * with a list of from to pairs in another csv file. You specify which columns to translate from to to.
     * Both files must have the extension .csv
     * Files are processed as UTF-8 with ',' separator, '"' quote and '#' comment characters.
     *
     * @param args name of csv file to patch, from-to pairs file, optional -miss value,
     *             0-based columns to patch (at least one).
     *
     * @throws IllegalArgumentException if the command line is malformed.
     */
    public static void main( String[] args )
        {
        if ( args.length < 3 )
            {
            throw new IllegalArgumentException( USAGE );
            }
        final String fileToPatchName = args[ 0 ];
        if ( !fileToPatchName.endsWith( ".csv" ) )
            {
            throw new IllegalArgumentException( "Bad filenametopatch Extension\n" + USAGE );
            }
        final String fileOfPatchesName = args[ 1 ];
        if ( !fileOfPatchesName.endsWith( ".csv" ) )
            {
            throw new IllegalArgumentException( "Bad fromtopairs Extension\n" + USAGE );
            }
        // filename_to_patch.csv file_of_from_to_pairs.csv [-miss xx] 0 3 (0-based cols to patch)";
        String miss = null;
        // number of leading args that are not column numbers
        int bypass = 2;
        if ( args[ 2 ].equals( "-miss" ) )
            {
            // need the replacement value and at least one column number after -miss,
            // otherwise the file would be rewritten without patching anything.
            if ( args.length < 5 )
                {
                throw new IllegalArgumentException( "Not enough parameters\n" + USAGE );
                }
            miss = args[ 3 ];
            bypass = 4;
            }
        final int[] colsToPatch = new int[ args.length - bypass ];
        try
            {
            for ( int i = bypass; i < args.length; i++ )
                {
                // we are copying and converting to int.
                colsToPatch[ i - bypass ] = Integer.parseInt( args[ i ] );
                }
            }
        catch ( NumberFormatException e )
            {
            // keep the cause so the offending argument shows up in the stack trace
            throw new IllegalArgumentException( USAGE, e );
            }
        final File fileToPatch = new File( fileToPatchName );
        final File fileOfPatches = new File( fileOfPatchesName );
        try
            {
            // file, separatorChar, quoteChar, commentChar
            new CSVRecode( fileToPatch, fileOfPatches, ',', '\"', '#', CSV.UTF8, miss, colsToPatch );
            }
        catch ( IOException e )
            {
            err.println();
            err.println( "IO problem: " + e.getMessage() );
            e.printStackTrace( err );
            err.println( "CSVRecode failed to patch " + EIO.getCanOrAbsPath( fileToPatch ) );
            // help the user narrow down which file is at fault
            if ( !fileToPatch.canRead() )
                {
                err.println( "Cannot read fileToPatch: " + fileToPatch.getAbsolutePath() );
                }
            if ( !fileToPatch.canWrite() )
                {
                // canWrite true implies file exists.
                err.println( "Cannot write fileToPatch:" + fileToPatch.getAbsolutePath() );
                }
            if ( !fileOfPatches.canRead() )
                {
                err.println( "Cannot read fileOfPatches: " + fileOfPatches.getAbsolutePath() );
                }
            err.println();
            }
        } // /method
    }