/*
* [CSVSort.java]
*
* Summary: Sort a CSV file, on multiple columns with case-sensitive, case-insensitive and numeric fields.
*
* Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2002-03-27 initial release
* 1.1 2002-03-28 close
* configurable separator char
* no longer sensitive to line-ending convention.
* uses a categorise routine to message categories for use in case clauses.
* faster skipToNextLine
* 1.2 2002-04-17 put in to separate package
* 1.3 2002-04-17
* 1.4 2002-04-19 fix bug if last field on line is empty, was not counting as a field.
* 1.5 2002-04-19
* 1.6 2002-05-25 allow choice of " or ' quote char.
* 1.7 2002-08-29 getAllFieldsInLine
* 1.8 2002-11-12 allow Microsoft Excel format fields that can span several lines. sponsored by Steve Hunter of
* agilense.com
* 1.9 2002-11-14 trim parameter to control whether fields are trimmed of lead/trail whitespace (blanks, Cr, Lf,
* Tab etc.)
* 2.0 2003-08-10 getInt, getLong, getFloat, getDouble
* 2.1 2005-07-16 reorganisation, new bat files.
* 2.2 2005-08-28 add CSVAlign and CSVPack to the suite.
* 2.3 2005-08-28 add CSVAlign and CSVPack to the suite.
* Use java com.mindprod.CSVAlign somefile.csv
* 2.4 2007-05-20 add icon and PAD
* 2.5 2007-11-27 tidy comments
* 2.6 2008-02-20 IntelliJ inspector, spell corrections, tightening code.
* 2.7 2008-05-28 add CSVTab2Comma.
* 2.8 2008-06-04 add CSVWriter put for various primitives.
* 2.9 2009-03-27 refactor using enums, support comments.
* major rewrite. Now supports #-style
* comments. More efficient RAM use. You can configure the
* separator character, quote character and comment character.
* You can read seeing or hiding the comments. The API was
* changed to support comments.
* 3.0 2009-06-15 lookup table to speed CSVReader
* 3.1 2009-12-03 add CSVSort
* 3.2 2010-02-23 add hex sort 9x+ option to CSVSort
* 3.3 2010-11-14 change default to no comments in input file for CSVTab2Comma.
* 3.4 2010-12-03 add CSV2SRS
* 3.5 2010-12-11 add CSVReshape
* 3.6 2010-12-14 add LinesToCSV
* 3.7 2010-12-17 fix bug in descending sorts.
* 3.8 2011-01-25 allow you to specify encoding
* 3.9 2011-02-08 add support for sorting by field length.
* 4.0 2011-03-10 allow numeric sort on empty columns.
* 4.1 2012-01-25 redefine meaning of n option and add d option. You may need to change some n to d in your scripts.
* 4.2 2012-02-07 treat empty numeric columns as 0.
* 4.3 2012-02-11 add -f for sorting by family name, i.e. surname/given name.
* 4.4 2012-02-15 refine algorithm for sorting by family name.
* 4.5 2012-02-20 fix bug. -d option was being improperly refused.
* 4.6 2012-07-31 fix sort on surname van Pelt, Mc Mac
* 4.7 2014-05-14 allow $ as sort type
*/
package com.mindprod.csv;
import com.mindprod.common18.EIO;
import com.mindprod.common18.FullName;
import com.mindprod.common18.ST;
import com.mindprod.hunkio.HunkIO;
import org.jetbrains.annotations.NotNull;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import static java.lang.System.*;
/**
* Sort a CSV file, on multiple columns with case-sensitive, case-insensitive and numeric fields.
*
* Use: java.exe com.mindprod.CSVSort somefile.csv 0s+ 2i- 5n+ 7d+ 3x+ 4l+
* 0-based list of column numbers, s=string i=case-insensitive string, d=(double,float) n=number(int,
* long) x=hex l=length of field +=ascending -=descending
* Empty cols are presumed to contain "" or "0".
*
* This program has an unusual genesis. On 2009-12-03 I had a detailed dream about documenting
* and writing a sort for CSV files. When I woke up I was surprised at how logical the dream was.
* Usually my computer dreams, on waking, turn out to be complete nonsense. I wrote the program
* just as I planned it in the dream. All went unusually smoothly. About the only change was provision
* for ascending and descending keys.
*
* @author Roedy Green, Canadian Mind Products
* @version 4.7 2014-05-14 allow $ as sort type
* @since 2009-12-03
*/
public final class CSVSort
{
// declarations
/**
* true if want extra debug output
*/
private static final boolean DEBUGGING = false;
/**
* how to use the command line
*/
private static final String USAGE = "\nUsage: csvsort.jar somefile.csv 0s+ 2i- 7d+ 6$+ 5n+ 3x+ 4l+ 8f- .." +
".\n"
+ "s=case-sensitive string i=case-insensitive string, d=(double," +
"float) $=dollar n=number(int,long) x=hex (\\u 0x ff) l=length of field f=family name";
/**
* true if sort on col is ascending
*/
private final boolean[] isAscendings;
/**
* type of sort for each column
*/
private final char[] sortTypes;
/**
* how many columns to sort
*/
private final int countOfColsToSort;
/**
* which 0-based cols to sort in order
*/
private final int[] sortCols;
/**
* which line number we are parsing
*/
private int lineNumber;
// /declarations
// methods
/**
* Sort a CSV file, constructor. Just create. There are no methods to call.
*
* @param fileBeingProcessed CSV file to be packed to remove excess space and quotes.
* @param sortCols array of 0-based cols to sort on.
* @param sortTypes array of chars with letters s i n x to tell how to sort each column.
* @param isAscendings array of sort direction, true=ascending, false = descending.
* @param separatorChar field separator character, usually ',' in North America,
* ';' in Europe and sometimes '\t' for
* tab.
* @param quoteChar char to use to enclose fields containing a separator, usually '\"'. Use (char)0 if
* you don't want a quote character.
* @param commentChar char to use to introduce comments. Use (char) 0 if none. Only one character allowed.
* @param encoding encoding for input and output.
*
* @throws java.io.IOException if problems reading/writing file
*/
@SuppressWarnings( { "WeakerAccess" } )
public CSVSort( final File fileBeingProcessed,
final int[] sortCols,
final char[] sortTypes,
final boolean[] isAscendings,
final char separatorChar,
final char quoteChar,
final char commentChar,
final Charset encoding ) throws IOException
{
this.countOfColsToSort = sortCols.length;
this.sortCols = sortCols;
this.sortTypes = sortTypes;
this.isAscendings = isAscendings;
final String commentChars = ( commentChar == 0 ) ? "" : String.valueOf( commentChar );
final int estimatedLines = ( int ) fileBeingProcessed.length() / 50;
final ArrayList allRows = new ArrayList<>(
estimatedLines );
final ArrayList pendingComments = new ArrayList<>( 11 );
final ArrayList topComments = new ArrayList<>( 11 );
// reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, trimUnquoted,
// allowMultipleLineFields
final CSVReader r = new CSVReader( EIO.getBufferedReader( fileBeingProcessed, 64 * 1024, encoding ),
separatorChar, quoteChar, commentChars,
false, true /* trimQuoted */, true /* trimUnquoted */, true
);
// read everything into allRows ArrayList
lineNumber = 0;
// are we parsing comments at the top of the file?, which are frozen there,
// rather than following the following data record.
boolean inTopComments = true;
try
{
while ( true )
{
String[] fields = r.getAllFieldsInLine();
lineNumber++;
if ( r.wasComment() )
{
if ( fields.length == 1 )
{
// was comment on a line by itself.
// save them up to prepend to next real record.
if ( inTopComments )
{
topComments.add( fields[ 0 ] );
}
else
{
pendingComments.add( fields[ 0 ] );
}
}
else
{
// just a tail comment
inTopComments = false;
final String[] leadComments;
if ( pendingComments.size() > 0 )
{
leadComments = pendingComments.toArray( new String[ pendingComments.size() ] );
pendingComments.clear();
}
else
{
leadComments = new String[ 0 ];
}
final String[] dataFields = new String[ fields.length - 1 ];
System.arraycopy( fields, 0, dataFields, 0, fields.length - 1 );
allRows.add( new SortableCSVRow( dataFields, leadComments, fields[ fields.length - 1 ] ) );
}
}
else
{
// no comment on tail of this data line.
inTopComments = false;
final String[] leadComments;
if ( pendingComments.size() > 0 )
{
leadComments = pendingComments.toArray( new String[ pendingComments.size() ] );
pendingComments.clear();
}
else
{
leadComments = new String[ 0 ];
}
allRows.add( new SortableCSVRow( fields, leadComments, null ) );
}
}
}
catch ( EOFException e )
{
r.close();
}
// pendingComments might contain footer comments.
// in-ram sort
Collections.sort( allRows );
final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed );
// writer, quoteLevel, separatorChar, quoteChar, commentChar, trim
final PrintWriter pw = EIO.getPrintWriter( tempFile, 32 * 1024, encoding );
final CSVWriter w = new CSVWriter( pw, 0 /* minimal */, separatorChar, quoteChar, commentChar, true );
for ( String topComment : topComments )
{
w.nl( topComment );
}
for ( SortableCSVRow sr : allRows )
{
sr.emit( w ); // also outputs embedded comments.
}
for ( String footComment : pendingComments )
{
w.nl( footComment );
}
out.println( lineNumber + " lines sorted" );
w.close(); // closes pw indirectly
HunkIO.deleteAndRename( tempFile, fileBeingProcessed );
}// /method
/**
* extract surname from field, might have multiple names, lead given name.
*
* @param name complete name
*
* @return Surname
*/
private static String extractSurname( String name )
{
name = name.toLowerCase();
// could be a comma separated list or and-separated or &-separated.
int p = name.indexOf( ',' );
if ( p >= 0 )
{
name = name.substring( 0, p );
}
p = name.indexOf( '&' );
if ( p >= 0 )
{
name = name.substring( 0, p );
}
p = name.indexOf( " and " );
if ( p >= 0 )
{
name = name.substring( 0, p );
}
p = name.indexOf( " with " );
if ( p >= 0 )
{
name = name.substring( 0, p );
}
name = name.trim();
// trim off trailing [author] etc.
int len = name.length();
if ( len > 0 && name.charAt( len - 1 ) == ']' )
{
p = name.lastIndexOf( '[' );
if ( p >= 0 )
{
name = name.substring( 0, p );
name = name.trim();
}
}
return new FullName( name ).getSortable();
}// /method
/**
* Simple command line interface to CSVSort, Sorts one csv file whose name is on the command line. Must have
* extension .csv and sort cols of form: 0s+ 2i- 4n+ 5x+
*
* @param args name of csv file to sort, followed by cols to sort.
*/
public static void main( String[] args )
{
if ( args.length < 2 )
{
throw new IllegalArgumentException( USAGE );
}
String filename = args[ 0 ];
if ( !filename.endsWith( ".csv" ) )
{
throw new IllegalArgumentException(
"Bad Extension\n" + USAGE );
}
final File file = new File( filename );
final int keyCount = args.length - 1;
final char[] sortTypes = new char[ keyCount ];
final int[] sortCols = new int[ keyCount ];
final boolean[] isAscendings = new boolean[ keyCount ];
for ( int i = 0; i < keyCount; i++ )
{
// skip first parm
final String arg = args[ i + 1 ];
if ( !( 3 <= arg.length() && arg.length() <= 4 ) )
{
// must have form 9s+ or 99s+
throw new IllegalArgumentException(
"bad sort col\n" + USAGE );
}
sortCols[ i ] = Integer.parseInt( arg.substring( 0, arg.length() - 2 ) );
final char charSortType = arg.charAt( arg.length() - 2 );
if ( !ST.isLegal( charSortType, "sid$nxlf" ) )
{
throw new IllegalArgumentException(
"Sort column letter must be s/i/d/$/n/x/l/f\n" + USAGE );
}
sortTypes[ i ] = charSortType;
final char direction = arg.charAt( arg.length() - 1 );
if ( !ST.isLegal( direction, "+-" ) )
{
throw new IllegalArgumentException(
"Sort column letter direction must be +/-\n" + USAGE );
}
isAscendings[ i ] = direction == '+';
}
try
{
// file, cols, types, directions, separatorChar, quoteChar, commentChar
new CSVSort( file, sortCols, sortTypes, isAscendings, ',', '\"', '#', CSV.UTF8 );
}
catch ( IOException e )
{
err.println();
e.printStackTrace( err );
err.println( "CSVSort failed to sort " + EIO.getCanOrAbsPath( file ) );
err.println();
}
}// /method
// /methods
// inner class
class SortableCSVRow implements Comparable
{
/**
* data fields of the row
*/
private final String[] fields;
/**
* comment lines prior to the this row, without lead #
*/
private final String[] leadComments;
/**
* comment line on tail ent of line. null if so such comment, without lead #
*/
private final String tailComment;
/**
* key fields we sort by, String (si) or Double objects (n) or Long object (x)
* Not directly Comparable. Cannot use Comparable since no common T.
*/
private final Comparable[] keys;
/**
* constructor
*
* @param fields List of fields to sort on
* @param leadComments comment lines prior to the this row, without lead #
* @param tailComment line on tail ent of line. null if so such comment, without lead #
*/
SortableCSVRow( String[] fields, String[] leadComments, String tailComment )
{
this.fields = fields;
this.leadComments = leadComments;
this.tailComment = tailComment;
// build list of keys wi will sort on in convenient form.
// arrays and generics do not mix so cannot have Comparable