/* * [CSVAlign.java] * * Summary: align a CSV File into columns. * * Copyright: (c) 1998-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.9 2000-03-27 refactor using enums, support comments. * 3.0 2009-06-15 lookup table to speed CSVReader * 3.1 2009-12-03 add CSVSort * 3.2 2010-02-23 add hex sort 9x+ option to CSVSort * 3.3 2010-11-14 change default to no comments in input file for CSVTab2Comma. * 3.4 2010-12-03 add CSV2SRS * 3.5 2010-12-11 add CSVReshape * 3.6 2011-01-13 add -v verbose option * 3.7 2011-01-19 also align the ## comment. * 3.8 2011-02-25 fix but when column comment had extra cols. left align date headers. */ package com.mindprod.csv; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.fastcat.FastCat; import com.mindprod.hunkio.HunkIO; import java.io.EOFException; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.regex.Pattern; import static java.lang.System.*; /** * align a CSV File into columns. *

* Use: java.exe com.mindprod.CSVAlign somefile.csv * * @author Roedy Green, Canadian Mind Products * @version 3.8 2011-02-25 fix but when column comment had extra cols. left align date headers. * @since 1998 */ public final class CSVAlign { /** * always use windows line separator on any platform since csv is a windows format file. */ private static final String lineSeparator = "\r\n"; /** * how to use the command line */ private static final String USAGE = "\nCSVAlign needs a single filename.csv on the command line with possible -v " + "switch."; /** * used to split Label comment fields apart */ private static final Pattern SPLIT_ON_COMMA = Pattern.compile( "\\s*,\\s*" ); /** * ColumnDescriptors for each column as an ArrayList for pass1 */ private ArrayList colDescriptorList; /** * ColumnDescriptors for each column as an array for pass2 */ private ColumnDescriptor[] colDescriptors; /** * where we accumulate an entire aligned line. Cannot be made local. */ private FastCat sb; /** * The width of an entire aligned line, less the line separator. */ private int alignedLineWidth; /** * how many pending leading spaces are outstanding. * -1 = nothing outstanding. * 0 = comma * 1 = comma then one space. * m. */ private int pending = -1; /** * align a CSV file, constructor. Just create. There are no methods to call. * * @param file CSV file to be aligned in columns. * @param separator field separator character, usually ',' in North America, ';' in Europe and sometimes '\t' for * tab. * @param quote char to use to enclose fields containing a separator, usually '\"' . Use (char)0 if * you don't want a quote character. * @param commentChar char to use to introduce comments. Use (char) 0 if none. Only one character allowed. * @param verbose true if want extra status listing. * @param encoding encoding of input and output. * * @throws IOException if problems reading/writing file */ @SuppressWarnings( { "WeakerAccess" } ) public CSVAlign( final File file, final char separator, final char quote, final char commentChar, final boolean verbose, final Charset encoding ) throws IOException { // read file to get max widths pass1( file, separator, quote, commentChar, verbose, encoding ); // calculate the aggregate width of all the aligned fields. calcStartCols(); // read file again this time writing, widening fields to align. pass2( file, separator, quote, commentChar, encoding ); } /** * prepare an awkward string for output by enclosing it in " and doubling internal " * * @param s output field to prepare * @param quote char to use to enclose fields containing a separator, usually '\"' * * @return string enclosed in quotes with internal quotes doubled. */ private static String prepareAwkward( String s, char quote ) { if ( s.indexOf( quote ) < 0 ) { return quote + s + quote; } else { // StringBuilder is better than FastCat for char by char work. StringBuilder sb = new StringBuilder( s.length() + 10 ); sb.append( quote ); for ( int i = 0, n = s.length(); i < n; i++ ) { char c = s.charAt( i ); if ( c == quote ) { sb.append( quote ); sb.append( quote ); } else { sb.append( c ); } } // end for sb.append( quote ); return sb.toString(); } // end else } /** * calculate the with of the line after we have aligned it. */ private void calcStartCols() { int size = colDescriptorList.size(); colDescriptors = colDescriptorList.toArray( new ColumnDescriptor[ size ] ); colDescriptorList = null; alignedLineWidth = 0; for ( ColumnDescriptor cd : colDescriptors ) { cd.startCol = alignedLineWidth; /* allow one for comma and one for space */ alignedLineWidth += cd.maxWidth + 2; } } /** * append a field to the wholeLine left justified * * @param field field to add, with awkward encoding in place if any. * @param columnWidth width of the column */ private void leftJustify( String field, int columnWidth ) { if ( pending >= 0 ) { sb.append( ',' ); sb.append( ST.spaces( pending ) ); } sb.append( field ); // if there is a subsequent field we have a , plus spaces to pad, plus // space. pending = columnWidth - field.length() + 1; } /** * pass1 of aligning a CSV file, get the maximum column widths * * @param file CSV file to be aligned in columns. * @param separatorChar field separator character, usually ',' in North America, * ';' in Europe and sometimes '\t' for * tab. * @param quoteChar char to use to enclose fields containing a separator, usually '\"' * @param commentChar char to use to introduce comments. Use (char) 0 if none. Only one character allowed. * @param verbose true if want extra status information * @param encoding encoding of input and output file. * * @throws IOException if problems reading */ private void pass1( final File file, final char separatorChar, final char quoteChar, final char commentChar, final boolean verbose, final Charset encoding ) throws IOException {// first pass read final String commentChars = ( commentChar == 0 ) ? "" : String.valueOf( commentChar ); // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, allowMultipleLineFields // on first pass ignore comments. // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, trimUnquoted, // allowMultipleLineFields final CSVReader r = new CSVReader( EIO.getBufferedReader( file, 64 * 1024, encoding ), separatorChar, quoteChar, commentChars, true, true /* trimQuoted */, true /* trimUnquoted */, true ); colDescriptorList = new ArrayList<>( 40 ); try { while ( true ) { final String[] fields = r.getAllFieldsInLine(); boolean wasComment = r.wasComment(); for ( int colIndex = 0; colIndex < fields.length; colIndex++ ) { if ( !wasComment || colIndex != fields.length - 1 ) { // not tail comment. if ( colIndex >= colDescriptorList.size() ) { colDescriptorList.add( new ColumnDescriptor() ); } ColumnDescriptor col = colDescriptorList.get( colIndex ); String field = fields[ colIndex ]; int width = field.length(); final boolean loneEmptyField = fields.length == ( wasComment ? 2 : 1 ) && fields[ 0 ].trim() .length() == 0; // does this field need surrounding quotes? boolean isAwkward = col.isAwkward || loneEmptyField || field.indexOf( separatorChar ) >= 0 || field.indexOf( quoteChar ) >= 0 || field.indexOf( commentChar ) >= 0; if ( isAwkward ) { // extra col for lead/trail quote. width += 2; // extra col for each doubled quote for ( int i = 0; i < field.length(); i++ ) { if ( field.charAt( i ) == quoteChar ) { width++; } } } if ( !col.isAwkward && isAwkward ) { col.isAwkward = true; // earlier non-awkward chars will be displayed in quotes. col.maxWidth += 2; if ( verbose ) { out.println( " Note : col " + colIndex + " quoted because of line " + r.lineCount () + " : " + field ); } } if ( width > col.maxWidth ) { col.maxWidth = width; } if ( !ST.isLegal( field, "0123456789.+-" ) ) { col.isNumeric = false; } if ( field.length() == "yyyy-mm-dd".length() && field.charAt( 4 ) == '-' && field.charAt( 7 ) == '-' ) { // left align col headers of dates. col.isNumeric = false; } } } } } catch ( EOFException e ) { // expected } r.close(); } /** * pass2 of aligning a CSV file, expand fields to the max width * * @param fileBeingProcessed CSV file to be aligned in columns. * @param separatorChar field separator character, usually ',' in North America, * ';' in Europe and sometimes '\t' for * tab. * @param quoteChar char to use to enclose fields containing a separator, usually '\"' * @param commentChar char to use to introduce comments. Use (char) 0 if none. Only one character allowed. * @param encoding encoding of input and output file. * * @throws IOException if problems reading/writing file */ private void pass2( final File fileBeingProcessed, final char separatorChar, final char quoteChar, final char commentChar, final Charset encoding ) throws IOException { final String commentChars = ( commentChar == 0 ) ? "" : String.valueOf( commentChar ); // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, allowMultipleLineFields // on second pass, consider comments. // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, allowMultipleLineFields final CSVReader r = new CSVReader( EIO.getBufferedReader( fileBeingProcessed, 64 * 1024, encoding ), separatorChar, quoteChar, commentChars, false, true, true, true ); final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed ); // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim final PrintWriter w = EIO.getPrintWriter( tempFile, 32 * 1024, encoding ); // we don't write to a CSVWriter, since it collapses fields. try { while ( true ) { final String[] fields = r.getAllFieldsInLine(); final boolean wasComment = r.wasComment(); sb = new FastCat( fields.length * 4 ); pending = -1; // may have short line. for ( int colIndex = 0; colIndex < fields.length; colIndex++ ) { if ( wasComment && colIndex == fields.length - 1 ) { if ( colIndex == 0 ) { // comment on line by itself, on left sb.append( commentChar ); final String comment = fields[ 0 ]; if ( !( comment.length() > 0 && comment.charAt( 0 ) == commentChar ) ) { // keep ## together, else separate comment body by space. sb.append( ' ' ); sb.append( comment ); } else { sb.append( rebuildLabelComment( comment ) ); } } else { // tail comment, put out past end of all fields, all aligned. sb.append( ST.spaces( alignedLineWidth - sb.length() + 1 ) ); sb.append( commentChar ); sb.append( ' ' ); sb.append( fields[ colIndex ] ); } } else { final ColumnDescriptor col = colDescriptors[ colIndex ]; String field = fields[ colIndex ]; if ( col.isNumeric ) { // right justify, can't be awkward // spaces, field, comma, space rightJustify( field, col.maxWidth ); } else { // left justify // field, comma, space, spaces if ( col.isAwkward ) { // also handles loneEmptyField field = prepareAwkward( field, quoteChar ); } leftJustify( field, col.maxWidth ); } } } // end for w.write( ST.trimTrailing( sb.toString() ) ); w.write( lineSeparator ); } // end while } // end try catch ( EOFException e ) { out.println( r.lineCount() + " lines aligned." ); r.close(); // swap tempFile output and input now that output is safely created. w.close(); HunkIO.deleteAndRename( tempFile, fileBeingProcessed ); } } /** * comment has two lead comment chars, one stripped off, and are used to label the fields. * Reshape it so it labels the new fields. * * @param comment field names separated by commas, with lead comment char. * * @return reordered comment */ String rebuildLabelComment( String comment ) { final String[] fieldnames = SPLIT_ON_COMMA.split( comment.substring( 1 ) ); // strip second # final FastCat sb2 = new FastCat( fieldnames.length * 3 + 2 ); sb2.append( comment.charAt( 0 ) ); // rebuild just second # sb2.append( ' ' ); for ( int i = 0; i < fieldnames.length; i++ ) { if ( i >= colDescriptors.length ) { sb2.append( fieldnames[ i ].trim() ); } else { final ColumnDescriptor cd = colDescriptors[ i ]; int padding = cd.startCol - sb2.length() - 1 /* account for lead # applied later */; if ( padding > 0 ) { sb2.append( ST.spaces( padding ) ); padding = 0; } if ( cd.isNumeric ) { final int width = cd.maxWidth + padding; /* possibly negative padding */ sb2.append( ST.leftPad( fieldnames[ i ].trim(), width, false ) ); } else { sb2.append( fieldnames[ i ].trim() ); } } sb2.append( ", " ); } return sb2.toString().substring( 0, sb2.length() - 2 ); // chop off final , } /** * append a field to the wholeLine right justified * * @param field field to add, with awkward encoding in place if any. * @param columnWidth width of the column */ private void rightJustify( String field, int columnWidth ) { if ( pending >= 0 ) { sb.append( ',' ); sb.append( ST.spaces( pending ) ); } int spaces = columnWidth - field.length(); sb.append( ST.spaces( spaces ) ); sb.append( field ); pending = 1;/* comma plus space, if there is a subsequent field */ } /** * Simple command line interface to CSVAlign one file whose name is on the command line. Must have * extension .csv
Use java com.mindprod.CSVAlign somefile.csv * * @param args name of csv file to align in columns. */ public static void main( String[] args ) { if ( !( 1 <= args.length && args.length <= 2 ) ) { throw new IllegalArgumentException( USAGE ); } boolean verbose = false; String filename = ""; for ( String arg : args ) { if ( arg.equals( "-v" ) ) { verbose = true; } else { filename = arg; } } if ( !filename.endsWith( ".csv" ) ) { throw new IllegalArgumentException( "Bad extension.\n" + USAGE ); } final File file = new File( filename ); try { // file , separatorChar, quoteChar, commentChar , verbose new CSVAlign( file, ',', '\"', '#', verbose, CSV.UTF8 ); } catch ( IOException e ) { err.println(); e.printStackTrace( err ); err.println( "CSVAlign failed to align " + EIO.getCanOrAbsPath( file ) ); err.println(); } } // end main } /** * Facts about a single column in the CSV file, used internally by CSVAlign *

* created with Intellij Idea * * @author Roedy Green, Canadian Mind Products */ final class ColumnDescriptor { /** * true if one or more fields in this column require surrounding quotes. On output, all fields in an awkward column * will get surrounding quotes. If a field contains , or quote it needs the surrounding quotes. */ public boolean isAwkward = false; /** * if true all fields contain only digits dot, plus and minus. Assume column is numeric until proven otherwise by * finding a non-numeric char. */ public boolean isNumeric = true; /** * widest field in the column. Not including comma or space. If a column is "awkward" (needs surrounding quotes), * then the * length includes the quotes and any internal quoting. */ public int maxWidth = 0; /** * 0-based char column where this column starts */ public int startCol = 0; }