/* * [TableToCSV.java] * * Summary: Extracts rows in HTML tables to CSV form. Extracts data from all tables in the input. Output in xxx.csv. * * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2011-01-23 initial version. * 1.1 2011-01-25 allow you to specify encoding */ package com.mindprod.csv; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.nio.charset.Charset; import static java.lang.System.*; /** * Extracts rows in HTML tables to CSV form. Extracts data from all tables in the input. Output in xxx.csv. *
* Use: java.exe com.mindprod.TableToCSV xxxx.html * It also strips tags and converts entities back to UTF-8 characters. * * @author Roedy Green, Canadian Mind Products * @version 1.1 2011-01-25 allow you to specify encoding * @since 2011-01-23 */ public final class TableToCSV { /** * how to use the command line */ private static final String USAGE = "\nTableToCSV needs the name of an *.html or *.htmlfrag file on the commandline, " + "nothing else. Output will be in xxx.csv."; /** * Constructor to convert an HTML table to CSV. Strips out entities and tags. * * @param file CSV file to be packed to remove excess space and quotes. * @param separatorChar field separator character, usually ',' in North America, * ';' in Europe and sometimes '\t' for * tab for the output file. It is tab for the input file. * Note this is a 'char' not a "string". * @param quoteChar character used to quote fields containing awkward chars. * @param commentChar character to treat as comments. * @param encoding encoding of the input and output file. * * @throws java.io.IOException if problems reading/writing file */ @SuppressWarnings( { "WeakerAccess" } ) public TableToCSV( final File file, final char separatorChar, final char quoteChar, final char commentChar, final Charset encoding ) throws IOException { String outFilename = EIO.getCanOrAbsPath( file ); outFilename = ST.chopTrailingString( outFilename, ".htmlfrag" ); outFilename = ST.chopTrailingString( outFilename, ".html" ); outFilename += ".csv"; final File outFile = new File( outFilename ); // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim final PrintWriter pw = EIO.getPrintWriter( outFile, 32 * 1024, encoding ); final CSVWriter w = new CSVWriter( pw, 0 /* minimal */, separatorChar, quoteChar, commentChar, true ); // read the entire html file into RAM. String big = HunkIO.readEntireFile( file, encoding ); int from = 0; // our parser is forgiving, works even if missing. while ( true ) { // find