/* * [ExtractElectronicMacros.java] * * Summary: One shot to extract Electronic macros on a page, splitting them out to individual electronic files. * * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2012-02-11 initial version */ package com.mindprod.stores; import com.mindprod.commandline.CommandLine; import com.mindprod.common18.EIO; import com.mindprod.common18.Misc; import com.mindprod.csv.CSVWriter; import com.mindprod.fastcat.FastCat; import com.mindprod.filter.AllButSVNDirectoriesFilter; import com.mindprod.filter.ExtensionListFilter; import com.mindprod.htmlmacros.macro.Global; import com.mindprod.htmlmacros.support.ConfigurationForMindprod; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.util.regex.Pattern; import static java.lang.System.*; /** * One shot to extract Electronic macros on a page, splitting them out to individual electronic files. *

* Do not run on the electronic dir, unless using -dry. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2012-02-11 initial version * @since 2012-02-11 */ public class ExtractElectronicMacros { /** * true if want extra debugging information */ private static final boolean DEBUGGING = false; /** * max number of electronic macros per page to expect */ private static final int MAX_ELECTRONICS_MACROS_PER_PAGE = 100; private static final String USAGE = "\nExtractElectronicMacros.exe Configuration -dry (dry run) -s (subdirs) dir file.html"; /** * split isbns at comma, gets rid of spaces around comma */ private static final Pattern SPLIT_ON_COMMA = Pattern.compile( "\\s*,\\s*" ); /** * where website files are kept */ private static File electronicDir; /** * replace the Book macro with a stripped-down Insert macro * * @param asin amazon number to use in Insert macro * @param title description * @param price UDS with lead $ * * @return generated Insert macro */ private static String buildInsertMacro( final String asin, final String title, final String price ) { final FastCat sb = new FastCat( 7 ); sb.append( "" ); return sb.toString(); } /** * extract one electronic macro as a separate file. * * @param oneMacro text of the macro * @param asin Amazon part number. Not a list * @param title title of the electronic product * @param price USD with $ * * @throws java.io.IOException if problems reading/writing file containing electronic macros. */ private static void createElectronicFile( final String oneMacro, final String asin, final String title, final String price ) throws IOException { // generate separate file to hold just one electronic macro. final FastCat sb = new FastCat( 6 ); sb.append( "\n" ); sb.append( oneMacro ); sb.append( "\n\n" ); final String fileContents = sb.toString(); String electronicFilename = asin + ".html"; // without this line compiler worries might not init File electronicFile = new File( electronicDir, electronicFilename ); // avoid overwriting existing file for ( char letter = 'a'; letter < 'z'; letter++ ) { electronicFile = new File( electronicDir, electronicFilename ); if ( !electronicFile.exists() ) { break; } else { electronicFilename = asin + letter + ".html"; } } HunkIO.writeEntireFile( electronicFile, fileContents, HunkIO.UTF8 ); } /** * log an electronic macro we found. Used to generate table of electronic products to include * * @param w where to log to * @param asin asin of product, not a list * @param title title */ private static void logElectronicFound( final CSVWriter w, final String asin, final String title ) { w.put( asin ); w.put( title ); w.nl(); } /** * find values of parm of the form xxx="..." or xxx={...} * * @param parmName name of param to search for, case-sensitive * @param searchIn text in which to search for parms, one electronic macro. * * @return value of the parm. null if missing, "" if empty. */ private static String parseOneParm( String parmName, String searchIn ) { int start = searchIn.indexOf( parmName + "=" ); if ( start < 0 ) { return null; // indicate no such parm } final int size = searchIn.length(); start += parmName.length() + 1; if ( start >= size ) { throw new IllegalArgumentException( "Truncated parm= : " + parmName ); } final char c = searchIn.charAt( start ); final int end; switch ( c ) { case ' ': start++; if ( start >= size ) { throw new IllegalArgumentException( "Truncated parm= : " + parmName ); } end = searchIn.indexOf( ' ', start ); if ( end == start ) { throw new IllegalArgumentException( "Malformed parm= : " + parmName ); } break; case '\"': start++; if ( start >= size ) { throw new IllegalArgumentException( "Truncated parm= : " + parmName ); } end = searchIn.indexOf( '\"', start ); break; case '{': start++; if ( start >= size ) { throw new IllegalArgumentException( "Truncated parm= : " + parmName ); } end = searchIn.indexOf( '}', start ); break; default: // e.g. birth=1948-02-04 end = searchIn.indexOf( ' ', start ); break; } if ( end < 0 ) { throw new IllegalArgumentException( "Malformed parm= missing terminator : " + parmName ); } return searchIn.substring( start, end ); } /** * @param args names of files to extract electronic macros from, -s etc.. * -dry used when creating data for summary electronics page. Does not replace ith Insert. * * @throws java.io.IOException if trouble reading or writing files containing electronic macros. */ public static void main( String[] args ) throws IOException { Global.installConfiguration( new ConfigurationForMindprod() ); final File webrootDir = new File( Global.configuration.getLocalWebrootWithSlashes() ); electronicDir = new File( webrootDir, "electronic" ); out.println( "Gathering html files to extract..." ); final boolean dry; if ( args.length > 1 && args[ 1 ].equals( "-dry" ) ) { args[ 1 ] = null; dry = true; } else { dry = false; } CommandLine commandLine = new CommandLine( args, new AllButSVNDirectoriesFilter(), new ExtensionListFilter( "html" ) ); if ( commandLine.size() == 0 ) { throw new IllegalArgumentException( "No files found to process\n" + USAGE ); } final CSVWriter w = new CSVWriter( EIO.getPrintWriter( new File( "C:/temp/electronicmacros.csv" ), 4 * 1024, EIO.UTF8 ) ); for ( File sourceFile : commandLine ) { final String big = HunkIO.readEntireFile( sourceFile, HunkIO.UTF8 ); // compose shrunken file, with ", start + "" ); } end += " -->".length(); final String oneMacro = big.substring( start, end ).replaceAll( "\\s+", " " ); try { final String asins = parseOneParm( "asin", oneMacro ); final String title = parseOneParm( "title", oneMacro ); // we do not reflow, // we want all on one line for CSV final String price = parseOneParm( "price", oneMacro ); if ( asins == null || title == null ) { throw new IllegalArgumentException( "missing mandatory parm" ); } final String asin = SPLIT_ON_COMMA.split( asins )[ 0 ]; if ( DEBUGGING && dry ) { out.println( "--- asin={" + asin + "} title={" + title + "} price={" + price + "}" ); } logElectronicFound( w, asin, title ); if ( !dry ) { // create a separate file createElectronicFile( oneMacro, asin, title, price ); // replace electronic macro with an Insert macro // text ahead of