/* * [DeDupQuotes.java] * * Summary: Get rid of duplicates within a single unsorted quotation file. Works best with StripGenerated files. * * Copyright: (c) 2015-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2015-12-31 initial version */ package com.mindprod.htmlmacros; import com.mindprod.common18.FNV1a64; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.fastcat.FastCat; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.regex.Pattern; import static java.lang.System.*; /** * Get rid of duplicates within a single unsorted quotation file. Works best with StripGenerated files. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2015-12-31 initial version * @since 2015-12-31 */ public class DeDupQuotes { /** * what we look for surrounding text of interest */ private static final String CLOSE_TAG = ""; /** * what we look for surrounding text of interest */ private static final String OPEN_TAG = " extantQuotes = new HashSet<>( 10000 ); private static final Pattern SPLIT_ON_SPACE = Pattern.compile( "\\s+" ); /** * accumulate new version of file */ private static final FastCat sb = new FastCat( 10_000 ); /** * original contents of from file */ private static String big; /** * file we are deduping */ private static File file; /** * dedup one file. */ private static void deDupOneFile() throws IOException { int dupCount = 0; int quoteCount = 0; int start0 = 0; int start1; while ( ( start1 = big.indexOf( OPEN_TAG, start0 ) ) >= 0 ) { final int start2 = start1 + OPEN_TAG.length(); final int end1 = big.indexOf( CLOSE_TAG, start2 ); if ( end1 < 0 ) { throw new IllegalArgumentException( ">>>missing " ); } final int end2 = end1 + CLOSE_TAG.length(); final String rawBlockQuote = big.substring( start1, end2 ); final String plainBlockQuote = DeEntifyStrings.flattenHTML( rawBlockQuote, ' ' ); // process text
// break into words final String[] words = SPLIT_ON_SPACE.split( plainBlockQuote ); long hash = FNV1a64.computeHash( words ); boolean unique = extantQuotes.add( hash ); if ( unique ) { // keep quote quoteCount++; sb.append( rawBlockQuote ); } else { // delete dupCount++; out.println( "Deleting Duplicate:" ); out.println( rawBlockQuote ); out.println(); } start0 = end2; } // end while out.println( "File " + file + " now contains " + quoteCount + " quotations. Deleted " + dupCount + " duplicates." ); } /** * handle the footer for the from file */ private static void handlePostlude( final boolean dry ) throws IOException { final int p = big.lastIndexOf( "