/* * [QuoteFlock.java] * * Summary: categories of quotation collections, file of quotations that are combined into aggregate categories. * * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.8 2009-02-06 include go package in ZIP bundle. * 1.9 2012-09-22 put minQuotes into an external file and make it self-maintaining. * 2.0 2014-02-12 add featured yyyy-mm-dd support */ package com.mindprod.htmlmacros.support; import com.mindprod.common18.BigDate; import com.mindprod.common18.Build; import com.mindprod.common18.EIO; import com.mindprod.common18.PrintSorted; import com.mindprod.common18.ST; import com.mindprod.csv.CSVReader; import com.mindprod.csv.CSVWriter; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.fastcat.FastCat; import com.mindprod.htmlmacros.macro.Global; import com.mindprod.hunkio.HunkIO; import java.io.DataOutputStream; import java.io.EOFException; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import static com.mindprod.htmlmacros.macro.Global.configuration; import static java.lang.System.*; /** * categories of quotation collections, file of quotations that are combined into aggregate categories. *
* Here we extract quotes from files in \quotes directory and put them into one big random access * file allquotes.bin indexed by offset. We do not create files for each aggregate. * Aggregates are made up of flocks which in turn are made of up individual quotations. * * @author Roedy Green, Canadian Mind Products * @version 2.0 2014-02-12 add featured yyyy-mm-dd support * @see com.mindprod.htmlmacros.SortQuotes * @since 2009 */ public enum QuoteFlock { // A D D I N G A N E W F L O C K // 1. use e:\env\tquotes.btm frequency xxxx to pull quotes to top containing keyword. // 2. create a new quote file E:\mindprod\quote\xxxx.html // 3. add new enum to QuoteFlock. // 4. add new Flock to various QuoteAggregates. // 5. cyclea // 6. assign a minquotes value to xxxx in E:/mindprod/quote/flocks.csv. Adjust other minvalues. // 7. use e:\env\tquotes.btm plain to put quotes in alpha order. // // These are the basic categories/flocks. They are combined in various ways into aggregate categories. // These correspond to files E:\mindprod\quote\animalrights.html etc. // You will want to add your new flock to some of the Flock aggregates at QuoteAggregate // and then assign aggregates to footer pages at getQuoteAggregateForFooter // Also add a non-zero starter minQuotes entry in embellishment/flock.csv // Add an embellishment/fastcats.csv entry for the new quote file. 
// add link in ethics/quote.html // descriptions used on console and generated QuoteHead ABORTION( "Abortion Quotes" ), ABORTIONBYROEDY( "Abortion Quotes by Roedy" ), ANIMALRIGHTS( "Animal Rights Quotes" ), ANIMALRIGHTSBYROEDY( "Animal Rights Quotes by Roedy" ), BIBLE( "Bible Quotes" ), CANADA( "Canadian Politics Quotes" ), CANADABYROEDY( "Canadian Politics Quotes by Roedy" ), CHRISTIAN( "Christian Quotes" ), CHRISTMAS( "Christmas Quotes" ), CREATIONISM( "Creationism Quotes" ), CREATIONISMBYROEDY( "Creationism Quotes by Roedy" ), ENVIRONMENT( "Environmental Quotes" ), ENVIRONMENTBYROEDY( "Environmental Quotes by Roedy" ), EUTHANASIA( "Euthanasia Quotes" ), EUTHANASIABYROEDY( "Euthanasia Quotes by Roedy" ), EVOLUTION( "Evolution Quotes" ), EVOLUTIONBYROEDY( "Evolution Quotes by Roedy" ), GANDHI( "Gandhi Quotes" ), GAYMARRIAGE( "Gay Marriage Quotes" ), GUNS( "Gun Quotes" ), HARPER( "Stephen Harper Quotes" ), HEALTH( "Health Care Quotes" ), HOMOPHOBIA( "Homophobia Quotes" ), ISLAM( "Islam Quotes" ), ISRAEL( "Israel Quotes" ), ISRAELBYROEDY( "Israel Quotes by Roedy" ), JESUS( "Quotes about Jesus" ), KLEIN( "Naomi Klein Quotes" ), LIVINGLOVE( "Living Love Quotes" ), MISC( "Miscellaneous Quotes" ), MISCBYROEDY( "Miscellaneous Quotes by Roedy" ), MONEY( "Money Quotes" ), MONEYBYROEDY( "Money Quotes by Roedy" ), NAPOLEON( "Napoléon Bonaparte Quotes" ), PATRIOTISM( "Patriotism Quotes" ), PATRIOTISMBYROEDY( "Patriotism Quotes by Roedy" ), POLITICS( "Politics Quotes" ), POLITICSBYROEDY( "Politics Quotes by Roedy" ), PROGRAMMING( "Computer Programming Quotes" ), PROGRAMMINGBYROEDY( "Computer Programming Quotes by Roedy" ), RELIGION( "Religion Quotes" ), RELIGIONBYROEDY( "Religion Quotes by Roedy" ), SEPARATIONOFCHURCHANDSTATE( "Separation of church and state" ), SEXUALITY( "Sexuality and Homosexuality Quotes" ), SEXUALITYBYROEDY( "Sexuality and Homosexuality Quotes by Roedy" ), TRUMP( "Quotes by and about Donald Trump" ), WAR( "War Quotes" ), WARBYROEDY( "War Quotes by Roedy" 
); /** * average quotation size. Smaller is more conservative, reserves more slots */ private static final int AVERAGE_QUOTE_LENGTH = 490; /** * request extra debug out put */ private static final boolean DEBUGGING = false; /** * amount of head/tail padding in the quote source files in bytes. */ private static final int COMBINED_HEAD_TAIL_PADDING = 8227; /** * How much of the start of the quote to compare looking for dups. */ private static final int LEAD_LENGTH = 200; /** * name of file where we cacheFIle quotes from all flocks. */ private static final String ALL_QUOTES_BIN = "allquotes.bin"; /** * where the quote flock size definitions are kept. */ private static final String FLOCKS_CSV = "quote/flocks.csv"; /** * root directory of the local websites */ private static final File root = new File( configuration.getLocalWebrootWithSlashes() ); /** * extract !-- composed yyyy-mm-dd o partial dates. static not visible in constructor */ private static final Pattern FIND_ADDED = Pattern.compile( "(?i) */ private static final Pattern FIND_CSSCLASS = Pattern.compile( "(?i)0 ) { final String missingTotalString = ST.rightJustified( missingTotal, 6, true ); final String quoteTotalString = ST.rightJustified( quoteTotal, 6, true ); final String percentString = ST.leftPad( DF1.format( missingTotal * 100 / quoteTotal ), 6, true ); ps.add( missingTotalString, missingTotalString + " " + quoteTotalString + " " + percentString + "% total for all flocks" ); pout.println( ST.spaceOut( "Missing<><>Warning: missing css class on
<><>Warning: invalid date in " + this.getFile() + "\n" + quotation ); } } else { err.println( "\n<><><>Warning: missing in " + this.getFile() + "\n" + quotation ); } final Matcher birthdateMatcher = FIND_BIRTHDATE.matcher( quotation ); if ( birthdateMatcher.find() && BigDate.isAnniversary( new BigDate( birthdateMatcher.group( 1 ) ), Global.TODAY ) ) { weight += 16; } final Matcher deathdateMatcher = FIND_DEATHDATE.matcher( quotation ); if ( deathdateMatcher.find() ) { final BigDate deathDate = new BigDate( deathdateMatcher.group( 1 ) /* yyyy-mm-dd*/ ); if ( deathDate.equals( Global.TODAY ) ) { // died today weight += 40; } else if ( deathDate.equals( Global.YESTERDAY ) ) { // died yesterday weight += 40; } else if ( deathDate.equals( Global.DAY_BEFORE_YESTERDAY ) ) { // died day before yesterday weight += 30; } else if ( BigDate.isAnniversary( deathDate, Global.TODAY ) ) { weight += 8; } } final Matcher weightMatcher = FIND_WEIGHT.matcher( quotation ); if ( weightMatcher.find() ) { weight += Math.max( 0, Math.min( Integer.parseInt( weightMatcher.group( 1 ) ), 20 ) ); } final Matcher featuredMatcher = FIND_FEATURED.matcher( quotation ); if ( featuredMatcher.find() ) { final BigDate featuredDate = new BigDate( featuredMatcher.group( 1 ) /* yyyy-mm-dd*/ ); if ( featuredDate.equals( Global.TODAY ) ) { weight = 0; // avoid reusing if on home page featured today. } else { weight += 4; // it was once featured, give it extra weight. } } return weight; } /** * fetch the quoteInFlockIndex-th quotation for this category off disk. * Opens and closes the file each time. Macros not yet expanded. * * @param quoteInFlockIndex which quotation in the set you want. * * @return fetch the indexed quotation of disk in the cacheFIle. 
*/ private String fetchQuotation( int quoteInFlockIndex ) { try { assert allQuotesRaf != null : ALL_QUOTES_BIN + " random access file not open"; assert 0 <= quoteInFlockIndex && quoteInFlockIndex < offsets.length : "Programing bug: out of bounds " + "quotation number"; // R E A D allQuotesRaf.seek( offsets[ quoteInFlockIndex ]/* byte offsets in file */ ); // readUTF will figure out how long the quote is. return allQuotesRaf.readUTF(); } catch ( IOException e ) { err.println(); e.printStackTrace( err ); err.println( "Serious program bug. Unable to fetch a quotation from " + this.toString().toLowerCase() + " disk cacheFIle. quoteInFlockIndex:" + quoteInFlockIndex + " offset:" + offsets[ quoteInFlockIndex ] ); err.println(); System.exit( 2 ); } // can't get here return ""; } /** * read all quotes for flock during initialisation. * Quotes are presorted with the SortQuotes utility. * They may be stripgenerated or expanded. It is better if they are stripgenerated * so that the temporary library of quotes in RAM will be more compact. * * @return array of Strings including, and optional macro expansions., * @throws java.io.IOException if cannot read quotes */ private String[] getAllQuotesInFlock() throws IOException { if ( FAKE ) { // we provide a stripped down quotation list so faster for debugging return new String[] { "".length(); String candidate = allQuotes.substring( start1, end2 ).trim(); // replace = 0 ) { candidate = candidate.substring( 0, said ) + "class=\"quoth\"" + candidate.substring( said + "class=\"said\"".length() ); } // will be automatically deduped as a side effect of put. final String[] lines = candidate.split( "\\n\\s*" ); final FastCat sb = new FastCat( lines.length ); for ( String line : lines ) { sb.append( line.trim() );// trim lead and trail blanks } // remove the final \n candidate = sb.toSeparatedList( '\n' ); // treats space and \n as identical in case formatted differently. 
String lead = DeEntifyStrings.flattenHTML( candidate, ' ' ); lead = lead.substring( 0, Math.min( lead.length(), LEAD_LENGTH ) ); lead = lead.toLowerCase(); lead = lead.replace( '\n', ' ' ); lead = lead.trim(); final String prev = a.put( lead, candidate ); if ( prev != null ) { err.println( "Warning possible duplicate " + quoteFile + " quotation ignored :\n" + candidate + "\n\n" ); } else { if ( !candidate.contains( "Bohemian Rhapsody is considered the best rock\n" + "song of all time. Freddy Mercury put enough variety and complexity into it so that it\n" + "could stand up, like Bach, to repeated hearings.\n" + "\n" }; } final File quoteFile = this.getFile(); final int estQuotesInFlock = Math.max( 100, ( ( ( int ) quoteFile.length() ) - COMBINED_HEAD_TAIL_PADDING ) / AVERAGE_QUOTE_LENGTH ); // look up lead to get quote. Used to dedup. final LinkedHashMap
\n" + "~ Roedya = new LinkedHashMap<>( estQuotesInFlock * 130 / 100 ); // temporarily hold entire file of quotes in RAM and HashMap of leads and quotes as substrings of it. final String allQuotes = HunkIO.readEntireFile( quoteFile ); // we can't expandNoRef macros here, since we don't know the where they will eventually end up. int place = 0; // how many quotes in this flock are missing headers. // we read the entire flock into ram, scan for quotes, dedup them, put them into an array. // write them to disk in counted binary format. We don't modify the original flock file. // That is done by SortQuotes after extensive changes to the quote files. while ( true ) { // quote is inside sandwich ...int start1 = allQuotes.indexOf( "", start2 ); if ( end1 < 0 ) { throw new IllegalArgumentException( "unbalanced" ); } final int end2 = end1 + "return a.values().toArray( new String[ a.size() ] ); } /** * save set of quotations for just this flock on disk, creating an numeric index to the offsets where they are. * Used during initialisation. * * @param allQuotesInFlock temporary ram-based array of all quotations in the category. * * @throws IOException if cannot save quotes to disk */ private void saveAllQuotesInFlock( String[] allQuotesInFlock ) throws IOException { // temporary ArrayList to build list of offsets to quotations on disk. final ArrayList
buildOffsets = new ArrayList<>( allQuotesInFlock.length * 2 ); // We are writing these out in original (usually alphabetically sorted) order /* total length of all quotes in this flock, used to compute average quote length. */ int totalQuoteLength = 0; for ( String quotation : allQuotesInFlock ) { // track where on disk we wrote this quotation final int weight = calcWeight( quotation ); final int offset = allQuotesDos.size(); // This is efficient. It does not imply a flush. for ( int i = 0; i < weight; i++ ) { // insert one duplicate entry for each weight point. Quote itself is not duplicated.. buildOffsets.add( offset ); } totalQuoteLength += quotation.length(); // write each quotation in its own counted string. // we track where each starts allQuotesDos.writeUTF( quotation ); // only one copy. } if ( this.ordinal() == QuoteFlock.values().length - 1 ) { // close the file on the last enum constant allQuotesDos.close(); } // don't need RAM-hogging Strings any more, caller will soon dispose of them. All we need is the offset[]. if ( DEBUGGING ) { final FastCat sb = new FastCat( 8 ); sb.append( ST.leftPad( Integer.toString( allQuotesInFlock.length ), 4, false ), " quotations, " ); sb.append( ST.leftPad( Integer.toString( buildOffsets.size() ), 4, false ), " weighted possibilities, " ); sb.append( ST.leftPad( Integer.toString( totalQuoteLength / allQuotesInFlock.length ), 4, false ), " average length for " ); sb.append( this.getDescription() ); out.println( sb ); } if ( allQuotesInFlock.length < minQuotes ) { err.println( ">>> Flock " + this.name().toLowerCase() + ".html should have at least " + minQuotes + " quotations. It has only " + allQuotesInFlock.length + ". Collection may be damaged. Adjust mindprod/quote/flocks.csv" ); } else { minQuotes = allQuotesInFlock.length; /* record new count */ } // save list in compact array form. 
offsets = new int[ buildOffsets.size() ];
        // Don't use toArray because we want int[], not Integer[]
        int i = 0;
        for ( Integer offset : buildOffsets )
        {
            offsets[ i++ ] = offset;
        }
    }

    /**
     * Explicit start-up of the quotation cache.
     * Initialisation lives here, rather than in the constructor or a static initialiser,
     * because of initialisation-order problems; it is meant to be called by Fireup.fireup.
     * <p>
     * Steps, in order:
     * reads the expected minimum quote count of each flock from flocks.csv;
     * exits with status 1 if any flock still has a minQuotes of 0;
     * makes sure the scratch directory exists;
     * writes the quotations of every flock into the allquotes.bin cache file;
     * reports missing headers;
     * rewrites flocks.csv with the current minQuotes values;
     * reopens allquotes.bin read-only for random access.
     * <p>
     * Any IOException, an empty flock, or an unreadable flock file terminates the JVM with status 2.
     */
    public static void fireup()
    {
        try
        {
            // get minimum number of quotes to expect in each flock
            final File flockCSVFile = new File( root, FLOCKS_CSV );
            // only 600 bytes
            final CSVReader r = new CSVReader( EIO.getBufferedReader( flockCSVFile, 1024, EIO.UTF8 ) );
            try
            {
                // the loop only ends by exception: EOFException is the normal way out.
                while ( true )
                {
                    try
                    {
                        final String name = r.get();
                        if ( name == null )
                        {
                            throw new IOException( "null flock name in flocks.csv file" );
                        }
                        final QuoteFlock qf = QuoteFlock.valueOf( name );
                        qf.minQuotes = r.getInt();
                        r.skipToNextLine();
                    }
                    catch ( IllegalArgumentException e )
                    {
                        // e.g. a name in the csv file that is not a QuoteFlock constant: report it and keep reading.
                        err.println( e );
                    }
                }
            }
            catch ( EOFException endOfFile )
            {
                // normal termination, all rows consumed. The reader is closed in the finally clause.
            }
            finally
            {
                // close on every path, not just on EOF, so a failed read does not leak the reader.
                r.close();
            }
            // ensure all flocks have a minQuotes value.
            for ( QuoteFlock qf : QuoteFlock.values() )
            {
                if ( qf.minQuotes == 0 )
                {
                    err.println( qf.name() + " not assigned a minQuotes value in mindprod/quote/flocks.csv" );
                    System.exit( 1 );
                }
            }
            // On first use, we must create and open the file
            // Make sure directory for cacheFile file pre-exists.
            final File dir = new File( configuration.getScratchAreaWithBackslashes() );
            if ( !dir.exists() )
            {
                // mkdirs rather than mkdir, so that missing parent directories are created too.
                //noinspection ResultOfMethodCallIgnored
                dir.mkdirs();
            }
            // O P E N
            // e.g. E:\temp\allquotes.bin
            final File allQuotesFile = new File( configuration.getScratchAreaWithBackslashes(), ALL_QUOTES_BIN );
            allQuotesDos = EIO.getDataOutputStream( allQuotesFile, 32 * 1024 );
            // basic files of quotes, not aggregates
            for ( QuoteFlock flock : QuoteFlock.values() )
            {
                try
                {
                    final String[] quotationsForFlock = flock.getAllQuotesInFlock();
                    flock.quoteCount = quotationsForFlock.length;
                    if ( quotationsForFlock.length <= 0 )
                    {
                        err.println( "Serious error: empty quotation, probably corrupt, "
                                     + "quotations file "
                                     + flock.name() );
                        System.exit( 2 );
                    }
                    flock.saveAllQuotesInFlock( quotationsForFlock );
                }
                catch ( IOException e )
                {
                    err.println( "Fatal error. Unable read "
                                 + flock.toString()
                                 + " quotations file. "
                                 + "Possibly in use by editor." );
                    System.exit( 2 );
                }
            }
            allQuotesDos.close();
            // show missing headers for all flocks
            ReportMissingHeaders();
            // save new minQuotes
            final CSVWriter w = new CSVWriter( EIO.getPrintWriter( flockCSVFile, 1024, EIO.UTF8 ) );
            try
            {
                for ( QuoteFlock qf : QuoteFlock.values() )
                {
                    w.put( qf.name() );
                    w.put( qf.minQuotes );
                    w.nl();
                }
            }
            finally
            {
                // close even if writing a row fails, so the csv file handle is not leaked.
                w.close();
            }
            // O P E N
            allQuotesRaf = new RandomAccessFile( new File( configuration.getScratchAreaWithBackslashes(),
                    ALL_QUOTES_BIN ), "r" );
            // arrange to close and delete the allquotes file on exit.
        }
        catch ( IOException e )
        {
            err.println( e.getMessage() );
            err.println( "Serious program bug. Unable to read and save a quotation to allquotes.bin" );
            System.exit( 2 );
        }
    }

    /**
     * Close the random access view of the allquotes.bin cache file and delete the file from the scratch area.
     * Safe to call even if fireup never managed to open the file.
     * The delete is attempted even when the close fails.
     */
    public static void shutdown()
    {
        try
        {
            // allQuotesRaf is only assigned at the very end of fireup, so it may still be null here.
            if ( allQuotesRaf != null )
            {
                allQuotesRaf.close();
            }
        }
        catch ( IOException e )
        {
            err.println( "unable to close " + ALL_QUOTES_BIN + " file." );
        }
        finally
        {
            //noinspection ResultOfMethodCallIgnored
            new File( configuration.getScratchAreaWithBackslashes(), ALL_QUOTES_BIN ).delete();
        }
    }

    /**
     * get description of flock, used to regenerate QuoteFlockHead and to label flocks on the console.
     *
     * @return description
     */
    public String getDescription()
    {
        return description;
    }

    /**
     * get file where quotes for that flock are stored.
     *
     * @return the file quote/xxxx.html under the local web root, where xxxx is the lower-case enum name.
     */
    public File getFile()
    {
        // Locale.ROOT keeps the file name stable whatever the default locale;
        // e.g. a Turkish default locale would otherwise lower-case 'I' to a dotless i.
        return new File( root, "quote/" + name().toLowerCase( java.util.Locale.ROOT ) + ".html" );
    }

    /**
     * Get a random quotation for a file, macros not yet expanded.
     *
     * @param changeIntervalInMillis how often to change the ad or quotation.
     * @param fileBeingProcessed     the file currently being processed.
     *
     * @return the text of the selected quotation, exactly as stored in the allquotes.bin cache file.
     */
    @SuppressWarnings( { "SameParameterValue" } )
    public String getRandomQuotationForFile( long changeIntervalInMillis, File fileBeingProcessed )
    {
        assert allQuotesRaf != null : ALL_QUOTES_BIN + " random access file not open";
        final int hash = Randomiser.getHashForFilename( changeIntervalInMillis, fileBeingProcessed );
        // returns 0..offsets.length-1
        // no weights here. Weighting handled by dup quotes.
        final int quoteInFlockIndex = Randomiser.getRandomSelectorIndexForHash( hash, this.offsets.length );
        return this.fetchQuotation( quoteInFlockIndex );
    }
}