/*
 * [CollectFamousBorns.java]
 *
 * Summary: Find Author/Birth/Death in near said or famous people.
 *
 * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2011-10-18 initial version
 */
package com.mindprod.repair;

import com.mindprod.entities.DeEntifyStrings;

import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Scans the text of a page for "born" entries attached to said/famous quotations and records
 * the author together with the birth date and, when present, the death date.
 *
 * Does not change the html files.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2011-10-18 initial version
 * @since 2011-10-18
 */
class CollectFamousBorns {

    /**
     * Compile-time switch; when true, every regex match and every newly added author is echoed
     * to System.out.
     */
    private static final boolean DEBUGGING = false;

    /**
     * Pattern for a quotation whose born entry carries two dates (birth and death).
     * The original note says quoth-generated quotations are to be ignored.
     *
     * NOTE(review): as written here the regex has a single capturing group, while
     * collectBornsForPage reads groups 2, 3 and 4 from its matcher. The markup portion of the
     * expression looks like it is missing from this copy -- confirm against the authoritative source.
     */
    private static final Pattern FAMOUS_DEAD_PATTERN =
            Pattern.compile( "([^<]+)\\s*", Pattern.DOTALL );

    /**
     * Pattern for a quotation whose born entry carries one date (birth only).
     * The original note says quoth-generated quotations are to be ignored.
     *
     * NOTE(review): as written here the regex has a single capturing group, while
     * collectBornsForPage reads groups 2 and 3 from its matcher. The markup portion of the
     * expression looks like it is missing from this copy -- confirm against the authoritative source.
     */
    private static final Pattern FAMOUS_LIVING_PATTERN =
            Pattern.compile( "([^<]+)\\s*", Pattern.DOTALL );

    /**
     * Delimiters used to break a multi-author string into individual names: the words
     * "and", "with" or "aka" surrounded by whitespace, a comma, an ampersand, or a square bracket.
     */
    private static final Pattern MULTI_AUTHOR_SPLIT =
            Pattern.compile( "\\s+and\\s+|\\s+with\\s+|\\s+aka\\s+|,|&|\\[|\\]" );

    /**
     * Find all famous or said borns on a page and hand each one to processOneBornMacro.
     * Single-date (living) entries are processed first, then two-date (dead) entries.
     *
     * @param big                the whole page
     * @param fileBeingProcessed file we are processing
     */
    static void collectBornsForPage( final String big, final File fileBeingProcessed ) {
        // first pass: entries with only a birth date; the death date is passed as ""
        final Matcher livingMatcher = FAMOUS_LIVING_PATTERN.matcher( big );
        while ( livingMatcher.find() ) {
            if ( DEBUGGING ) {
                dumpGroups( ">>living group: ", livingMatcher );
            }
            final String authors = livingMatcher.group( 2 );
            final String born = livingMatcher.group( 3 );
            processOneBornMacro( authors, born, "", fileBeingProcessed );
        }

        // second pass: entries with both a birth date and a death date
        final Matcher deadMatcher = FAMOUS_DEAD_PATTERN.matcher( big );
        while ( deadMatcher.find() ) {
            if ( DEBUGGING ) {
                dumpGroups( ">>dead group: ", deadMatcher );
            }
            final String authors = deadMatcher.group( 2 );
            final String born = deadMatcher.group( 3 );
            final String died = deadMatcher.group( 4 );
            processOneBornMacro( authors, born, died, fileBeingProcessed );
        }
    }

    /**
     * Print every group of the current match, group 0 included, one per line.
     *
     * @param prefix text printed ahead of the group number
     * @param m      matcher positioned on a successful match
     */
    private static void dumpGroups( final String prefix, final Matcher m ) {
        final int last = m.groupCount();
        int i = 0;
        while ( i <= last ) {
            out.println( prefix + i + " : " + m.group( i ) );
            i++;
        }
    }

    /**
     * Process one author string with its birth/death dates.
     *
     * HTML tags are stripped from the author string, it is split on MULTI_AUTHOR_SPLIT, and only
     * the first piece (trimmed) is kept. The de-entified form of that name is the key used with
     * AuthorBio.get / AuthorBio.put. An existing entry is replaced only when the birth dates agree,
     * the stored death date is "" and a non-empty death date is supplied now; any other
     * disagreement in dates is reported through AuthorWarning.warn.
     *
     * @param allAuthors         list of authors (all but the first are ignored)
     * @param birthdate          birth date text (documented as unknown or yyyy-mm-dd)
     * @param deathdate          "" or death date text (documented as yyyy-mm-dd)
     * @param fileBeingProcessed file we are processing
     */
    private static void processOneBornMacro( String allAuthors,
                                             final String birthdate,
                                             final String deathdate,
                                             final File fileBeingProcessed ) {
        final String tagless = DeEntifyStrings.stripHTMLTags( allAuthors );
        final String[] names = MULTI_AUTHOR_SPLIT.split( tagless );

        // split can hand back an empty array, e.g. when the text holds nothing but delimiters
        final String author;
        if ( names.length == 0 ) {
            author = "";
        } else {
            author = names[ 0 ].trim();
        }
        final String key = DeEntifyStrings.deEntifyHTML( author, ' ' );

        final AuthorBio previous = AuthorBio.get( key );
        final AuthorBio candidate = new AuthorBio( author, birthdate, deathdate, fileBeingProcessed );

        // the candidate is kept only when it is the first sighting or it adds a death date
        if ( previous == null ) {
            if ( DEBUGGING ) {
                out.println( " adding : " + candidate );
            }
            AuthorBio.put( key, candidate );
            return;
        }

        final boolean suppliesMissingDeath =
                previous.getBirthdate().equals( birthdate )
                && previous.getDeathdate().equals( "" )
                && !deathdate.isEmpty();
        if ( suppliesMissingDeath ) {
            AuthorBio.put( key, candidate ); // update deathdate info
            return;
        }

        if ( !previous.getBirthdate().equals( birthdate )
             || !previous.getDeathdate().equals( deathdate ) ) {
            AuthorWarning.warn( "date inconsistent with file date", previous, candidate );
        }
    }
}