/*
* [RepairFamousBorns.java]
*
* Summary: Fill in birth/death dates of authors if they don't have them already.
*
* Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2011-10-20 initial version
*/
package com.mindprod.repair;
import com.mindprod.entities.DeEntifyStrings;
import com.mindprod.fastcat.FastCat;
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Fill in birth/death dates of authors if they don't have them already.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2011-10-20 initial version
* @since 2011-10-20
*/
class RepairFamousBorns
{
    /**
     * Regex intended to find an author's Born macro carrying both a birth and a death date.
     * Quoth-generated quotations are meant to be skipped.
     * <p>
     * NOTE(review): as written this expression has only one capturing group, yet
     * repairDeadBornsForPage reads groups 1 through 4. The literal looks truncated;
     * confirm against the intended markup before relying on it.
     */
    private static final Pattern FAMOUS_DEAD_PATTERN = Pattern.compile(
            "([^<]+)\\s*",
            Pattern.DOTALL
    );

    /**
     * Regex intended to find an author's Born macro carrying a birth date only.
     * Quoth-generated quotations are meant to be skipped.
     * <p>
     * NOTE(review): as written this expression has only one capturing group, yet
     * repairLivingBornsForPage reads groups 1 through 3. The literal looks truncated;
     * confirm against the intended markup before relying on it.
     */
    private static final Pattern FAMOUS_LIVING_PATTERN = Pattern.compile(
            "([^<]+)\\s*",
            Pattern.DOTALL
    );

    /**
     * Separators that divide a multi-author credit into individual names:
     * the words "and", "with" and "aka" surrounded by whitespace, plus comma,
     * ampersand and square brackets.
     */
    private static final Pattern MULTI_AUTHOR_SPLIT = Pattern.compile(
            "\\s+and\\s+|\\s+with\\s+|\\s+aka\\s+|,|&|\\[|\\]" );

    /**
     * Running total of corrections made. Reported elsewhere.
     */
    static int corrected;

    /**
     * Repair every Born macro with both birth and death dates on the page.
     *
     * @param big                the whole page
     * @param fileBeingProcessed file we are processing
     *
     * @return corrected page.
     */
    private static String repairDeadBornsForPage( final String big, final File fileBeingProcessed )
    {
        return rewriteBorns( FAMOUS_DEAD_PATTERN, true, big, fileBeingProcessed );
    }

    /**
     * Repair all Born macros on the page: the birth-date-only pass runs first,
     * then the birth-and-death pass runs over its output.
     *
     * @param big                the whole page
     * @param fileBeingProcessed file we are processing
     *
     * @return corrected page.
     */
    static String repairFamousBornsForPage( final String big, final File fileBeingProcessed )
    {
        final String afterLivingPass = repairLivingBornsForPage( big, fileBeingProcessed );
        return repairDeadBornsForPage( afterLivingPass, fileBeingProcessed );
    }

    /**
     * Repair every Born macro with only a birth date on the page.
     * The death date handed on for each match is the empty string.
     *
     * @param big                the whole page
     * @param fileBeingProcessed file we are processing
     *
     * @return corrected page.
     */
    private static String repairLivingBornsForPage( final String big, final File fileBeingProcessed )
    {
        return rewriteBorns( FAMOUS_LIVING_PATTERN, false, big, fileBeingProcessed );
    }

    /**
     * Replace missing or non-matching dates for one Born macro.
     * <p>
     * The first name obtained by splitting the tag-stripped author text is looked up,
     * in de-entified form, via AuthorBio.get. A new AuthorBio is always constructed for
     * this occurrence. A replacement is produced only when an earlier AuthorBio exists and
     * its birth or death date differs from the ones passed in; in that case the
     * correction counter is incremented and a warning is issued.
     * <p>
     * NOTE(review): the replacement is assembled from the tag-stripped author text
     * between empty literals followed by one space; saidOrFamous and the previously
     * recorded dates are not used. The literals look like they lost their markup;
     * confirm the intended output.
     *
     * @param saidOrFamous       string "said" or "famous" which type of author
     * @param allAuthors         author credit, possibly naming several authors
     * @param birthdate          unknown or yyyy-mm-dd
     * @param deathdate          "" or yyyy-mm-dd
     * @param fileBeingProcessed file we are processing
     *
     * @return replacement string or null if no change needed.
     */
    private static String repairOneBornMacro( String saidOrFamous,
                                              String allAuthors,
                                              final String birthdate,
                                              final String deathdate,
                                              final File fileBeingProcessed )
    {
        final String credit = DeEntifyStrings.stripHTMLTags( allAuthors );
        final String[] names = MULTI_AUTHOR_SPLIT.split( credit );
        final String author;
        if ( names.length == 0 )
        {
            author = "";
        }
        else
        {
            author = names[ 0 ].trim();
        }
        final String authorFlat = DeEntifyStrings.deEntifyHTML( author, ' ' );
        // look up the earlier record before constructing the new one
        final AuthorBio old = AuthorBio.get( authorFlat );
        // original author's note: this AuthorBio object is not kept unless it is the first for this author.
        final AuthorBio recent = new AuthorBio( author, birthdate, deathdate, fileBeingProcessed );
        if ( old == null )
        {
            // nothing recorded earlier to compare against
            return null;
        }
        final boolean datesAgree =
                old.getBirthdate().equals( birthdate ) && old.getDeathdate().equals( deathdate );
        if ( datesAgree )
        {
            return null;
        }
        corrected++;
        AuthorWarning.warn( "inconsistent birth/death dates. Attempting correction.", old, recent );
        final FastCat cat = new FastCat( 10 );
        cat.append( "" );
        cat.append( credit );
        cat.append( "" );
        cat.append( " " ); // leave out checked. It is not trusted now.
        return cat.toString();
    }

    /**
     * Shared find-and-replace loop behind the living and dead passes.
     * Matches for which repairOneBornMacro returns null are left untouched.
     *
     * @param bornPattern        pattern locating the Born macros to examine
     * @param hasDeathdate       true to read the death date from group 4, false to pass ""
     * @param big                the whole page
     * @param fileBeingProcessed file we are processing
     *
     * @return corrected page.
     */
    private static String rewriteBorns( final Pattern bornPattern,
                                        final boolean hasDeathdate,
                                        final String big,
                                        final File fileBeingProcessed )
    {
        // Must stay a StringBuffer: on the JDK 1.8 baseline Matcher.appendReplacement
        // and appendTail accept nothing else.
        final StringBuffer page = new StringBuffer( big.length() * 2 );
        final Matcher hit = bornPattern.matcher( big );
        while ( hit.find() )
        {
            // groups are read in ascending order, the same order the call arguments were evaluated in
            final String saidOrFamous = hit.group( 1 );
            final String authors = hit.group( 2 );
            final String birthdate = hit.group( 3 );
            final String deathdate = hasDeathdate ? hit.group( 4 ) : "";
            final String replacement =
                    repairOneBornMacro( saidOrFamous, authors, birthdate, deathdate, fileBeingProcessed );
            if ( replacement == null )
            {
                // no change needed; the matched text is copied through by a later append
                continue;
            }
            hit.appendReplacement( page, Matcher.quoteReplacement( replacement ) );
        }
        hit.appendTail( page );
        return page.toString();
    }
    // no main
}