/* * [RepairFamousBorns.java] * * Summary: Fill in birth/death dates of authors if they don't have them already. * * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2011-10-20 initial version */ package com.mindprod.repair; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.fastcat.FastCat; import java.io.File; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Fill in birth/death dates of authors if they don't have them already. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2011-10-20 initial version * @since 2011-10-20 */ class RepairFamousBorns { /** * pattern of quote with Born with two dates. Ignore quoth generated quotations. */ private static final Pattern FAMOUS_DEAD_PATTERN = Pattern.compile( "([^<]+)\\s*", Pattern.DOTALL ); /** * pattern of quote with Born with one date. Ignore quoth generated quotations. . */ private static final Pattern FAMOUS_LIVING_PATTERN = Pattern.compile( "([^<]+)\\s*", Pattern.DOTALL ); /** * pattern of quote already handled, used to extract author */ private static final Pattern MULTI_AUTHOR_SPLIT = Pattern.compile( "\\s+and\\s+|\\s+with\\s+|\\s+aka\\s+|,|&|\\[|\\]" ); /** * total count of corrections. Reported elsewhere */ static int corrected; /** * repair all dead borns on page. * * @param big the whole page * @param fileBeingProcessed file we are processing * * @return corrected page. */ private static String repairDeadBornsForPage( final String big, final File fileBeingProcessed ) { // do not change to StringBuilder final StringBuffer sb = new StringBuffer( big.length() * 2 ); final Matcher dead = FAMOUS_DEAD_PATTERN.matcher( big ); while ( dead.find() ) { final String replacement = repairOneBornMacro( dead.group( 1 ), dead.group( 2 ), dead.group( 3 ), dead.group( 4 ), fileBeingProcessed ); if ( replacement != null ) { dead.appendReplacement( sb, Matcher.quoteReplacement( replacement ) ); } } dead.appendTail( sb ); return sb.toString(); } /** * repair all living borns on page. * * @param big the whole page * @param fileBeingProcessed file we are processing * * @return corrected page. */ static String repairFamousBornsForPage( final String big, final File fileBeingProcessed ) { return repairDeadBornsForPage( repairLivingBornsForPage( big, fileBeingProcessed ), fileBeingProcessed ); } /** * repair all living borns on page. * * @param big the whole page * @param fileBeingProcessed file we are processing * * @return corrected page. */ private static String repairLivingBornsForPage( final String big, final File fileBeingProcessed ) { // do not change to StringBuilder final StringBuffer sb = new StringBuffer( big.length() * 2 ); final Matcher living = FAMOUS_LIVING_PATTERN.matcher( big ); // find names of authors and birth and death dates inside and outside Blockquotes while ( living.find() ) { final String replacement = repairOneBornMacro( living.group( 1 ), living.group( 2 ), living.group( 3 ), "", fileBeingProcessed ); if ( replacement != null ) { living.appendReplacement( sb, Matcher.quoteReplacement( replacement ) ); } } living.appendTail( sb ); return sb.toString(); } /** * Replace missing or non-matching date * * @param saidOrFamous string "said" or "famous" which type of author * @param allAuthors author * @param birthdate unknown or yyyy-mm-dd * @param deathdate "" or yyyy-mm-dd * @param fileBeingProcessed file we are processing * * @return replacement string or null if no change needed. */ private static String repairOneBornMacro( String saidOrFamous, String allAuthors, final String birthdate, final String deathdate, final File fileBeingProcessed ) { allAuthors = DeEntifyStrings.stripHTMLTags( allAuthors ); final String[] individualAuthors = MULTI_AUTHOR_SPLIT.split( allAuthors ); final String author = individualAuthors.length >= 1 ? individualAuthors[ 0 ].trim() : ""; final String authorFlat = DeEntifyStrings.deEntifyHTML( author, ' ' ); final AuthorBio old = AuthorBio.get( authorFlat ); final AuthorBio recent = new AuthorBio( author, birthdate, deathdate, fileBeingProcessed ); // we don't keep that AuthorBio object unless it is the first for this author. if ( old != null && !( old.getBirthdate().equals( birthdate ) && old.getDeathdate().equals( deathdate ) ) ) { corrected++; AuthorWarning.warn( "inconsistent birth/death dates. Attempting correction.", old, recent ); final FastCat sb = new FastCat( 10 ); sb.append( "" ); sb.append( allAuthors ); sb.append( "" ); sb.append( " " ); // leave out checked. It is not trusted now. return sb.toString(); } else { return null; } } // no main }