/*
 * [TidyOCR.java]
 *
 * Summary: Tidy up raw OCR txt and convert to html.
 *
 * Copyright: (c) 2013-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2013-03-03 initial version
 */
package com.mindprod.repair;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Tidy up raw OCR txt and convert to html.
 * <p/>
 * Add <p></p> around each line
 * convert ' to &rsquo;
 * convert quotes to a span
 * convert - to &mdash;
 * collapse lines into paragraphs.
 * Capitalise first word of each sentence.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2013-03-03 initial version
 * @since 2013-03-03
 */
public class TidyOCR
    {
    /**
     * year first released
     */
    private static final int FIRST_COPYRIGHT_YEAR = 2013;

    /**
     * CSS class applied to a quoted phrase, i.e. quoted text that contains at least one space.
     */
    private static final String CSS_FOR_QUOTED_PHRASE = "quoted";

    /**
     * CSS class applied to a single quoted word, i.e. quoted text that contains no space.
     */
    private static final String CSS_FOR_QUOTED_WORD = "scare";

    /**
     * undisplayed copyright notice.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String EMBEDDED_COPYRIGHT =
            "Copyright: (c) 2013-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";

    /**
     * date this version released.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String RELEASE_DATE = "2013-03-06";

    /**
     * how to use the program
     */
    private static final String USAGE = "\nTidyOCR requires a list of .txt files to process. -q means all " +
                                        "files in dirs. Results in *.html";

    /**
     * embedded version string.
     *
     * @noinspection UnusedDeclaration
     */
    private static final String VERSION_STRING = "1.1";

    /**
     * Finds a sentence break: one of {@code . ? !}, then whitespace, then a lower-case ASCII letter.
     * Group 1 is the punctuation, group 2 the letter that should have been a capital.
     */
    private static final Pattern SENTENCE_FINDER = Pattern.compile( "([\\.\\?\\!])\\s+([a-z])" /* case-sensitive */ );

    /**
     * Finds a line break in the middle of a paragraph: a line ending followed, after optional whitespace,
     * by a lower-case ASCII letter. Group 1 is that letter.
     */
    private static final Pattern BAD_BREAK_FINDER = Pattern.compile( "(?:\\r\\n|\\n)\\s*([a-z])" /* case-sensitive */ );

    /**
     * Splits text into lines on either Windows (CR LF) or Unix (LF) line endings.
     */
    private static final Pattern LINE_SPLITTER = Pattern.compile( "\\r\\n|\\n" );

    /**
     * Wrap every line of the text in {@code <p>}...{@code </p>}.
     * By the time this is called each line is expected to hold one whole paragraph.
     *
     * @param contents text provided by scanner, one paragraph per line.
     *
     * @return one big string with each trimmed line wrapped in paragraph tags and terminated by a newline.
     */
    private static String applyParagraphMarkers( String contents )
        {
        final String[] paragraphs = LINE_SPLITTER.split( contents );
        // three pieces are appended per line: open tag, text, close tag
        final FastCat sb = new FastCat( paragraphs.length * 3 );
        for ( String paragraph : paragraphs )
            {
            sb.append( "<p>" );
            sb.append( paragraph.trim() );
            sb.append( "</p>\n" );
            }
        return sb.toString();
        }

    /**
     * Read xxx.txt, tidy it, and write the result as xxx.html beside it.
     *
     * @param file txt file to tidy up.
     *
     * @throws IOException if the file cannot be read or the html file cannot be written.
     */
    private static void convertOneFile( File file ) throws IOException
        {
        // R E A D
        String contents = HunkIO.readEntireFile( file, HunkIO.UTF8 );
        // P R O C E S S
        // Order matters: sentences are capitalised before paragraph breaks are repaired, because
        // the break repair looks for lines that still start with a lower-case letter.
        contents = fixEntities( contents );
        contents = fixSentences( contents );
        contents = fixParagraphBreaks( contents );
        contents = fixQuotes( contents );
        contents = applyParagraphMarkers( contents );
        // W R I T E
        final String name = EIO.getCanOrAbsPath( file );
        final String htmlname = ST.chopTrailingString( name, ".txt" ) + ".html";
        HunkIO.writeEntireFile( new File( htmlname ), contents, HunkIO.UTF8 );
        }

    /**
     * Normalise whitespace and convert characters the scanner software produces into HTML entities.
     * <ul>
     * <li>runs of tabs/spaces collapse to a single space</li>
     * <li>three or more dots become an ellipsis entity</li>
     * <li>a stand-alone lower-case i between spaces becomes I</li>
     * <li>an apostrophe becomes a right single quote entity</li>
     * <li>a double hyphen, U+2014 or U+FFFD becomes an em dash entity surrounded by spaces</li>
     * </ul>
     * NOTE(review): {@code &}, {@code <} and {@code >} already present in the text are passed through
     * unescaped; confirm the OCR input never contains them before relying on the output being valid HTML.
     *
     * @param contents of file
     *
     * @return entified contents
     */
    private static String fixEntities( String contents )
        {
        contents = contents.replaceAll( "[\t ]+", " " );
        contents = contents.replaceAll( "\\.{3,}", "&hellip;" );
        contents = contents.replaceAll( " i ", " I " );
        contents = contents.replaceAll( "'", "&rsquo;" );
        contents = contents.replaceAll( "\\-\\-", " &mdash; " );
        contents = contents.replaceAll( "\u2014", " &mdash; " );
        // U+FFFD is the Unicode replacement character; presumably it shows up where the
        // scanner's dash could not be decoded -- TODO confirm.
        contents = contents.replaceAll( "\ufffd", " &mdash; " );
        return contents;
        }

    /**
     * Normally a line is a paragraph, but sometimes a line is just a line.
     * A line that starts with a lower-case letter is glued onto the previous line with a single space,
     * so that afterwards each line is a paragraph.
     *
     * @param contents from scanner
     *
     * @return contents with some paragraph breaks removed.
     */
    private static String fixParagraphBreaks( String contents )
        {
        // do not change to StringBuilder: Matcher.appendReplacement only accepts StringBuffer on JDK 1.8
        final StringBuffer sb = new StringBuffer( contents.length() );
        final Matcher m = BAD_BREAK_FINDER.matcher( contents );
        while ( m.find() )
            {
            // drop line break. group 1 is a single letter, so it needs no escaping as a replacement string.
            m.appendReplacement( sb, " " + m.group( 1 ) );
            }
        m.appendTail( sb );
        return sb.toString();
        }

    /**
     * Pair up the double quotes on each line and replace each pair, together with the text it encloses,
     * by a span. A quotation containing a space gets class {@link #CSS_FOR_QUOTED_PHRASE}, otherwise
     * {@link #CSS_FOR_QUOTED_WORD}. The quote characters themselves are not kept for matched pairs.
     * <p>
     * A quote with no partner on the same line is reported on System.err and left in the output
     * as a literal quote character, so the text is not silently altered.
     *
     * @param contents contents of file, one paragraph per line.
     *
     * @return contents with " paired and changed to span, each line trimmed and terminated by a newline.
     */
    private static String fixQuotes( String contents )
        {
        // we don't let "s span paragraphs
        final String[] lines = LINE_SPLITTER.split( contents );
        final FastCat sb = new FastCat( contents.length() / 5 );
        for ( String rawLine : lines )
            {
            final String line = rawLine.trim();
            int start = 0;
            while ( start < line.length() )
                {
                final int open = line.indexOf( '"', start );
                if ( open < 0 )
                    {
                    // text after last quote
                    sb.append( line.substring( start ) );
                    break;
                    }
                // text prior to quote
                sb.append( line.substring( start, open ) );
                final int close = line.indexOf( '"', open + 1 );
                if ( close < 0 )
                    {
                    err.println( "Unmatched quotes in " + line );
                    // keep the stray quote and everything after it exactly as scanned
                    sb.append( line.substring( open ) );
                    break;
                    }
                final String quote = line.substring( open + 1, close );
                final String cssClass = quote.contains( " " ) ? CSS_FOR_QUOTED_PHRASE : CSS_FOR_QUOTED_WORD;
                sb.append( "<span class=\"" + cssClass + "\">" );
                sb.append( quote );
                sb.append( "</span>" );
                start = close + 1;
                }
            sb.append( '\n' );
            }
        return sb.toString();
        }

    /**
     * Make sure sentences start with a capital letter.
     * Whatever whitespace separated the two sentences is replaced by a single space.
     *
     * @param contents contents of file
     *
     * @return contents where each lower-case letter that follows sentence-ending punctuation is capitalised.
     */
    private static String fixSentences( String contents )
        {
        // do not change to StringBuilder: Matcher.appendReplacement only accepts StringBuffer on JDK 1.8
        final StringBuffer sb = new StringBuffer( contents.length() );
        final Matcher m = SENTENCE_FINDER.matcher( contents );  // Matchers are used both for matching and finding.
        while ( m.find() )
            {
            final char capital = Character.toUpperCase( m.group( 2 ).charAt( 0 ) );
            // groups hold only . ? ! or a letter, so nothing here needs escaping as a replacement string.
            m.appendReplacement( sb, m.group( 1 ) + " " + capital );
            }
        m.appendTail( sb );
        return sb.toString();
        }

    /**
     * tidy results of Omnipage 18 ready for HTML.
     * Each file is converted independently; a failure on one file is reported on System.err
     * and processing continues with the next.
     *
     * @param args dirs and files to process
     *
     * @throws IllegalArgumentException if the command line selects no files.
     */
    public static void main( final String[] args )
        {
        final CommandLine commandLine = new CommandLine( args,
                new AllButSVNDirectoriesFilter(),
                new ExtensionListFilter( "txt" ) );
        if ( commandLine.size() == 0 )
            {
            throw new IllegalArgumentException( "No files found to process\n" + USAGE );
            }
        for ( File file : commandLine )
            {
            try
                {
                convertOneFile( file );
                }
            catch ( FileNotFoundException e )
                {
                err.println( "Error: "
                             + EIO.getCanOrAbsPath( file )
                             + " not found." );
                }
            catch ( Exception e )
                {
                // some exceptions carry no message; fall back to the exception itself
                // so the report never reads "null in file ..."
                final String reason = e.getMessage() != null ? e.getMessage() : e.toString();
                err.println( reason
                             + " in file "
                             + EIO.getCanOrAbsPath( file ) );
                }
            } // end for
        }
    }