/* * [TidyOCR.java] * * Summary: Tidy up raw OCR txt and convert to html. * * Copyright: (c) 2013-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2013-03-03 initial version */ package com.mindprod.repair; import com.mindprod.commandline.CommandLine; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.fastcat.FastCat; import com.mindprod.filter.AllButSVNDirectoriesFilter; import com.mindprod.filter.ExtensionListFilter; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Tidy up raw OCR txt and convert to html. *

* Add

around each line * convert ' to ’ * convert quotes to a span * convert - to — * collapse lines inte paragraphs. * Capitalise first word of each sentence. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2013-03-03 initial version * @since 2013-03-03 */ public class TidyOCR { /** * year first released */ private static final int FIRST_COPYRIGHT_YEAR = 2013; /** * CSS class to apply to quoted phrases */ private static final String CSS_FOR_QUOTED_PHRASE = "quoted"; /** * CSS class to apply to quoted words */ private static final String CSS_FOR_QUOTED_WORD = "scare"; /** * undisplayed copyright notice. * * @noinspection UnusedDeclaration */ private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2013-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; /** * date this version released. * * @noinspection UnusedDeclaration */ private static final String RELEASE_DATE = "2013-03-06"; /** * how to use the program */ private static final String USAGE = "\nTidyOCR requires a list of .txt files to process. -q means all " + "files in dirs. Results in *.html"; /** * embedded version string. * * @noinspection UnusedDeclaration */ private static final String VERSION_STRING = "1.1"; /** * finds sentence break */ private static final Pattern SENTENCE_FINDER = Pattern.compile( "([\\.\\?\\!])\\s+([a-z])" /* case-sensitive */ ); /** * find line break in middle of paragraph */ private static Pattern BAD_BREAK_FINDER = Pattern.compile( "(?:\\r\\n|\\n)\\s*([a-z])" /* case-sensitive */ ); /** * find line break in middle of paragraph */ private static Pattern LINE_SPLITTER = Pattern.compile( "\\r\\n|\\n" ); /** * apply

markers * * @param contents text provided by scanner * * @return one big string with

around each paragraph */ private static String applyParagraphMarkers( String contents ) { String[] lines = LINE_SPLITTER.split( contents ); final FastCat sb = new FastCat( lines.length * 3 ); for ( String line : lines ) { sb.append( "

" ); sb.append( line.trim() ); sb.append( "

\n" ); } return sb.toString(); } /** * read xxx.txt and create xxx.html * * @param file txt file to tidy up. * * @throws IOException */ private static void convertOneFile( File file ) throws IOException { // R E A D String contents = HunkIO.readEntireFile( file, HunkIO.UTF8 ); // P R O C E S S contents = fixEntities( contents ); contents = fixSentences( contents ); contents = fixParagraphBreaks( contents ); contents = fixQuotes( contents ); contents = applyParagraphMarkers( contents ); // W R I T E String name = EIO.getCanOrAbsPath( file ); String htmlname = ST.chopTrailingString( name, ".txt" ) + ".html"; HunkIO.writeEntireFile( new File( htmlname ), contents, HunkIO.UTF8 ); } /** * converts chars used as mdash by scanner software to — * * @param contents of file * * @return entified contents */ private static String fixEntities( String contents ) { contents = contents.replaceAll( "[\t ]+", " " ); contents = contents.replaceAll( "\\.{3,}", "…" ); contents = contents.replaceAll( " i ", " I " ); contents = contents.replaceAll( "'", "’" ); contents = contents.replaceAll( "\\-\\-", " — " ); contents = contents.replaceAll( "\u2014", " — " ); contents = contents.replaceAll( "\ufffd", " — " ); return contents; } /** * normally a line is a paragraph, but sometimes line is line. * Possibly glue onto previous line to ensure each line is a paragraph. * * @param contents from scanner * * @return contents with some paragraph breaks removed. */ private static String fixParagraphBreaks( String contents ) { // do not change to StringBuilder final StringBuffer sb = new StringBuffer( contents.length() ); final Matcher m = BAD_BREAK_FINDER.matcher( contents ); while ( m.find() ) { // drop line break m.appendReplacement( sb, " " + m.group( 1 ) ); } m.appendTail( sb ); return sb.toString(); } /** * tidy up one line of the original text file * * @param contents contents of file. * * @return contents with " paired and changed to span. */ private static String fixQuotes( String contents ) { // we don't let "s span paragraphs final String[] lines = LINE_SPLITTER.split( contents ); final FastCat sb = new FastCat( contents.length() / 5 ); for ( String line : lines ) { line = line.trim(); boolean outside = true; int start = 0; while ( start < line.length() ) { int place = line.indexOf( "\"", start ); if ( place >= 0 ) { if ( outside ) { // text prior to quote sb.append( line.substring( start, place ) ); outside = false; } else { final String quote = line.substring( start, place ); if ( quote.contains( " " ) ) { sb.append( "" ); } else { sb.append( "" ); } sb.append( quote ); sb.append( "" ); outside = true; } start = place + 1; } else { // text after last quote sb.append( line.substring( start ) ); start = line.length(); } } if ( !outside ) { err.println( "Unmatched quotes in " + line ); } sb.append( '\n' ); } return sb.toString(); } /** * Make sure sentences start with capital letter * * @param contents contents of file * * @return sentences start with capital letter */ private static String fixSentences( String contents ) { // do not change to StringBuilder final StringBuffer sb = new StringBuffer( contents.length() ); final Matcher m = SENTENCE_FINDER.matcher( contents ); // Matchers are used both for matching and finding. while ( m.find() ) { m.appendReplacement( sb, m.group( 1 ) + " " + Character.toUpperCase( m.group( 2 ).charAt( 0 ) ) ); } m.appendTail( sb ); return sb.toString(); } /** * tidy results of Omnipage 18 ready for HTML. * * @param args dirs and files to process */ public static void main( final String[] args ) { CommandLine commandLine = new CommandLine( args, new AllButSVNDirectoriesFilter(), new ExtensionListFilter( "txt" ) ); if ( commandLine.size() == 0 ) { throw new IllegalArgumentException( "No files found to process\n" + USAGE ); } for ( File file : commandLine ) { try { convertOneFile( file ); } catch ( FileNotFoundException e ) { out.println( "Error: " + EIO.getCanOrAbsPath( file ) + " not found." ); } catch ( Exception e ) { out.println( e.getMessage() + " in file " + EIO.getCanOrAbsPath( file ) ); } } // end for } }