/*
* [SeeSort.java]
*
* Summary: Sorts and repairs HTML references.
*
* Copyright: (c) 1997-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 2.0 2005-06-16 split off from Compactor
* 2.1 2005-06-16 use
* com.mindprod.entities.stripHTMLTags and stripEntities instead of a
* crude TidyKey to fix sort problem. futures: catch dup when it is
* created. warning if have a dup in the alpha part of a link even if
* link differs.
* 2.2 2006-03-05 reformat with IntelliJ and add Javadoc.
* 2.3 2009-04-14 automatically correct missing <br> between links. treat
* further tags as separators too (the tag names were lost from this note).
* 2.4 2014-04-16 change marker to
* 2.5 2014-05-29 new way of doing dup detection that ignores class.
* 2.6 2016-06-04 now use Configuration instead of command line
*/
package com.mindprod.seesort;
import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.entities.DeEntifyStrings;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.go.GoList;
import com.mindprod.htmlmacros.macro.Global;
import com.mindprod.hunkio.HunkIO;
import org.jetbrains.annotations.NotNull;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
import static com.mindprod.entities.DeEntifyStrings.flattenHTML;
import static java.lang.System.*;
/**
* Sorts and repairs HTML references.
*
* Sorts links in alphabetical order. Only works on directories for which there is an index (or a dummy empty index).
* Removes duplicates. To count as a duplicate, both url and name must match, ignoring case.
*
* @author Roedy Green, Canadian Mind Products
* @version 2.6 2016-06-04 now use Configuration instead of command line
* @see com.mindprod.sortaliases.SortAliases
* @since 1997
*/
public final class SeeSort
{
// declarations
/**
* first year of the copyright range, see EMBEDDED_COPYRIGHT.
*/
private static final int FIRST_COPYRIGHT_YEAR = 1997;
/**
* debugging switch, off here. Its uses are not visible in this section.
*/
private static final boolean DEBUGGING = false;
// Added to the index of the end marker in sortLinksInString to step past that marker.
// NOTE(review): the literal below spans a line break, which is not a legal Java string;
// it looks like its text was lost in extraction -- restore from the original source.
private static final int endMarkerLength = "
".length();
/**
* ideal splitter, the one we tidy to; appended between entries by sortLinksInString.
* NOTE(review): presumably the literal is a br tag followed by a newline, so that the tag ends
* a line rather than sitting on a line by itself -- the tag text is missing here, confirm.
*/
private static final String CANONICAL_BR = "
\n";
/**
* undisplayed copyright notice
*/
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String EMBEDDED_COPYRIGHT =
"Copyright: (c) 1997-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";
/**
* date of this release, ISO yyyy-mm-dd.
*/
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String RELEASE_DATE = "2016-06-04";
/**
* how to use the command line
*/
private static final String USAGE = "\nSeeSort needs the name of a configuration on the command line.";
/**
* embedded version string.
*/
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String VERSION_STRING = "2.6";
// Markers that delimit the group of links handled by sortLinksInString.
// NOTE(review): endMarker and startMarker are empty strings here, and startMarker1, which
// sortLinksInString references, is not declared in this section. The marker text looks lost
// in extraction -- confirm against the original source before relying on these values.
private static final String endMarker = "";
private static final String startMarker = "";
private static final String startMarker2 = ">";
/**
* used to sort see items in order.
*/
private static final Comparator
tidyComparator = new SeeLine();
// Case-insensitive pattern used by sortLinksInString to split the text between the markers into lines.
// NOTE(review): the alternatives of the pattern are empty and the literal spans several lines;
// the separator tags appear to have been lost in extraction -- restore from the original source.
private static final Pattern LINE_SPLITTER = Pattern
.compile( "
|
|
||
|||", Pattern.CASE_INSENSITIVE );
/**
* list of known words and links. Set by readForSeeSort; null when the serialized file does not exist.
*/
private static GoList goList;
/**
* keyed lookup back to the GoList: lower-cased human name to index in goList.
* Set by readForSeeSort; null when the serialized file does not exist.
*/
private static HashMap lookup;
// /declarations
/**
* Private no-argument constructor with an empty body.
* Being private, it cannot be called from outside this class.
*/
private SeeSort()
{
}
// methods
/**
* Prints an informational note about a file on the error stream.
*
* @param message            text of the note.
* @param fileBeingProcessed file the note is about; its canonical-or-absolute path is shown.
* @param near               word in the file near the point of interest, or null to leave out the "near" part.
*/
private static void info( final String message, final File fileBeingProcessed, final String near )
{
    final String path = EIO.getCanOrAbsPath( fileBeingProcessed );
    // 7 matches the number of strings appended below when near is present: 4 + 2 + 1.
    final FastCat note = new FastCat( 7 );
    note.append( "\n Note: ", message, " in file: ", path );
    if ( null != near )
    {
        note.append( " near: ", near );
    }
    note.append( "\n" );
    final String text = note.toString();
    err.println( text );
}// /method
/**
* Reads the serialized GoList (forseesort.ser) and indexes it by human name.
* The result is used to repair links without URLs.
*
* If serFile exists, goList is loaded from it and lookup is rebuilt so that the lower-cased
* human name of each entry maps to its index in goList. If serFile does not exist, both
* goList and lookup are set to null.
*
* On any exception while loading, the file name and the exception message are printed on the
* error stream and the program exits with status 1.
*
* @param serFile the serialized GoList file; it need not exist.
*/
private static void readForSeeSort( File serFile )
{
    if ( !serFile.exists() )
    {
        goList = null;
        lookup = null;
        return;
    }
    // try-with-resources: the stream is closed whether or not loading succeeds,
    // and before the catch block below gets a chance to call exit.
    try ( InputStream is = new FileInputStream( serFile ) )
    {
        goList = GoList.getData( is );
        // index the golist
        final int size = goList.size();
        assert size < 20000 : "giant index " + size;
        lookup = new HashMap<>( size * 3 / 2 );
        // add human names from index to hashMap so we can see if they are
        // fixable. Lookup takes name of link and gets you index into goList for details.
        for ( int i = 0; i < size; i++ )
        {
            lookup.put( goList.getHumanname( i ).toLowerCase() /* key */, i /* value */ );
        }
    }
    catch ( Exception e )
    {
        err.println( "Exception fetching " + EIO.getCanOrAbsPath( serFile ) );
        err.println( e.getMessage() );
        exit( 1 );
    }
}
/**
* find see-style references, sort each group, and repair missing links on one entire file.
*
* The whole file is read, passed through sortLinksInString, and rewritten only if the text
* changed. The new text goes to a temporary file first, which then replaces the original.
* An IllegalArgumentException thrown by sortLinksInString (bad or unmatched markers) is not
* caught here; in that case nothing is written.
*
* @param fileBeingProcessed the file currently being processed.
*
* @throws IOException if cannot read file, or cannot write the temporary file.
*/
@SuppressWarnings( { "JavaDoc" } )
private static void sortLinksInFile( File fileBeingProcessed ) throws IOException
{
    final String big = HunkIO.readEntireFile( fileBeingProcessed );
    final String result = sortLinksInString( big, fileBeingProcessed );
    if ( result.equals( big ) )
    {
        // nothing changed. No need to write results.
        return;
    }
    // generate output into a temporary file until we are sure all is ok.
    // The temp file is created via HunkIO.createTempFile, passing the input file
    // we have just processed.
    final File tempFile =
            HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed );
    // try-with-resources: the writer is closed even when write fails,
    // so the file handle is not leaked on the error path.
    // NOTE(review): FileWriter uses the platform default charset -- confirm this matches
    // how HunkIO.readEntireFile decodes the file.
    try ( FileWriter emit = new FileWriter( tempFile ) )
    {
        emit.write( result );
    }
    // successfully created the temporary output,
    // Now make it replace the input file.
    HunkIO.deleteAndRename( tempFile, fileBeingProcessed );
}// /method
/**
* sort and fix groups of see references in this string representing an entire file
*
* @param big entire file contents containing links
* @param fileBeingProcessed the file currently being processed.
*
* @return same string with links repaired, sorted, and tidied.
*/
private static String sortLinksInString( String big, File fileBeingProcessed )
{
final String fileBeingProcessedName = fileBeingProcessed.getName();
final int originalLength = big.length();
final StringBuilder sb = new StringBuilder( originalLength + 1000 );
int i = 0;
// i indexes char in the big string, goes in hops.
// We might find several
or
final int startMarkerLength;
String middle = big.substring( start + startMarker.length() );
if ( middle.startsWith( startMarker1 ) )
{
startMarkerLength = startMarker.length() + startMarker1.length();
}
else if ( middle.startsWith( startMarker2 ) )
{
startMarkerLength = startMarker.length() + startMarker2.length();
}
else
{
throw new IllegalArgumentException( "Invalid syntax "
+ big.substring( start, start + 30 )
+ "\n Should be
or
" );
}
// copy over stuff in front of the
. From then an are
.
final int end = big.indexOf( endMarker, start + startMarkerLength );
if ( end < 0 )
{
// we don't just print a warning. We must abort processing
// this file. We could
// do it serious damage.
throw new IllegalArgumentException( "Unmatched "
+ startMarker
+ " ... "
+ endMarker );
}
// we will soon handle all between, so mark it handled now.
i = end + endMarkerLength;
// contents of
final String see = big.substring( start + startMarkerLength, end );
// the Start marker has already been appended.
// Split contents of see division into lines at
// Then slit lines with two entries into multiple lines.
final String[] lines = splitMoreFinely( LINE_SPLITTER.split( see ) );
// put into alpha order by lines so can detect dups
Arrays.sort( lines, tidyComparator );
// used to detect duplicates
Comparator
compareDupSeeLines = new CompareDupSeeLines();
String prev = "";
boolean needSeparator = false;
for ( String line : lines )
{
if ( line == null || line.length() == 0 )
{
// ignore
continue;
}
// line might contain embedded \n which we leave intact.
// trim lead trail spaces, \n, Collapse double spaces,
// control chars.
// 'line' is a line, a phrase, or a link to somewhere,
// -- the contents between <br>
line = ST.condense( line );
// remove trailing dots.
while ( line.endsWith( "." ) )
{
line = line.substring( 0, line.length() - 1 ).trim();
}
// ignore empty lines.
if ( line.length() == 0 )
{
continue;
}
if ( compareDupSeeLines.compare( line, prev ) == 0 )
{
// Eliminate dup, ignore this one.
// Both link and text describing link must match to be considered a dup
info( line + " duplicate removed", fileBeingProcessed, null );
// toss the second one.
continue;
}
if ( needSeparator )
{
// put each on own line, easier to proofread
// with <br> separator
sb.append( CANONICAL_BR );
}
else
{
// on all but the first, we need a separator in front.
needSeparator = true;
}
// if has double or double " ) )
{
warn( "extraneous \".html\"", fileBeingProcessed, line );
}
if ( line.contains( "." ) )
{
// Dot is ok, just not right after . We are enforcing consistent style here
warn( "extraneous trailing dot after ", fileBeingProcessed, line );
}
if ( line.contains( "," ) )
{
// comma is ok, just not right after . We are enforcing consistent style here
warn( "extraneous trailing comma after . Use : instead", fileBeingProcessed, line );
}
// TODO: should put code in here to deal with class="xxx"
if ( line.startsWith( " suppress a second
// one, still might get one extra
needSeparator = false;
}
else
{
// append everything ...
sb.append( line );
}
}
else if ( line.startsWith( "