* 2.5 2014-05-29 new way of doing dup detection that ignores class. * 2.6 2016-06-04 now use Configuration instead of command line */ package com.mindprod.seesort; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.fastcat.FastCat; import com.mindprod.filter.ExtensionListFilter; import com.mindprod.go.GoList; import com.mindprod.htmlmacros.macro.Global; import com.mindprod.hunkio.HunkIO; import org.jetbrains.annotations.NotNull; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.regex.Pattern; import static com.mindprod.entities.DeEntifyStrings.flattenHTML; import static java.lang.System.*; /** * Sorts and repairs HTML references. *

* sort links in alphabetical order. Only works on directories for which there is an index(or a dummy empty index). * Removes duplicates. Must match url and name, ignore case. * * @author Roedy Green, Canadian Mind Products * @version 2.6 2016-06-04 now use Configuration instead of command line * @see com.mindprod.sortaliases.SortAliases * @since 1997 */ public final class SeeSort { // declarations private static final int FIRST_COPYRIGHT_YEAR = 1997; private static final boolean DEBUGGING = false; private static final int endMarkerLength = "

"; private static final String startMarker2 = ">"; /** * used to sort see items in order. */ private static final Comparator tidyComparator = new SeeLine(); private static final Pattern LINE_SPLITTER = Pattern .compile( "
|
|
|

|
|

", Pattern.CASE_INSENSITIVE ); /** * list of known words and links */ private static GoList goList; /** * keyed lookup back to the GoList */ private static HashMap lookup; // /declarations /** * Constructor */ private SeeSort() { } // methods /** * informational message * * @param message text of error message. * @param fileBeingProcessed file were trouble was * @param near word in file where trouble was near. */ private static void info( final String message, final File fileBeingProcessed, final String near ) { final FastCat sb = new FastCat( 7 ); sb.append( "\n Note: ", message, " in file: ", EIO.getCanOrAbsPath( fileBeingProcessed ) ); if ( near != null ) { sb.append( " near: ", near ); } sb.append( "\n" ); err.println( sb.toString() ); }// /method private static void readForSeeSort( File serFile ) { // read forseesort.ser. Use to repair links without URLs. if ( serFile.exists() ) { try { final InputStream is = new FileInputStream( serFile ); goList = GoList.getData( is ); // index the golist int size = goList.size(); assert size < 20000 : "giant index " + size; lookup = new HashMap<>( size * 3 / 2 ); // add human names from index to hashMap so we can see if they are // fixable. Lookup takes name of link and gets you index into goList for details. for ( int i = 0; i < size; i++ ) { lookup.put( goList.getHumanname( i ).toLowerCase() /* key */, i /* value */ ); } } catch ( Exception e ) { err.println( "Exception fetching " + EIO.getCanOrAbsPath( serFile ) ); err.println( e.getMessage() ); exit( 1 ); } } else { goList = null; lookup = null; } } /** * find see-style references, sort each group, and repair missing links on one entire file. * * @param fileBeingProcessed the file currently being processed. * * @throws IOException if cannot read file */ @SuppressWarnings( { "JavaDoc" } ) private static void sortLinksInFile( File fileBeingProcessed ) throws IOException { String big = HunkIO.readEntireFile( fileBeingProcessed ); String result = sortLinksInString( big, fileBeingProcessed ); if ( result.equals( big ) ) { // nothing changed. No need to write results. return; } // generate output into a temporary file until we are sure all is ok. // create a temp file in the same directory as filename // create a tempFile in the same directory as // the input file we have just processed. final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed ); FileWriter emit = new FileWriter( tempFile ); emit.write( result ); emit.close(); // successfully created output in same directory as input, // Now make it replace the input file. HunkIO.deleteAndRename( tempFile, fileBeingProcessed ); }// /method /** * sort and fix groups of see references in this string representing an entire file * * @param big entire file contents containing

links * @param fileBeingProcessed the file currently being processed. * * @return same string with links repaired, sorted, and tidied. */ private static String sortLinksInString( String big, File fileBeingProcessed ) { final String fileBeingProcessedName = fileBeingProcessed.getName(); final int originalLength = big.length(); final StringBuilder sb = new StringBuilder( originalLength + 1000 ); int i = 0; // i indexes char in the big string, goes in hops. // We might find several

final int startMarkerLength; String middle = big.substring( start + startMarker.length() ); if ( middle.startsWith( startMarker1 ) ) { startMarkerLength = startMarker.length() + startMarker1.length(); } else if ( middle.startsWith( startMarker2 ) ) { startMarkerLength = startMarker.length() + startMarker2.length(); } else { throw new IllegalArgumentException( "Invalid syntax " + big.substring( start, start + 30 ) + "\n Should be

" ); } // copy over stuff in front of the . From then an are . final int end = big.indexOf( endMarker, start + startMarkerLength ); if ( end < 0 ) { // we don't just print a warning. We must abort processing // this file. We could // do it serious damage. throw new IllegalArgumentException( "Unmatched " + startMarker + " ... " + endMarker ); } // we will soon handle all between, so mark it handled now. i = end + endMarkerLength; // contents of

final String see = big.substring( start + startMarkerLength, end ); // the Start marker has already been appended. // Split contents of see division into lines at
// Then slit lines with two entries into multiple lines. final String[] lines = splitMoreFinely( LINE_SPLITTER.split( see ) ); // put into alpha order by lines so can detect dups Arrays.sort( lines, tidyComparator ); // used to detect duplicates Comparator compareDupSeeLines = new CompareDupSeeLines(); String prev = ""; boolean needSeparator = false; for ( String line : lines ) { if ( line == null || line.length() == 0 ) { // ignore continue; } // line might contain embedded \n which we leave intact. // trim lead trail spaces, \n, Collapse double spaces, // control chars. // 'line' is a line, a phrase, or a link to somewhere, // -- the contents between <br> line = ST.condense( line ); // remove trailing dots. while ( line.endsWith( "." ) ) { line = line.substring( 0, line.length() - 1 ).trim(); } // ignore empty lines. if ( line.length() == 0 ) { continue; } if ( compareDupSeeLines.compare( line, prev ) == 0 ) { // Eliminate dup, ignore this one. // Both link and text describing link must match to be considered a dup info( line + " duplicate removed", fileBeingProcessed, null ); // toss the second one. continue; } if ( needSeparator ) { // put each on own line, easier to proofread // with <br> separator sb.append( CANONICAL_BR ); } else { // on all but the first, we need a separator in front. needSeparator = true; } // if has double or double " ) ) { warn( "extraneous \".html\"", fileBeingProcessed, line ); } if ( line.contains( "." ) ) { // Dot is ok, just not right after . We are enforcing consistent style here warn( "extraneous trailing dot after ", fileBeingProcessed, line ); } if ( line.contains( "," ) ) { // comma is ok, just not right after . We are enforcing consistent style here warn( "extraneous trailing comma after . Use : instead", fileBeingProcessed, line ); } // TODO: should put code in here to deal with class="xxx" if ( line.startsWith( " suppress a second // one, still might get one extra needSeparator = false; } else { // append everything ... sb.append( line ); } } else if ( line.startsWith( "