/* * [SeeSort.java] * * Summary: Sorts and repairs HTML references. * * Copyright: (c) 1997-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 2.0 2005-06-16 split off from Compactor * 2.1 2005-06-16 use * com.mindprod.entities.stripHTMLTags and stripEntities instead of a * crude TidyKey to fix sort problem. futures: catch dup when it is * created. warning if have a dup in the alpha part of a link even if * link differs. * 2.2 2006-03-05 reformat with IntelliJ and add Javadoc. * 2.3 2009-04-14 automatically correct missing
between links. treat
  • and
  • and separators too * 2.4 2014-04-16 change marker to
    * 2.5 2014-05-29 new way of doing dup detection that ignores class. * 2.6 2016-06-04 now use Configuration instead of command line */ package com.mindprod.seesort; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.fastcat.FastCat; import com.mindprod.filter.ExtensionListFilter; import com.mindprod.go.GoList; import com.mindprod.htmlmacros.macro.Global; import com.mindprod.hunkio.HunkIO; import org.jetbrains.annotations.NotNull; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.regex.Pattern; import static com.mindprod.entities.DeEntifyStrings.flattenHTML; import static java.lang.System.*; /** * Sorts and repairs HTML references. *

    * sort links in alphabetical order. Only works on directories for which there is an index(or a dummy empty index). * Removes duplicates. Must match url and name, ignore case. * * @author Roedy Green, Canadian Mind Products * @version 2.6 2016-06-04 now use Configuration instead of command line * @see com.mindprod.sortaliases.SortAliases * @since 1997 */ public final class SeeSort { // declarations private static final int FIRST_COPYRIGHT_YEAR = 1997; private static final boolean DEBUGGING = false; private static final int endMarkerLength = "

    ".length(); /** * ideal splitter, one we tidy to.
    will end on end of line, not on a line by itself */ private static final String CANONICAL_BR = "
    \n"; /** * undisplayed copyright notice */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 1997-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; @SuppressWarnings( { "UnusedDeclaration" } ) private static final String RELEASE_DATE = "2016-06-04"; /** * how to use the command line */ private static final String USAGE = "\nSeeSort needs the name of a configuration on the command line."; /** * embedded version string. */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String VERSION_STRING = "2.6"; private static final String endMarker = ""; private static final String startMarker = "
    "; private static final String startMarker2 = ">"; /** * used to sort see items in order. */ private static final Comparator tidyComparator = new SeeLine(); private static final Pattern LINE_SPLITTER = Pattern .compile( "
    |
    |
    |

    |

  • |
    |

    |
  • ", Pattern.CASE_INSENSITIVE ); /** * list of known words and links */ private static GoList goList; /** * keyed lookup back to the GoList */ private static HashMap lookup; // /declarations /** * Constructor */ private SeeSort() { } // methods /** * informational message * * @param message text of error message. * @param fileBeingProcessed file were trouble was * @param near word in file where trouble was near. */ private static void info( final String message, final File fileBeingProcessed, final String near ) { final FastCat sb = new FastCat( 7 ); sb.append( "\n Note: ", message, " in file: ", EIO.getCanOrAbsPath( fileBeingProcessed ) ); if ( near != null ) { sb.append( " near: ", near ); } sb.append( "\n" ); err.println( sb.toString() ); }// /method private static void readForSeeSort( File serFile ) { // read forseesort.ser. Use to repair links without URLs. if ( serFile.exists() ) { try { final InputStream is = new FileInputStream( serFile ); goList = GoList.getData( is ); // index the golist int size = goList.size(); assert size < 20000 : "giant index " + size; lookup = new HashMap<>( size * 3 / 2 ); // add human names from index to hashMap so we can see if they are // fixable. Lookup takes name of link and gets you index into goList for details. for ( int i = 0; i < size; i++ ) { lookup.put( goList.getHumanname( i ).toLowerCase() /* key */, i /* value */ ); } } catch ( Exception e ) { err.println( "Exception fetching " + EIO.getCanOrAbsPath( serFile ) ); err.println( e.getMessage() ); exit( 1 ); } } else { goList = null; lookup = null; } } /** * find see-style references, sort each group, and repair missing links on one entire file. * * @param fileBeingProcessed the file currently being processed. 
* * @throws IOException if cannot read file */ @SuppressWarnings( { "JavaDoc" } ) private static void sortLinksInFile( File fileBeingProcessed ) throws IOException { String big = HunkIO.readEntireFile( fileBeingProcessed ); String result = sortLinksInString( big, fileBeingProcessed ); if ( result.equals( big ) ) { // nothing changed. No need to write results. return; } // generate output into a temporary file until we are sure all is ok. // create a temp file in the same directory as filename // create a tempFile in the same directory as // the input file we have just processed. final File tempFile = HunkIO.createTempFile( "temp_", ".tmp", fileBeingProcessed ); FileWriter emit = new FileWriter( tempFile ); emit.write( result ); emit.close(); // successfully created output in same directory as input, // Now make it replace the input file. HunkIO.deleteAndRename( tempFile, fileBeingProcessed ); }// /method /** * sort and fix groups of see references in this string representing an entire file * * @param big entire file contents containing
    links * @param fileBeingProcessed the file currently being processed. * * @return same string with links repaired, sorted, and tidied. */ private static String sortLinksInString( String big, File fileBeingProcessed ) { final String fileBeingProcessedName = fileBeingProcessed.getName(); final int originalLength = big.length(); final StringBuilder sb = new StringBuilder( originalLength + 1000 ); int i = 0; // i indexes char in the big string, goes in hops. // We might find several
    or
    final int startMarkerLength; String middle = big.substring( start + startMarker.length() ); if ( middle.startsWith( startMarker1 ) ) { startMarkerLength = startMarker.length() + startMarker1.length(); } else if ( middle.startsWith( startMarker2 ) ) { startMarkerLength = startMarker.length() + startMarker2.length(); } else { throw new IllegalArgumentException( "Invalid syntax " + big.substring( start, start + 30 ) + "\n Should be
    or
    " ); } // copy over stuff in front of the . From then an are . final int end = big.indexOf( endMarker, start + startMarkerLength ); if ( end < 0 ) { // we don't just print a warning. We must abort processing // this file. We could // do it serious damage. throw new IllegalArgumentException( "Unmatched " + startMarker + " ... " + endMarker ); } // we will soon handle all between, so mark it handled now. i = end + endMarkerLength; // contents of
    final String see = big.substring( start + startMarkerLength, end ); // the Start marker has already been appended. // Split contents of see division into lines at
    // Then slit lines with two entries into multiple lines. final String[] lines = splitMoreFinely( LINE_SPLITTER.split( see ) ); // put into alpha order by lines so can detect dups Arrays.sort( lines, tidyComparator ); // used to detect duplicates Comparator compareDupSeeLines = new CompareDupSeeLines(); String prev = ""; boolean needSeparator = false; for ( String line : lines ) { if ( line == null || line.length() == 0 ) { // ignore continue; } // line might contain embedded \n which we leave intact. // trim lead trail spaces, \n, Collapse double spaces, // control chars. // 'line' is a line, a phrase, or a link to somewhere, // -- the contents between <br> line = ST.condense( line ); // remove trailing dots. while ( line.endsWith( "." ) ) { line = line.substring( 0, line.length() - 1 ).trim(); } // ignore empty lines. if ( line.length() == 0 ) { continue; } if ( compareDupSeeLines.compare( line, prev ) == 0 ) { // Eliminate dup, ignore this one. // Both link and text describing link must match to be considered a dup info( line + " duplicate removed", fileBeingProcessed, null ); // toss the second one. continue; } if ( needSeparator ) { // put each on own line, easier to proofread // with <br> separator sb.append( CANONICAL_BR ); } else { // on all but the first, we need a separator in front. needSeparator = true; } // if has double or double " ) ) { warn( "extraneous \".html\"", fileBeingProcessed, line ); } if ( line.contains( "." ) ) { // Dot is ok, just not right after . We are enforcing consistent style here warn( "extraneous trailing dot after ", fileBeingProcessed, line ); } if ( line.contains( "," ) ) { // comma is ok, just not right after . We are enforcing consistent style here warn( "extraneous trailing comma after . Use : instead", fileBeingProcessed, line ); } // TODO: should put code in here to deal with class="xxx" if ( line.startsWith( " suppress a second // one, still might get one extra needSeparator = false; } else { // append everything ... 
sb.append( line ); } } else if ( line.startsWith( "