/* * [BrokenLinks.java] * * Summary: tracks broken links detected by Xenu. * * Copyright: (c) 2008-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2008-07-21 initial version. * 1.1 2008-08-01 export all HTML in one lump, complete with headers and footers. * 1.2 2008-08-08 first version released to the public. No forgiveness on broken local links. * and text version of summary report. * 1.3 2008-10-29 handle empty URLs. * 1.4 2008-11-29 add suspectForgivenessDays * 1.5 2009-02-14 handle null status, properly delay reporting new broken links. * 1.6 2009-02-20 refactor to use new HTTP library * 1.7 2010-09-10 handle Xenu's new mailto checking. * 1.8 2010-11-14 primitive permanent redirect handling. * 1.9 2010-11-17 export of redirects working. Not yet polished. Add ReplaceURLs. * 2.0 2010-12-30 export permanent and temporary redirects separately. * 2.1 2010-12-31 correct bug in merging original and redirected urls. * 2.2 2011-01-16 and TidyURLs utility to clean up all the hrefs on your website. * 2.3 2011-02-08 handle https: redirects and faster https: link checking. * 2.4 2012-02-26 faster probing using 30 threads. Must delete history.bin to use new version. * 2.5 2012-02-28 faster probing of redirects with 30 threads. No longer need to sort/align results separately. * 2.6 2012-06-02 simpler configuration. Now uses fixed file names in the current directory. * Limit number of froms saved for speed/memory. * Don't put ignored items in list. * Faster lookup of status kind. * 2.7 2012-11-22 uses special version of Xenu that includes numeric status. * Faster. easier to configure new status codes. * 2.8 2012-11-28 faster handling of local links. * 2.9 2012-12-02 optional leave.csv file. * 3.0 2013-01-20 ensure responsecodes.ser is accessible. Clarify error messages. * 3.1 2017-03-15 try both with and without SNI on fail */ package com.mindprod.brokenlinks; import com.mindprod.amper.Amper; import com.mindprod.common18.Age; import com.mindprod.common18.BigDate; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.common18.Shuffle; import com.mindprod.common18.Twirler; import com.mindprod.csv.CSV; import com.mindprod.csv.CSVAlign; import com.mindprod.csv.CSVReader; import com.mindprod.csv.CSVSort; import com.mindprod.csv.CSVWriter; import com.mindprod.fastcat.FastCat; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.EOFException; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; import java.net.CookieHandler; import java.net.CookieManager; import java.net.CookiePolicy; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import static java.lang.System.*; /** * tracks broken links detected by Xenu. * * @author Roedy Green, Canadian Mind Products * @version 3.1 2017-03-15 try both with and without SNI on fail * @since 2008-07-21 */ public class BrokenLinks { private static final boolean DEBUGGING = false; /** * pseudo return code to mark link BrokenLinks discovered with an empty url */ private static final int BROKEN_LINKS_EMPTY_URL = -17; /** * pseudo return code to mark link BrokenLinks discovered with a malformed url */ private static final int BROKEN_LINKS_MALFORMED_URL = -18; /** * pseudo return code to mark link BrokenLinks does not test */ private static final int BROKEN_LINKS_NOT_TESTED = -19; /** * code to mark presumed good items, pseudo return code */ private static final int BROKEN_LINKS_PRESUMED_GOOD = -20; private static final int FIRST_COPYRIGHT_YEAR = 2008; /** * layout version of the history file */ private static final int HISTORY_FILE_FORMAT_VERSION = 5; /** * max elements in a HashMap */ private static final int MAXIMUM_HASHMAP_CAPACITY = 1 << 30; /** * undisplayed copyright notice */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2008-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; /** * when current version released */ private static final String RELEASE_DATE = "2017-03-15"; private static final String DO_NOT_EDIT = "\n"; /** * version */ private static final String VERSION_STRING = "3.1"; /** * where we keep our history of good and broken links */ private static HashMap history; /** * just links that were permanently redirected after reprobe. * Extracted so they can be sorted. */ private static LinkInfo[] justPermanentlyRedirecteds; /** * just links that are still broken after reprobe, not counting redirects. * Extracted so they can be sorted. */ private static LinkInfo[] justSolidlyBrokens; /** * just links that are temporarily redirected after reprobe. * Extracted so they can be sorted. */ private static LinkInfo[] justTemporarilyRedirecteds; /** * list of files to leave as is, even though they may fail in Xenu or would in BrokenLinks */ private static ArrayList leaves; /** * list of files presumed good, even though they may fail in Xenu or would in BrokenLinks */ private static ArrayList presumedGoods; /** * convert URL to HTML link * * @param url url starting with http: https: or file: * @param cssClass css class of the link, null for none, e.g. "broken" * * @return HTML to link and display the URL */ private static String buildLink( String url, String cssClass ) { final boolean local = url.startsWith( Config.localWebsiteURL ); final FastCat sb = new FastCat( 14 ); sb.append( "" ); sb.append( url.substring( Config.localWebsiteURL.length() - 1/* keep lead / */ ) ); } else { sb.append( "href=\"" ); sb.append( url ); sb.append( "\">" ); sb.append( url ); } sb.append( "" ); return sb.toString(); } /** * chase to find where redirected, and export to CSV file, either permanent or temporary. * redirectsFile file where output will be written in CSV format. * This common code handles both temporary and permanent redirects. * * @param links array info about links to be chased * @param redirectsFile where to extract info about redirects too. * * @throws IOException if have trouble exporting redirected links. */ private static void chaseAndExportAnyRedirecteds( final LinkInfo[] links, final File redirectsFile ) throws IOException { // O P E N // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim final CSVWriter w = new CSVWriter( EIO.getPrintWriter( redirectsFile, 32 * 1024, EIO.UTF8 ), 0 /* minimal */, ',', '\"', '#', true ); final BlockingQueue queue = new LinkedBlockingQueue<>( RedirectProbe.TASK_POOL_SIZE ); final ThreadPoolExecutor pool = new ThreadPoolExecutor( RedirectProbe.THREAD_POOL_SIZE, RedirectProbe.THREAD_POOL_SIZE, 3, TimeUnit.SECONDS, queue, new RejectedRedirectProbeHandler() ); // shuffle so won't hammer any one site repeatedly. We sort results after the chaos of the 30 threads. Shuffle.shuffle( links ); for ( LinkInfo b : links ) { synchronized ( b ) { // links not redirected are handled in exportToHTML. // we do not permanently record the newLocation other that in the exported CSV file. final String originalLocation = b.getTo(); URL originalURL; try { originalURL = new URL( originalLocation ); } catch ( MalformedURLException e ) { // so screwed up, could not even parse url. out.println( "\n<><>Warning<><> malformed URL\n " + b + "\n" ); continue; // ignore it } final String protocol = originalURL.getProtocol().toLowerCase(); if ( protocol.equals( "http" ) || protocol.equals( "https" ) ) { final String host = originalURL.getHost(); if ( host == null || host.length() == 0 ) { out.println( "\n<><>Warning<><> no host in URL \n " + b + "\n" ); continue; } pool.execute( new RedirectProbe( originalURL, b, w ) ); } else { // not http: https: so ignore it } } } // end loop out.println( "all redirect probes submitted" ); pool.shutdown(); try { pool.awaitTermination( 5, TimeUnit.MINUTES ); } catch ( InterruptedException e ) { } // C L O S E w.close(); spacedTitle( "sorting and aligning redirects..." ); new CSVSort( redirectsFile, new int[] { 0, 1 }, // sort by original, new new char[] { 's', 's' }, // case-sensitive string new boolean[] { true, true }, // ascending ',', '\"', '#', CSV.UTF8 ); new CSVAlign( redirectsFile, ',', '\"', '#', false /* verbose */, CSV.UTF8 ); } /** * chase to find where redirected, and export to CSV file, either permanent or temporary. * redirectsFile file where output will be written in CSV format. * This common code handles both temporary and permanent redirects. * Does not use multiple threads * * @param links array info about links to be chased * @param redirectsFile where to extract info about redirects too. * * @throws IOException if have trouble exporting redirected links. */ private static void chaseAndExportAnyRedirectedsWithoutThreads( final LinkInfo[] links, final File redirectsFile ) throws IOException { // O P E N // writer, quoteLevel, separatorChar, quoteChar, commentChar, trim final CSVWriter w = new CSVWriter( EIO.getPrintWriter( redirectsFile, 32 * 1024, EIO.UTF8 ), 0 /* minimal */, ',', '\"', '#', true ); // shuffle so won't hammer any one site repeatedly. We sort results later Shuffle.shuffle( links ); for ( LinkInfo b : links ) { synchronized ( b ) { // links not redirected are handled in exportToHTML. // we do not permanently record the newLocation other that in the exported CSV file. final String originalLocation = b.getTo(); URL originalURL; try { originalURL = new URL( originalLocation ); } catch ( MalformedURLException e ) { // so screwed up, could not even parse url. out.println( "\n<><>Warning<><> malformed URL\n " + b + "\n" ); continue; // ignore it } final String protocol = originalURL.getProtocol().toLowerCase(); if ( protocol.equals( "http" ) || protocol.equals( "https" ) ) { final String host = originalURL.getHost(); if ( host == null || host.length() == 0 ) { out.println( "\n<><>Warning<><> no host in URL \n " + b + "\n" ); continue; } new RedirectProbe( originalURL, b, w ).run(); } else { // not http: https: so ignore it } } } // end loop out.println( "all redirect probes completed" ); // C L O S E w.close(); spacedTitle( "sorting and aligning redirects..." ); new CSVSort( redirectsFile, new int[] { 0, 1 }, // sort by original, new new char[] { 's', 's' }, // case-sensitive string new boolean[] { true, true }, // ascending ',', '\"', '#', CSV.UTF8 ); new CSVAlign( redirectsFile, ',', '\"', '#', false /* verbose */, CSV.UTF8 ); } /** * Export temporary redirects as a CSV file. * * @throws java.io.IOException if trouble writing. */ private static void chaseAndExportPermanentlyRedirecteds() throws IOException { spacedTitle( "chasing and exporting permanent redirects..." ); if ( DEBUGGING ) { chaseAndExportAnyRedirectedsWithoutThreads( justPermanentlyRedirecteds, Config.permanentRedirectsCSVFile ); } else { chaseAndExportAnyRedirecteds( justPermanentlyRedirecteds, Config.permanentRedirectsCSVFile ); } } /** * Export permanent redirects as a CSV file. * * @throws java.io.IOException if trouble writing. */ private static void chaseAndExportTemporarilyRedirecteds() throws IOException { spacedTitle( "chasing and exporting temporary redirects..." ); if ( DEBUGGING ) { chaseAndExportAnyRedirectedsWithoutThreads( justTemporarilyRedirecteds, Config.temporaryRedirectsCSVFile ); } else { chaseAndExportAnyRedirecteds( justTemporarilyRedirecteds, Config.temporaryRedirectsCSVFile ); } } /** * Count broken links as of this instant. * * @return count of links that are broken right now. */ private static int countNowBrokenLinks() { if ( Config.DEBUGGING ) { spacedTitle( "counting links broken right this moment..." ); } // remove links no longer is use int nowBrokenLinks = 0; for ( LinkInfo b : history.values() ) { if ( b.isNowBroken() ) { nowBrokenLinks++; } } return nowBrokenLinks; } /** * Count permanently redirected links * * @return count of permanently redirected links. */ private static int countPermanentlyRedirected() { if ( Config.DEBUGGING ) { spacedTitle( "counting permanently redirectedlinks..." ); } int permanentlyRedirected = 0; for ( LinkInfo b : history.values() ) { if ( b.isPermanentlyRedirected() ) { permanentlyRedirected++; } } return permanentlyRedirected; } /** * Count broken links that have been broken for at least forgiveness days. * * @return count of broken links. */ private static int countSolidlyBrokenLinks() { if ( Config.DEBUGGING ) { spacedTitle( "counting solidly broken links..." ); } int solidlyBrokenCount = 0; for ( LinkInfo b : history.values() ) { if ( b.isSolidlyBroken() ) { solidlyBrokenCount++; } } return solidlyBrokenCount; } /** * Count temporarily redirected links * * @return count of temporarily redirected links. */ private static int countTemporarilyRedirected() { if ( Config.DEBUGGING ) { spacedTitle( "counting temporarily redirectedlinks..." ); } int temporarilyRedirected = 0; for ( LinkInfo b : history.values() ) { if ( b.isTemporarilyRedirected() ) { temporarilyRedirected++; } } return temporarilyRedirected; } /** * display statistics * * @param emit PrintStream where to print the stats * @param xenuNowBrokenCount how many broken links Xenu recently found. * @param nowBrokenCount how many broken links left of those Xenu found after a reprobe. * @param solidlyBrokenCount how many broken links there for at least brokenForgiveness days. * @param permanentlyRedirectedCount how many links were redirected permanently exported for auto-correct. * @param temporarilyRedirectedCount how many links were redirected temporarily exported for auto-correct. */ private static void displayStats( PrintStream emit, int xenuNowBrokenCount, int nowBrokenCount, int solidlyBrokenCount, int permanentlyRedirectedCount, int temporarilyRedirectedCount ) { emit.println(); emit.println( " " + xenuNowBrokenCount + " links are broken right now according to Xenu." ); emit.println( " " + nowBrokenCount + " links are broken right now according to BrokenLinks." ); emit.println( " " + ( history.size() - solidlyBrokenCount ) + " good links being tracked." ); emit.println( " " + history.size() + " total links being tracked." ); emit.println( " " + leaves.size() + " links known bad that we are leaving as is for now." ); emit.println( " " + presumedGoods.size() + " links presumed good that Xenu fails." ); emit.println( ">>> " + solidlyBrokenCount + " links (excluding redirects) that have been broken for at least " + TimeUnit.MILLISECONDS.toDays( Config.brokenForgivenessMillis ) + " days." ); emit.println( ">>> " + permanentlyRedirectedCount + " links permanently redirected, " + "tracked and exported for auto-correct." ); emit.println( ">>> " + temporarilyRedirectedCount + " links temporarily redirected, " + "tracked and exported for auto-correct." ); emit.println(); } /** * export broken link as a csv file * * @throws java.io.FileNotFoundException if trouble saving the broken count in a file. * @throws java.io.IOException if trouble writing. */ private static void exportBrokenLinksToCsv() throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "exporting " + justSolidlyBrokens.length + " broken links to csv..." ); } // O P E N final CSVWriter w = new CSVWriter( EIO.getPrintWriter( Config.exportBrokenLinksToCSVFile, 32 * 1024, EIO.UTF8 ), 0 /* minimal */, ',', '\"', '#', true ); for ( LinkInfo b : justSolidlyBrokens ) { // suppress redirects, they are exported later for automatic repair // might be multiple froms. // W R I T E w.put( b.getStatus() ); // broken internal link may have file: or http:// prefix. Convert to webroot relative final String to = ST.chopLeadingString( b.getTo(), Config.localWebsiteURL ); w.put( to ); for ( String from : b.getFroms() ) { w.put( Config.toFileWebsitePrefix + from ); } w.nl(); } w.close(); } /** * export leaves HTML * * @param prw where to write */ private static void exportLeavesToHTML( final PrintWriter prw ) { // follow with leaves. prw.println( headForLeaves( leaves.size(), true ) ); if ( leaves.size() == 0 ) { prw.println( "" + "no leave as is links" + "" ); } else { for ( String leave : leaves ) { prw.println( "" + buildLink( leave, "failing" ) + "" ); } } prw.println( "" ); } /** * export presumed goods to HTML * * @param prw where to write */ private static void exportPresumedGoodsToHTML( final PrintWriter prw ) { // follow with good links. prw.println( headForPresumedGoods( presumedGoods.size(), true ) ); if ( presumedGoods.size() == 0 ) { prw.println( "" + "no presumed good links" + "" ); } else { for ( String good : presumedGoods ) { prw.println( "" + buildLink( good, "failing" ) + "" ); } } prw.println( "" ); } /** * export broken link to display as HTML on the website * * @throws java.io.FileNotFoundException if trouble saving the broken count in a file. * @throws java.io.IOException if trouble writing. */ private static void exportToHTML() throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "exporting " + justSolidlyBrokens.length + " broken links to html..." ); } // export the data as HTML // O P E N final PrintWriter prw = EIO.getPrintWriter( Config.brokenlinksHTMLFile, 10 * 1024, EIO.UTF8 ); extractBrokenLinksToHTML( prw ); exportLeavesToHTML( prw ); exportPresumedGoodsToHTML( prw ); // C L O S E prw.close(); if ( Config.DEBUGGING ) { spacedTitle( "correcting & to &..." ); } // fix ampersands in file converting to & as needed, suppress confusing output. Amper.ampifyFile( Config.brokenlinksHTMLFile, 0 ); Amper.ampifyFile( Config.brokenlinksHTMLFile, 0 ); } /** * export the broken links to HTML * * @param prw where to write */ private static void extractBrokenLinksToHTML( final PrintWriter prw ) { prw.println( headForBrokenLinks( justSolidlyBrokens.length, true ) ); if ( justSolidlyBrokens.length == 0 ) { prw.println( "" + "no broken links found" + "" ); } else { for ( LinkInfo b : justSolidlyBrokens ) { // suppress redirects, they are exported later for automatic repair // might be multiple froms. final List froms = b.getFroms(); final FastCat sb = new FastCat( froms.size() * 2 ); for ( String fromURL : froms ) { // reconstitute it back to a full URL. Build like will shrink it again. sb.append( buildLink( Config.localWebsiteURL + fromURL, "plain" ) ); // will apply appropriate // prefix as needed. sb.append( "
" ); } // chop the final
sb.drop(); final String formattedFroms = sb.toString(); // W R I T E final FastCat sb2 = new FastCat( 8 ); sb2.append( "" ); sb2.append( b.getTerseStatusMessage() ); sb2.append( "" ); sb2.append( buildLink( b.getTo(), "failing" ) ); sb2.append( " " ); sb2.append( formattedFroms ); sb2.append( "" ); sb2.append( "
" ); prw.println( sb2.toString() ); } } prw.println( "" ); } /** * extract permanently redirected from history. Probe already done. We just want subset to sort for chase and * export. * * @param permanentlyRedirectedCount count of how many permanent redirects there are */ private static void extractPermanentlyRedirecteds( int permanentlyRedirectedCount ) { if ( Config.DEBUGGING ) { spacedTitle( "extracting permanently redirected links..." ); } justPermanentlyRedirecteds = new LinkInfo[ permanentlyRedirectedCount ]; int i = 0; for ( LinkInfo b : history.values() ) { if ( b.isPermanentlyRedirected() ) { justPermanentlyRedirecteds[ i++ ] = b; } } } /** * extract just broken links from history. Probe already done. We just want subset to sort for export. * Don't include redirects. * * @param solidlyBrokenCount count of how many brokens there are */ private static void extractSolidlyBrokens( int solidlyBrokenCount ) { if ( Config.DEBUGGING ) { spacedTitle( "extracting broken links..." ); } justSolidlyBrokens = new LinkInfo[ solidlyBrokenCount ]; int i = 0; for ( LinkInfo b : history.values() ) { if ( b.isSolidlyBroken() ) { justSolidlyBrokens[ i++ ] = b; } } } /** * extract temporarily redirected from history. Probe already done. We just want subset to sort for chase and * export. * * @param temporarilyRedirectedCount count of how many temporary redirects there are */ private static void extractTemporarilyRedirecteds( int temporarilyRedirectedCount ) { if ( Config.DEBUGGING ) { spacedTitle( "extracting temporarily redirected links..." ); } justTemporarilyRedirecteds = new LinkInfo[ temporarilyRedirectedCount ]; int i = 0; for ( LinkInfo b : history.values() ) { if ( b.isTemporarilyRedirected() ) { justTemporarilyRedirecteds[ i++ ] = b; } } } /** * get header for broken links. * * @param solidlyBrokenCount how many broken links there were for at least forgiveness days * @param generateHTML true if should generate html * * @return html to go at the head of the broken links list */ private static String headForBrokenLinks( int solidlyBrokenCount, boolean generateHTML ) { if ( generateHTML ) { final FastCat sb = new FastCat( 23 ); sb.append( DO_NOT_EDIT ); sb.append( "

Broken Links Sorted by Error Code

\n" ); sb.append( "

There are " ); sb.append( solidlyBrokenCount ); sb.append( " links that have been broken for at least " ); sb.append( TimeUnit.MILLISECONDS.toDays( Config.brokenForgivenessMillis ) ); sb.append( " days yet to be fixed. Last revised: " ); sb.append( BigDate.localToday().toString() ); sb.append( "

\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); sb.append( "\n" ); return sb.toString(); } else { final FastCat sb = new FastCat( 8 ); sb.append( "B r o k e n L i n k s S o r t e d b y E r r o r C o d e\n\n" ); sb.append( "On " ); sb.append( BigDate.localToday().toString() ); sb.append( " there were " ); sb.append( solidlyBrokenCount ); sb.append( " links that have been broken for at least " ); sb.append( TimeUnit.MILLISECONDS.toDays( Config.brokenForgivenessMillis ) ); sb.append( " days." ); return sb.toString(); } } /** * header for presumed leaves * * @param leaveCount count of presumed good urls. * * @return generated HTML */ @SuppressWarnings( { "StringWithMistakes" } ) private static String headForLeaves( int leaveCount, boolean generateHTML ) { if ( generateHTML ) { final FastCat sb = new FastCat( 21 ); sb.append( DO_NOT_EDIT ); sb.append( "

Links to Leave As Is

\n" ); sb.append( "

The following links are known to be broken, but they are deliberately not being repaired " + "for " + "now.\n" ); sb.append( "

\n

" ); sb.append( "There are " ); sb.append( leaveCount ); sb.append( " links marked to be left as is. " + "Last revised: " ); sb.append( BigDate.localToday().toString() ); sb.append( "

\n" ); sb.append( "\n" ); sb.append( "
Broken Links by Status Code
Broken Links by Status Code
Status CodeLinks To
  Linked From
" + "\n" ); sb.append( "\n" ); sb.append( "\n" ); return sb.toString(); } else { final FastCat sb = new FastCat( 11 ); sb.append( "L i n k s T o L e a v e A s I s \n\n" ); sb.append( "The following links are known to be broken, but they are deliberately not being repaired for " + "now" + ".\n\n" ); sb.append( "On " ); sb.append( BigDate.localToday().toString() ); sb.append( " there were " ); sb.append( leaveCount ); sb.append( " links marked as leave as is for now.\n" ); return sb.toString(); } } /** * header for presumed goods * * @param presumedGoodCount count of presumed good urls. * * @return generated HTML */ @SuppressWarnings( { "StringWithMistakes" } ) private static String headForPresumedGoods( int presumedGoodCount, boolean generateHTML ) { if ( generateHTML ) { final FastCat sb = new FastCat( 21 ); sb.append( DO_NOT_EDIT ); sb.append( "

Links Presumed Good

\n" ); sb.append( "

Xenu claims the following links are broken, but they have been manually found to\n" ); sb.append( "be good. They should be manually rechecked from time to time. The problem may be\n" ); sb.append( "an unknown SSL certificate authority which needs to be OKed manually,\n" ); sb.append( "(a missing/unknown/uninstalled certificate root authority) or\n" ); sb.append( "it may be the website sends the data, but with not-found status.\n" ); sb.append( "

\n

" ); sb.append( "There are " ); sb.append( presumedGoodCount ); sb.append( " links marked as presumed good despite what Xenu says. " + "Last revised: " ); sb.append( BigDate.localToday().toString() ); sb.append( "

\n" ); sb.append( "\n" ); sb.append( "
Links to Leave As Is
Links to Leave As Is
Link To
" + "\n" ); sb.append( "\n" ); sb.append( "\n" ); return sb.toString(); } else { final FastCat sb = new FastCat( 11 ); sb.append( "L i n k s P r e s u m e d G o o d\n\n" ); sb.append( "Xenu claims the following links are broken, but they have been manually found to\n" ); sb.append( "be good. They should be manually rechecked from time to time. The problem may be\n" ); sb.append( "an unknown SSL certificate authority which needs to be OKed manually,\n" ); sb.append( "(a missing/unknown/uninstalled certificate root authority) or\n" ); sb.append( "it may be the website sends the data, but with not-found status.\n\n" ); sb.append( "On " ); sb.append( BigDate.localToday().toString() ); sb.append( " there were " ); sb.append( presumedGoodCount ); sb.append( " links marked as presumed good despite what Xenu says.\n" ); return sb.toString(); } } /** * import the leave records. Mark them as good no matter how they are marked now * * @return count of leaves imported. * @throws java.io.IOException if trouble reading presumed goods file */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static int importLeaves() throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "importing leaves..." ); } int leavesCount = 0; if ( !Config.leaveCSVFile.exists() ) { // file is optional return leavesCount; } final CSVReader in = new CSVReader( new BufferedReader( new FileReader( Config.leaveCSVFile ) ) ); try { while ( true ) { final String to = in.get(); assert to != null : "null to field in leaves file"; assert to.length() > 0 : "empty to field in leaves file"; // if leaves file has dups, take the last one. leavesCount++; // internally treat like presumed good history.put( to, new LinkInfo( BROKEN_LINKS_PRESUMED_GOOD, to ) ); in.skipToNextLine(); } } catch ( EOFException e ) { /* normal */ } finally { in.close(); } return leavesCount; } /** * import the presumedGood records. Mark them as good no matter how they are marked now * * @return count of presumed Good records imported. * @throws java.io.IOException if trouble reading presumed goods file */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static int importPresumedGoods() throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "importing presumed goods..." ); } int presumedGoodsCount = 0; if ( !Config.presumedGoodCSVFile.exists() ) { // file is optional return presumedGoodsCount; } final CSVReader in = new CSVReader( new BufferedReader( new FileReader( Config.presumedGoodCSVFile ) ) ); try { while ( true ) { final String to = in.get(); assert to != null : "null to field in presumed goods file"; assert to.length() > 0 : "empty to field in presumed goods file"; // if presumedGood file has dups, take the last one. presumedGoodsCount++; history.put( to, new LinkInfo( BROKEN_LINKS_PRESUMED_GOOD, to ) ); in.skipToNextLine(); } } catch ( EOFException e ) { /* normal */ } finally { in.close(); } return presumedGoodsCount; } /** * import CSV file of good and broken links from XENU * * @throws java.io.IOException if problem reading */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static void importXenuPageList() throws IOException { spacedTitle( "importing from Xenu..." ); // Use Xenu Export Page Report save csv file here. final File xenuImportCSVFile = Config.xenuPageCSVFile; verifyXenuListPageAge( xenuImportCSVFile ); final CSVReader in = new CSVReader( EIO.getBufferedReader( xenuImportCSVFile, 64 * 1024, EIO.UTF8 ), '\t' /* tab delimited */, ( char ) 0 /* effectively no quote char */, "", true /* hideComments */, true /* trimQuoted */, true /* trimUnquoted */, true /* allowMultiLineFields */ ); int line = 0; LinkInfo b = null; try { verifyXenuListPageSignature( in ); final Twirler twirler = new Twirler(); while ( true ) { // read tab-separated lines like this: // file:/E:/mindprod/feedback/feedback.html mailto:ross@bluedog.com -6 mail host ok // ross@bluedog.com 13.12.2008 00:09:02 Feedback // file:/E:/mindprod/jgloss/nohassle.html http://www.8844download.com/submit.htm no connection // 8844Download 01.01.2010 10:25:23 hassle-free PAD sites : Java Glossary final String from = in.get(); final String to = in.get(); final int status = in.getInt(); final int lc = in.lineCount(); assert from != null : "null from field from xenu at line " + lc; assert to != null : "null to field from xenu at line " + lc; assert from.length() > 0 : "empty from field from xenu at line " + lc; assert to.length() > 0 : "empty to field from xenu at line " + lc; // Toss jar contents. We do not deal with them. // Xenu gives them to us with malformed URLs. // e.g. file:///E:/mindprod/applet/submitter.jar/com/mindprod/submitter/SubmissionSite$195.class if ( !to.contains( ".jar/" ) ) { // we don't validate status message. We accept all manner of weird stuff. // history has both good and bad links. if ( history.size() >= MAXIMUM_HASHMAP_CAPACITY ) { err.println( "Java theoretical capacity of " + MAXIMUM_HASHMAP_CAPACITY + " HashMap elements " + "exceeded. Sorry. This site is just too big to handle all of a piece." ); System.exit( 5 ); } // Try to avoid hash lookup. see if b we already have will do. // if we are lucky b is already pointing to the history item we want. // if not we must look it up. if ( b == null || !b.getTo().equals( to ) ) { b = history.get( to ); } if ( b != null ) { // this url already on record. // it might be marked ignore by presumed goods or by ignore status from xenu // In either case should stay ignore. // modify an existing entry. if ( b.getStatusKind() != StatusKind.IGNORE ) { b.setStatus( status ); } b.addFroms( from ); } else { // add a new entry. Should only happen if we create new html file on the website. // might be a new ignore. b = new LinkInfo( status, to, from ); b.addFroms( from ); history.put( to, b ); } } // skip rest of fields. in.skipToNextLine(); line++; if ( line % 5000 == 0 ) { twirler.twirl( out ); } } // end while } catch ( EOFException e ) {/* normal loop exit */ // not same as linecount which counts comment lines too. out.println( "\n" + line + " records imported from Xenu" ); } finally { in.close(); } } /** * Prepare a text file summary of everything. * * @param xenuNowBrokenCount how many broken links Xenu recently found. * @param nowBrokenCount how many broken links left of those Xenu found after a reprobe. * @param solidlyBrokenCount how many broken links there for at least brokenForgiveness days. * @param permanentlyRedirectedCount how many links were redirected permanently exported for auto-correct. * @param temporarilyRedirectedCount how many links were redirected temporarily exported for auto-correct. * * @throws java.io.FileNotFoundException if trouble saving the broken count in a file. * @throws java.io.IOException if trouble writing. */ private static void prepareSummaryReportFile( int xenuNowBrokenCount, int nowBrokenCount, int solidlyBrokenCount, int permanentlyRedirectedCount, int temporarilyRedirectedCount ) throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "preparing summary report..." ); } // export the data as HTML // O P E N final FileOutputStream fos = new FileOutputStream( Config.summaryReportFile, false/* append */ ); final PrintStream ps = new PrintStream( fos, false/* auto flush on println */ ); ps.println( "Brokenlinks " + VERSION_STRING + " released: " + RELEASE_DATE ); displayStats( ps, xenuNowBrokenCount, nowBrokenCount, solidlyBrokenCount, permanentlyRedirectedCount, temporarilyRedirectedCount ); ps.println( headForBrokenLinks( justSolidlyBrokens.length, false /* text */ ) ); if ( justSolidlyBrokens.length == 0 ) { ps.println( "no broken links found" ); } else { for ( LinkInfo b : justSolidlyBrokens ) { ps.println(); ps.println( "to: " + b.getTo() + " status: " + b.getVerboseStatusMessage() ); for ( String from : b.getFroms() ) { // without file://localhost/mindprod ps.println( " from: " + from ); } } } ps.println(); // follow with good links. ps.println( headForLeaves( leaves.size(), false ) ); if ( leaves.size() == 0 ) { ps.println( "no leave as is links" ); } else { for ( String leave : leaves ) { ps.println( "leave: " + leave ); } } // follow with good links. ps.println( headForPresumedGoods( presumedGoods.size(), false ) ); if ( presumedGoods.size() == 0 ) { ps.println( "no presumed good links" ); } else { for ( String good : presumedGoods ) { ps.println( "presumed good: " + good ); } } ps.println(); // C L O S E ps.close(); } /** * prune and sort list of leave as is * * @param leavesCount roughly how many leave records there are * * @throws java.io.IOException if trouble reading presumed good file */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static void pruneAndSortLeaves( int leavesCount ) throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "pruning leave as is links..." ); } leaves = new ArrayList<>( leavesCount ); if ( leavesCount == 0 ) { return; } final CSVReader in = new CSVReader( new BufferedReader( new FileReader( Config.leaveCSVFile ) ) ); try { while ( true ) { final String to = in.get(); if ( history.containsKey( to ) ) { leaves.add( to ); } else { out.println( "leaves record no longer needed: " + to ); } in.skipToNextLine(); } } catch ( EOFException e ) {/* normal */ } finally { in.close(); } if ( Config.DEBUGGING ) { spacedTitle( "sorting presumed leave as is links..." ); } Collections.sort( leaves ); } /** * prune and sort list of presumed Good links * * @param presumedGoodsCount how many presumedGoods records there are * * @throws java.io.IOException if trouble reading presumed good file */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static void pruneAndSortPresumedGoods( int presumedGoodsCount ) throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "pruning presumed good links..." ); } /* collect goods that are still relevant */ // presume 40 chars per entry, smaller number gives more conservative estimate. presumedGoods = new ArrayList<>( presumedGoodsCount ); if ( presumedGoodsCount == 0 ) { return; } final CSVReader in = new CSVReader( new BufferedReader( new FileReader( Config.presumedGoodCSVFile ) ) ); try { while ( true ) { final String to = in.get(); if ( history.containsKey( to ) ) { presumedGoods.add( to ); } else { out.println( "presumed good record no longer needed: " + to ); } in.skipToNextLine(); } } catch ( EOFException e ) {/* normal */ } finally { in.close(); } if ( Config.DEBUGGING ) { spacedTitle( "sorting presumed good links..." ); } Collections.sort( presumedGoods ); } /** * remove dead links, ones not mentioned on website any more. */ private static void removeDeadLinks() { if ( Config.DEBUGGING ) { spacedTitle( "dropping dead links from history..." ); } // remove links no longer in use. Leave as iterator because we want to use Iterator.remove // items without from references or links marked ignore. // removes from backing history HashMap // avoids ConcurrentModificationException history.values().removeIf( LinkInfo::isDead ); } /** * reprobe suspect links. Xenu gives false brokens. We use a more thorough slower test. */ private static void reprobeSuspectLinks( final boolean multithread ) { spacedTitle( "reprobing suspect links..." ); // five passes: // pass 0 = non SSL // pass 1 = SSL, no-SNI (broken->SSL with-SNI, leave bad-timestamp) // pass 2 = SSL, with-SNI (broken->SSL no-SNI, leave bad-timestamp) // pass 3 = SSL, without-SnI (broken->SSL with-SNI) // pass 4 = SSL with SNI (broken->SSL no-SNI) for ( int pass = 0; pass < 5; pass++ ) { out.println( "starting pass " + pass ); int linksForPass = 0; final PC pc = new PC( pass ); final BlockingQueue queue; final ThreadPoolExecutor pool; if ( multithread ) { queue = new LinkedBlockingQueue<>( LinkProbe.TASK_POOL_SIZE ); pool = new ThreadPoolExecutor( LinkProbe.THREAD_POOL_SIZE, LinkProbe.THREAD_POOL_SIZE, 3, TimeUnit.SECONDS, queue, new RejectedLinkProbeHandler() ); } else { queue = null; pool = null; } // from HashMap so essentially in scrambled order. Avoids hammering same site over and over. for ( LinkInfo b : history.values() ) { if ( b.isSuspect() ) { b.setShouldWeUpdateBadTimestamp( pc.shouldWeUpdateBadTimestamp ); // only update on second try try { final URL url = new URL( b.getTo() ); final String protocol = url.getProtocol().toLowerCase(); switch ( protocol ) { case "http": b.setUseSNI( false ); if ( pc.probeNonSSL ) { final String host = url.getHost(); if ( host == null || host.length() == 0 ) { throw new MalformedURLException(); } linksForPass++; // howProbe will be 0, 1, 2 if ( multithread ) { pool.execute( new LinkProbe( url, b ) ); } else { new LinkProbe( url, b ).run(); } } break; case "https": if ( pc.probeSNI == b.useSNI() ) { final String host = url.getHost(); if ( host == null || host.length() == 0 ) { throw new MalformedURLException(); } linksForPass++; // howProbe will be 0, 1, 2 if ( multithread ) { pool.execute( new LinkProbe( url, b ) ); } else { new LinkProbe( url, b ).run(); } } break; case "mailto": b.setStatus( BROKEN_LINKS_NOT_TESTED ); break; case "file": // was file: b.setHowProbe( 4 ); linksForPass++; if ( multithread ) { pool.execute( new LinkProbe( url, b ) ); } else { new LinkProbe( url, b ).run(); } break; default: if ( pc.probeNonSSL ) { // was something else. b.setHowProbe( 3 ); linksForPass++; if ( multithread ) { pool.execute( new LinkProbe( url, b ) ); // execute will call run to probe } else { new LinkProbe( url, b ).run(); } } break; } } catch ( MalformedURLException e ) { // so screwed up, could not even parse url. b.setStatus( BROKEN_LINKS_MALFORMED_URL ); err.println( "\n<><>Error<><> reprobe: " + b.getVerboseStatusMessage() + "\n " + b + "\n" ); } catch ( IllegalArgumentException e ) { // so screwed up, could not even parse url. b.setStatus( BROKEN_LINKS_EMPTY_URL ); err.println( "\n<><>Error<><> reprobe: " + b.getVerboseStatusMessage( e.getMessage() ) + "\n " + b + "\n" ); } } } // end for linkInfo out.println( "all link probes submitted for pass " + pass ); if ( multithread ) { pool.shutdown(); try { pool.awaitTermination( 5, TimeUnit.MINUTES ); } catch ( InterruptedException e ) { } } out.println( "all " + linksForPass + " link probes completed for pass " + pass ); } // end for pass } /** * restore serialised history file */ @SuppressWarnings( { "InfiniteLoopStatement" } ) private static void restoreHistory() { if ( Config.DEBUGGING ) { spacedTitle( "restoring history..." ); } try { // O P E N final File historyStoreFile = Config.historyBinFile; if ( !historyStoreFile.canRead() ) { throw new IOException( "Can't read History file" ); } DataInputStream dis = EIO.getDataInputStream( historyStoreFile, 32 * 1024 ); int fileFormatVersion = dis.readInt(); final boolean useNewFormat = fileFormatVersion == HISTORY_FILE_FORMAT_VERSION; // get the count of history records off the front of the file // add room for 20% growth. int historyCapacity = Math.min( dis.readInt() * 140 / 100, MAXIMUM_HASHMAP_CAPACITY ); history = new HashMap<>( historyCapacity ); try { // R E A D while ( true ) { if ( useNewFormat ) { final LinkInfo b = LinkInfo.read( dis ); // trust Xenu not to create dups. IF it does, just last one. history.put( b.getTo(), b ); } else { final LinkInfo b = LinkInfo.oldRead( dis ); // trust Xenu not to create dups. IF it does, just last one. history.put( b.getTo(), b ); } } } catch ( EOFException e ) { // C L O S E dis.close(); out.println(); } } catch ( Exception e ) { // Anything at all goes wrong, start over. // Could be ClassNotFoundException or other serialisation mismatch problem. // Size will self correct on next run. err.println( "Warning: Unable to read history file. Starting over." + e.getMessage() ); // length will be 0 if file missing. final int historyCapacity = ( int ) ( Math.min( Config.xenuPageCSVFile.length() / 166, MAXIMUM_HASHMAP_CAPACITY ) ); history = new HashMap<>( historyCapacity ); } } /** * save our history of good and bad links in a binary. * We don't use Serialisation since it gets a heap overflow, * though it would result in a more compact file. * * @throws java.io.IOException if trouble writing. */ private static void saveHistory() throws IOException { if ( Config.DEBUGGING ) { spacedTitle( "saving " + history.size() + " history records..." ); } // O P E N FileOutputStream fos = new FileOutputStream( Config.historyBinFile, false/* append */ ); BufferedOutputStream bos = new BufferedOutputStream( fos, 4 * 1024/* buffsize in bytes */ ); DataOutputStream dos = new DataOutputStream( bos ); // put out the version number so we can potentially change format later and recognise old files. dos.writeInt( HISTORY_FILE_FORMAT_VERSION ); // put the count of how many history records there are on the front dos.writeInt( history.size() ); for ( LinkInfo b : history.values() ) { // W R I T E if ( b.getStatusKind() != StatusKind.IGNORE ) { // filter out ignores so next time ignores dropped by PresumedGoods or Xenu will not hang around. b.write( dos ); // both good and bad, but no need to keep ignores. } } // C L O S E dos.close(); } /** * sort the froms within each Broken in justSolidlyBrokens into alpha order, and justSolidlyBrokens by status/to. */ private static void sortJustSolidlyBrokens() { if ( Config.DEBUGGING ) { spacedTitle( "sorting solidly broken links..." ); } for ( LinkInfo b : justSolidlyBrokens ) { b.sort(); } // sort JustBrokens in order convenient for HTML display by statusMessage and to Arrays.sort( justSolidlyBrokens ); } /** * display a title with spaces between all the letter to make it stand out * * @param title string to display on console expanded. */ static void spacedTitle( String title ) { // add a space after each letter, Best done with StringBuilder rather than FastCat StringBuilder sb = new StringBuilder( title.length() * 2 + 2 ); sb.append( '\n' ); for ( int i = 0; i < title.length(); i++ ) { sb.append( title.charAt( i ) ); sb.append( ' ' ); } // don't sweat the trailing space sb.append( '\n' ); out.println( sb.toString() ); } /** * verify that the xenu file is present and a of reasonable age. * * @param xenuImportCSVFile xenupage.csv file. */ private static void verifyXenuListPageAge( final File xenuImportCSVFile ) { if ( xenuImportCSVFile.canRead() ) { final long xenuTimeStamp = xenuImportCSVFile.lastModified(); LinkInfo.setTime( xenuTimeStamp ); out.println( "Xenu file " + EIO.getCanOrAbsPath( xenuImportCSVFile ) + " was created " + Age.ageIn( xenuTimeStamp ) + " ago." ); } else { err.println( "\n<><>Fatal Error<><> Xenu file " + EIO.getCanOrAbsPath( xenuImportCSVFile ) + " is missing, " + "locked or unreadable." ); System.exit( 2 ); } } /** * check the first 3 fields of the file to be sure we have the correct file. * We leave file positioned at start of second line. * * @param in CSVReader to read first line of file. * * @throws IOException if some problem reading Xenu export. */ private static void verifyXenuListPageSignature( final CSVReader in ) throws IOException { // we should see this pattern, tab separated as the first line of the file // OriginPage LinkToPage LinkToPageStatusCode LinkToPageStatusText final String signature = in.get(); if ( signature.length() > 12 ) { err.println( "<><>Fatal error<><> In Xenu, you accidentally clicked \"Save As\", " + "rather than \"Export Page Map to TAB-separated File\". To recover, rerun Xenu." ); System.exit( 1 ); } if ( signature.equals( "Address" ) ) { err.println( "\n<><>Fatal error<><> In Xenu, you accidentally clicked \"Export to TAB-separated File\", " + "rather than \"Export Page Map to TAB-separated File\". To recover, rerun Xenu." ); System.exit( 1 ); } if ( !signature.equals( "OriginPage" ) ) { err.println( "\n<><>Fatal error<><> " + Config.xenuPageCSVFile + " does not contain Xenu\'s \"Export " + "Page " + "Map to TAB-separated File\". To recover, rerun Xenu." ); System.exit( 1 ); } // we should see this pattern, tab separated as the first line of the file // OriginPage LinkToPage LinkToPageStatusCode LinkToPageStatusText in.skip( 1 ); final String signature2 = in.get(); if ( signature2.equals( "LinkToPageStatusText" ) ) { err.println( "\n<><>Fatal error<><> You used an obsolete version of Xenu to spider. Download and install\n" + "the proper version of Xenu from http://home.snafu.de/tilman/tmp/xenubeta.zip" ); System.exit( 1 ); } if ( !signature2.equals( "LinkToPageStatusCode" ) ) { err.println( "\n<><>Fatal error<><> " + Config.xenuPageCSVFile + " corrupt." ); err.println( "Possibly you used an obsolete version of Xenu to spider. Download and install\n" + "the proper version of Xenu from http://home.snafu.de/tilman/tmp/xenubeta.zip" ); System.exit( 1 ); } // skip over rest of title line in.skipToNextLine(); } /** * Back end to process Xenu output. * Must first manually export from Xenu the xenu page links to * current directory, e.g. E:\env\bl\xenupage.csv * brokenlinks.properties must be in the current directory. * When done results are found in the current directory in various files.. * * @param args not used. */ public static void main( String[] args ) { try { spacedTitle( "Brokenlinks " + VERSION_STRING + " released: " + RELEASE_DATE ); // if no properties file specified on command line, use brokenlinks.properties. Config.getConfiguration(); Config.echoConfiguration(); restoreHistory(); LinkInfo.setTime( System.currentTimeMillis() ); int leaveCount = importLeaves(); int presumedGoodCount = importPresumedGoods(); importXenuPageList(); int xenuNowBrokenCount = countNowBrokenLinks(); LinkInfo.setTime( System.currentTimeMillis() ); removeDeadLinks(); // remove links without froms, but not items with status IGNORE CookieHandler.setDefault( new CookieManager( null /* in ram store */, CookiePolicy.ACCEPT_ALL ) ); LinkInfo.setTime( System.currentTimeMillis() ); // probe broken links, then handle redirects discovered in another pass reprobeSuspectLinks( !DEBUGGING /* false=single thread */ ); int nowBrokenCount = countNowBrokenLinks(); int solidlyBrokenCount = countSolidlyBrokenLinks(); extractSolidlyBrokens( solidlyBrokenCount ); sortJustSolidlyBrokens(); pruneAndSortLeaves( leaveCount ); pruneAndSortPresumedGoods( presumedGoodCount ); // export broken to csv and HTML to display on website for // research. Smaller subset of suspect links. // plus presumed good links. exportBrokenLinksToCsv(); exportToHTML(); // export permanent redirects int permanentlyRedirectedCount = countPermanentlyRedirected(); extractPermanentlyRedirecteds( permanentlyRedirectedCount ); chaseAndExportPermanentlyRedirecteds(); justPermanentlyRedirecteds = null; // free RAM // status of some entries may have changed. Count from original history. permanentlyRedirectedCount = countPermanentlyRedirected(); // export temporary redirects int temporarilyRedirectedCount = countTemporarilyRedirected(); extractTemporarilyRedirecteds( temporarilyRedirectedCount ); chaseAndExportTemporarilyRedirecteds(); justTemporarilyRedirecteds = null; // free RAM // status of some entries may have changed. Count from original history. temporarilyRedirectedCount = countTemporarilyRedirected(); displayStats( out, xenuNowBrokenCount, nowBrokenCount, solidlyBrokenCount, permanentlyRedirectedCount, temporarilyRedirectedCount ); prepareSummaryReportFile( xenuNowBrokenCount, nowBrokenCount, solidlyBrokenCount, permanentlyRedirectedCount, temporarilyRedirectedCount ); justSolidlyBrokens = null; saveHistory(); spacedTitle( "done" ); } catch ( IOException e ) { err.println(); e.printStackTrace( err ); err.println( "\n<><>Fatal error<><> problems reading/writing files" ); err.println(); System.exit( 1 ); } } }
Links Presumed Good
Links Presumed Good
Link To