/*
 * [RedirectProbe.java]
 *
 * Summary: For thread to probe one site URL to see how it redirects.
 *
 * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2012-02-28 initial version
 */
package com.mindprod.brokenlinks;

import com.mindprod.csv.CSVWriter;
import com.mindprod.http.Chase;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.Semaphore;

import static java.lang.System.*;

/**
 * For thread to probe one site URL to see how it redirects.
 * <p>
 * Constructing a probe takes one permit from a shared semaphore (blocking the constructing
 * thread if none is free); {@link #run()} hands that permit back when the probe finishes.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2012-02-26 initial version
 * @since 2012-02-28
 */
class RedirectProbe implements Runnable {
    /**
     * debugging turns off threads. When true, no semaphore permit is acquired or released.
     */
    static final boolean DEBUGGING = false;

    /**
     * number of probes to queue up ready to be executed
     */
    static final int TASK_POOL_SIZE = 32;

    /**
     * number of threads in pool to do store probes
     */
    static final int THREAD_POOL_SIZE = 30;

    /**
     * throttles how many probes in queue or executing
     */
    private static final Semaphore semaphore = new Semaphore( TASK_POOL_SIZE );

    /**
     * URL to probe
     */
    final URL originalURL;

    /**
     * CSVWriter where the discovered redirect is logged
     */
    private final CSVWriter w;

    /**
     * LinkInfo where info about this link is stored, where we record status of link
     */
    private final LinkInfo b;

    /**
     * true if the constructor actually obtained a semaphore permit that run() must give back.
     * Stays false when DEBUGGING, or when the acquire was interrupted.
     */
    private final boolean permitHeld;

    /**
     * to probe one url at one site, constructs packet later run() executed.
     * Blocks until a semaphore permit is available, unless DEBUGGING.
     * If the wait is interrupted, the probe is still constructed, but without a permit,
     * and the thread's interrupt flag is restored for the caller to see.
     *
     * @param originalURL URL to probe
     * @param b           LinkInfo where info about this link is stored, where we record status of link
     * @param w           CSVWriter where to log this transaction.
     */
    RedirectProbe( final URL originalURL, final LinkInfo b, final CSVWriter w ) {
        this.originalURL = originalURL;
        this.b = b;
        this.w = w;
        // the pool will not block when full, so we do it manually.
        boolean acquired = false;
        if ( !DEBUGGING ) {
            try {
                // will block until a permit is free
                semaphore.acquire();
                acquired = true;
            } catch ( InterruptedException e ) {
                // We did not get a permit. Remember that, so run() does not release one it never took,
                // and restore the interrupt flag rather than silently eating it.
                Thread.currentThread().interrupt();
            }
        }
        this.permitHeld = acquired;
    }

    /**
     * export the redirect
     *
     * @param w                CSVWriter to export to
     * @param originalLocation original URL as String
     * @param newLocation      redirected URL as String
     * @param b                LinkInfo object where we track this URL
     */
    private static void exportRedirect( CSVWriter w, String originalLocation, String newLocation, LinkInfo b ) {
        // E C H O on console in one atomic burp.
        out.println( " redirect " + originalLocation + "\n --> " + newLocation + "\n" );
        // W R I T E
        // On one line, the old and new URLs and the pages where they occur. If occurs multiple times on a page,
        // will be mentioned only once. Only redirects we could determine, others are left,
        // hopefully fixed on subsequent runs.
        synchronized ( w ) {
            w.put( originalLocation );
            w.put( newLocation );
            // b already locked by caller
            for ( String from : b.getFroms() ) {
                w.put( Config.toFileWebsitePrefix + from );
            }
            w.nl();
        }
    }

    /**
     * Render a URL as a String, dropping an explicit port number when it is merely the
     * default port for an http or https URL (e.g. http://example.com:80/x becomes http://example.com/x).
     * A non-default port (such as :80 on an https URL) is significant and is left alone.
     * Only the port at the tail of the authority is removed, never a look-alike elsewhere in the URL.
     *
     * @param url URL to render
     *
     * @return url as a String, without a redundant default port.
     */
    private static String withoutDefaultPort( final URL url ) {
        final String text = url.toString();
        final String protocol = url.getProtocol();
        final boolean web = "http".equalsIgnoreCase( protocol ) || "https".equalsIgnoreCase( protocol );
        final int port = url.getPort();
        if ( !web || port == -1 || port != url.getDefaultPort() ) {
            // no explicit port, or the port matters.
            return text;
        }
        final String authority = url.getAuthority();
        final String portSuffix = ":" + port;
        // authority sits right after the "//" that follows the protocol
        final int authorityStart = text.indexOf( "//" ) + 2;
        if ( authority == null
             || !authority.endsWith( portSuffix )
             || !text.startsWith( authority, authorityStart ) ) {
            // port spelled unusually, e.g. :080, or layout not as expected. Leave the URL untouched.
            return text;
        }
        final int portStart = authorityStart + authority.length() - portSuffix.length();
        return text.substring( 0, portStart ) + text.substring( portStart + portSuffix.length() );
    }

    /**
     * method to run on separate thread. Probes to chase redirect.
     * If the site is still redirecting to a usable, different location, exports the old/new pair.
     * Always gives back the semaphore permit taken by the constructor, if one was taken.
     */
    @Override
    public void run() {
        try {
            final String originalLocation = originalURL.toString();
            // We find the new URL from the Location field of response header.
            // We just resolve one leg of a redirect chain. Otherwise could get confused by mixture of temp & permanent
            final Chase chase = new Chase();
            final String relativeNewLocation = chase.send( originalURL );
            final int status = chase.getResponseCode(); // e.g. 301 still redirecting
            final StatusKind statusKind = StatusKind.categoriseStatus( status );
            switch ( statusKind ) {
                case TEMP_REDIRECT:
                case PERM_REDIRECT:
                    break;
                case BAD:
                case GOOD:
                case IGNORE:
                case UNKNOWN:
                    // not redirected anymore
                    return;
            }
            // we don't lock b until the Chase is finished. Rest should be relatively quick.
            synchronized ( b ) {
                // we don't change our records. We just export old/new and let user decide if should apply change.
                if ( relativeNewLocation == null ) {
                    err.println( "\n<><>Warning<><> redirect missing target (" + status + ")\n " + b + "\n" );
                    return;
                }
                String newLocation;
                if ( relativeNewLocation.startsWith( "http://" ) || relativeNewLocation.startsWith( "https://" ) ) {
                    // replace old completely with new
                    newLocation = relativeNewLocation;
                    if ( newLocation.equals( originalLocation ) ) {
                        err.println( "\n<><>Warning<><> following link redirected to itself, "
                                     + "possible partial redirect, "
                                     + "(" + status + ")\n " + b + "\n" );
                        return; // ignore it
                    }
                    // otherwise newLocation is the correct redirect
                } else {
                    // merge old and new to make newLocation fully qualified.
                    try {
                        // chop out nugatory default port, if any.
                        newLocation = withoutDefaultPort( new URL( originalURL, relativeNewLocation ) );
                        // at this point newLocation is the correct redirect
                        // could also have used URI.resolve.
                    } catch ( MalformedURLException e ) {
                        // merge failed
                        err.println( "\n<><>Warning<><> redirect [" + originalLocation + "] + ["
                                     + relativeNewLocation + "] failed to merge (" + status + ")\n " + b + "\n" );
                        return; // ignore it
                    }
                    if ( newLocation.equals( originalLocation ) ) {
                        err.println( "\n<><>Warning<><> following link redirected to itself (" + status + ")\n "
                                     + b + "\n" );
                        return; // ignore it
                    }
                }
                // passed all tests. Keep the redirect.
                exportRedirect( w, originalLocation, newLocation, b );
            }
        } finally {
            // give back only a permit we actually hold, so the throttle count stays accurate.
            if ( permitHeld ) {
                semaphore.release();
            }
        }
    }
}