/* * [FindParked.java] * * Summary: Probes a list of websites to see which are sitting on parking sites. * * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2012-05-06 initial version * 1.1 2012-05-08 suppress some false alarms * 1.2 2012-05-15 add text-link-ads.com * 1.3 2012-11-21 convert to new format Xenu files with more accurate status. * 1.4 2014-06-25 easier to read output * 1.5 2014-07-09 now also searches text in embedded frames for parked markers * 1.6 2014-07-12 handles JavaScript redirects, and iframes. Add to lists of parking domains. * 1.7 2016-02-26 now supports a list of execptions, that look like they are parked, but are not. */ package com.mindprod.findparked; import com.mindprod.common18.EIO; import com.mindprod.csv.CSVReader; import java.io.BufferedReader; import java.io.EOFException; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import static java.lang.System.*; /** * Probes a list of websites to see which are sitting on parking sites. * * @author Roedy Green, Canadian Mind Products * @version 1.7 2016-02-26 now supports a list of execptions, that look like they are parked, but are not. * @see com.mindprod.findparked.ParkedProbe * @since 2012-05-06 */ public class FindParked { private static final int FIRST_COPYRIGHT_YEAR = 2012; /** * undisplayed copyright notice */ private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; /** * date this version was released. */ private static final String RELEASE_DATE = "2016-02-26"; /** * how to use the command line */ private static final String USAGE = "\nFindParked needs a list of files containing the URLS to check."; /** * embedded version string. */ private static final String VERSION_STRING = "1.7"; private static boolean DEBUGGING = false; /** * Collect all URLs from files on command line * * @param args list of files from the command line * * @return ArrayList of all the URLs to probe. */ private static ArrayList collectAllURLsToTest( String[] args ) throws IOException { out.println( "collecting all URLs to process..." ); final ArrayList all = new ArrayList<>( 2000 ); for ( String arg : args ) { final CSVReader r = new CSVReader( new FileReader( arg ) ); String urlString = null; String prev = null; try { while ( true ) { // capture raw with lead http://www urlString = r.get().toLowerCase(); r.skipToNextLine(); all.add( urlString ); prev = urlString; } } catch ( EOFException e ) { r.close(); } } // end for return all; } /** * remove duplicates * * @param all all URLs found so far, including dups, in alpha order. * @param quiet true if should suppress reporting duplicates * * @return list without duplicates */ private static ArrayList dedup( ArrayList all, boolean quiet ) { out.println( "deduping " + all.size() + " URLs..." ); final ArrayList deduped = new ArrayList<>( all.size() ); String prev = null; for ( String u : all ) { if ( !u.equals( prev ) ) { deduped.add( u ); prev = u; } else { if ( !quiet ) { out.println( "dropping duplicate: " + u ); } } } return deduped; } /** * Find parked sites in given list of urls. * * @param files files containing lists of urls to test * @param quiet true if should suppress reporting duplicates * * @throws IOException */ static void findParked( final String[] files, final boolean quiet ) throws IOException { final ArrayList all = collectAllURLsToTest( files ); out.println( "sorting URLs..." ); Collections.sort( all ); final ArrayList deduped = dedup( all, quiet ); removeExceptions( deduped ); out.println( "probing " + deduped.size() + " URLs..." ); final BlockingQueue queue = new LinkedBlockingQueue<>( ParkedProbe.TASK_POOL_SIZE ); final ThreadPoolExecutor pool = new ThreadPoolExecutor( ParkedProbe.THREAD_POOL_SIZE, ParkedProbe.THREAD_POOL_SIZE, 3, TimeUnit.SECONDS, queue, new RejectedParkProbeHandler() ); for ( String urlString : deduped ) { // Submit URL to thread pool to be executed. pool.execute( new ParkedProbe( new URL( urlString ) ) ); } out.println( "all probes submitted" ); pool.shutdown(); try { pool.awaitTermination( 5, TimeUnit.MINUTES ); } catch ( InterruptedException e ) { // just carry on } out.println( "all probes completed" ); } /** * remove exceptions from the list of urls we are planning to test. * * @param deduped * * @throws IOException */ private static void removeExceptions( final ArrayList deduped ) throws IOException { out.println( "Collecting exceptions from " + EIO.getCanOrAbsPath( new File( "exceptions.csv" ) ) ); final CSVReader r = new CSVReader( new BufferedReader( new FileReader( "exceptions.csv" ), 8092 ) ); final HashSet exceptions = new HashSet( 100 ); try { // collect the exceptions while ( true ) { final String exception = r.get(); exceptions.add( exception ); } } catch ( EOFException e ) { } // replace the exceptions with null final int size = deduped.size(); final String[] dedupeda = deduped.toArray( new String[ size ] ); for ( int i = 0; i < size; i++ ) { if ( exceptions.contains( dedupeda[ i ] ) ) { dedupeda[ i ] = null; } } deduped.clear(); // restore only the non-nulls; for ( int i = 0; i < size; i++ ) { if ( dedupeda[ i ] != null ) { deduped.add( dedupeda[ i ] ); } } } /** * Scan all CSV files on command line for parked sites. * Files must be CSV files, usually just one column. If there is more than one column, extras will be ignored. * Parked markers are embedded in ParkedProbe * * @param args list of CSV files to be checked * * @see com.mindprod.findparked.ParkedProbe */ public static void main( String[] args ) throws IOException { findParked( args, false ); } // end main } // end class