/*
 * [FindParkedInXenu.java]
 *
 * Summary: Check if links found by Xenu are parked.
 *
 * Copyright: (c) 2008-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2012-05-06 initial version.
 *  1.1 2012-05-08 suppress some false alarms
 *  1.2 2012-05-15 add text-link-ads.com
 *  1.3 2012-11-21 convert to new format Xenu files with more accurate status.
 */
package com.mindprod.findparked;

import com.mindprod.brokenlinks.StatusKind;
import com.mindprod.common18.EIO;
import com.mindprod.common18.Misc;
import com.mindprod.csv.CSVReader;
import com.mindprod.csv.CSVWriter;
import com.mindprod.hunkio.HunkIO;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;

import static java.lang.System.*;

/**
 * Check if links found by Xenu are parked.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.3 2012-11-21 convert to new format Xenu files with more accurate status.
 * @since 2012-05-06
 */
public class FindParkedInXenu {
    /**
     * Verify that the first field of the Xenu export identifies an
     * "Export Page Map to TAB-separated File" report. Any other signature is fatal:
     * an explanation is printed to stderr and the JVM exits with status 1.
     *
     * @param signature         first field of the first line of the export.
     * @param xenuImportCSVFile name of the export file, used only in the error message.
     */
    private static void checkSignature( final String signature, final String xenuImportCSVFile ) {
        if ( signature.length() > 12 ) {
            err.println( "Fatal error: In Xenu, you accidentally clicked \"Save As\", "
                         + "rather than \"Export Page Map to TAB-separated File\". To recover, rerun Xenu." );
            System.exit( 1 );
        }
        if ( signature.equals( "Address" ) ) {
            err.println( "Fatal error: In Xenu, you accidentally clicked \"Export to TAB-separated File\", "
                         + "rather than \"Export Page Map to TAB-separated File\". To recover, rerun Xenu." );
            System.exit( 1 );
        }
        if ( !signature.equals( "OriginPage" ) ) {
            err.println( "Fatal error: "
                         + xenuImportCSVFile
                         + " does not contain Xenu\'s \"Export Page Map to "
                         + "TAB-separated File\". To recover, rerun Xenu." );
            System.exit( 1 );
        }
    }

    /**
     * Read every row of the Xenu page map and write the home URL of each http/https link
     * whose status is not BAD or IGNORE to the temp file, one per line.
     * The writer is always closed, even if reading fails part way.
     *
     * @param r                 reader positioned at the start of the Xenu export.
     * @param temp              file to receive the home URLs.
     * @param xenuImportCSVFile name of the export file, used only in error messages.
     *
     * @throws java.io.IOException if problem reading or writing
     */
    @SuppressWarnings( { "InfiniteLoopStatement" } )
    private static void extractHomeUrls( final CSVReader r,
                                         final File temp,
                                         final String xenuImportCSVFile ) throws IOException {
        final PrintWriter pw = EIO.getPrintWriter( temp, 768 * 1024, EIO.UTF8 );
        final CSVWriter w = new CSVWriter( pw );
        try {
            checkSignature( r.get(), xenuImportCSVFile );
            // skip over rest of title line
            r.skipToNextLine();
            while ( true ) {
                // read tab-separated lines like this:
                // file:/E:/mindprod/feedback/feedback.html mailto:ross@bluedog.com mail host 200 ok
                // ross@bluedog.com 13.12.2008 00:09:02 Feedback
                // file:/E:/mindprod/jgloss/nohassle.html http://www.8844download.com/submit.htm no connection
                // 8844Download 01.01.2010 10:25:23 hassle-free PAD sites : Java Glossary
                r.skip( 1 );
                final String urlString = r.get();
                final int status = r.getInt();
                // we don't validate status message. We accept all manner of weird stuff.
                // skip rest of fields.
                r.skipToNextLine();
                try {
                    final URL url = new URL( urlString );
                    final String protocol = url.getProtocol();
                    // discard local file:, ftp: news: and sites currently bad.
                    if ( protocol.equals( "http" ) || protocol.equals( "https" ) ) {
                        final StatusKind kind = StatusKind.categoriseStatus( status );
                        switch ( kind ) {
                            case BAD:
                            case IGNORE:
                                break;
                            case GOOD:
                            case TEMP_REDIRECT:
                            case PERM_REDIRECT:
                            case UNKNOWN:
                            default:
                                // save this URL for future park check processing. Will be deduped later.
                                // prune off path. Unfortunately will prune /index.html etc as well.
                                w.put( Misc.getHomeURL( url ).toString() );
                                w.nl();
                                break;
                        }
                    }
                } catch ( MalformedURLException ignored ) {
                    // probably just a news:// news:
                    // ignore it
                }
            } // end while
        } catch ( EOFException ignored ) {
            // Deliberately ignored: running off the end of the input is the only
            // way out of the read loop above, so EOF here is the normal exit.
        } finally {
            w.close();
        }
    }

    /**
     * import CSV file of good and broken links from XENU
     * Must use special version with extra status field.
     * The candidate home URLs are written to a temp file, which is then fed to FindParked
     * and deleted afterwards.
     *
     * @param xenuImportCSVFile name of the tab-separated page map file exported from Xenu.
     *
     * @throws java.io.IOException if problem reading
     */
    private static void importXenuPageList( final String xenuImportCSVFile ) throws IOException {
        out.println( "importing from Xenu..." );
        // Use Xenu Export Page Report save csv file here.
        final File csv = new File( xenuImportCSVFile );
        // reader, separatorChar, quoteChar, commentChars, hideComments, trimQuoted, trimUnquoted,
        // allowMultipleLineFields
        final CSVReader r = new CSVReader( EIO.getBufferedReader( csv, 1024 * 1024, EIO.UTF8 ),
                '\t' /* tab delimited */,
                ( char ) 0 /* effectively no quote char */,
                "",
                true,
                true /* trimQuoted */,
                true /* trimUnquoted */,
                true );
        final File temp;
        try {
            temp = HunkIO.createTempFile( "tempfindparkedxenu", ".csv", null );
            out.println( EIO.getCanOrAbsPath( temp ) );
            extractHomeUrls( r, temp, xenuImportCSVFile );
        } finally {
            // close the reader even if creating or writing the temp file failed.
            r.close();
        }
        out.println();
        try {
            // feed file we just created to FindParked
            FindParked.findParked( new String[] { EIO.getCanOrAbsPath( temp ) }, true /* quiet, no dup reporting */ );
        } finally {
            // clean up even if FindParked fails.
            // if delete fails, will be killed later by Batik; also ask the JVM to retry at exit.
            if ( !temp.delete() ) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Back end to process Xenu output.
     * Must first manually export from Xenu the xenu page links to E:\com\mindprod\brokenlinks\xenupage.csv
     * Prints a usage message and exits with status 1 if no file name is given.
     *
     * @param args where xenu export file is e.g. E:\env\bl\xenupage.csv
     */
    public static void main( String[] args ) {
        if ( args == null || args.length < 1 ) {
            err.println( "Fatal error: missing name of the Xenu page map export file." );
            err.println( "Usage: java com.mindprod.findparked.FindParkedInXenu xenupage.csv" );
            System.exit( 1 );
        }
        try {
            importXenuPageList( args[ 0 ] );
        } catch ( IOException e ) {
            err.println();
            e.printStackTrace( err );
            err.println( "Fatal error: problems reading/writing files" );
            err.println();
            System.exit( 1 );
        }
    }
}