/* * [ProbeAndClassify.java] * * Summary: Probes a list of websites to see which ones require login, require Captchas, backlinks etc. * * Copyright: (c) 2007-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2009-05-17 initial version * 1.1 2009-12-02 hard code in filenames. strip down to host name without www. for compare. */ package com.mindprod.submitter; import com.mindprod.common18.Build; import com.mindprod.common18.EIO; import com.mindprod.common18.Misc; import com.mindprod.common18.ST; import com.mindprod.csv.CSVReader; import com.mindprod.csv.CSVWriter; import com.mindprod.http.Get; import com.mindprod.hunkio.HunkIO; import java.io.EOFException; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /* output in candidate.csv */ /** * Probes a list of websites to see which ones require login, require Captchas, backlinks etc. *
* Expect 2 to 5-column candidates.csv * * Tool to look for possible future submission sites to support. * Designed primarily for Roedy's use to research new sites. * Nothing needed on command line. * * @author Roedy Green, Canadian Mind Products * @version 1.1 2009-12-02 hard code in filenames. strip down to host name without www. for compare. * @since 2009-05-17 */ @SuppressWarnings( { "FieldCanBeLocal", "WeakerAccess" } ) public final class ProbeAndClassify { /** * capacity estimate total nohassle + hassle + dead */ public static final int INITIAL_CAPACITY = 2000; /** * regex fragment for the body of an href value: one or more digits, lower case letters or the punctuation ! _ # / . - : ? = & , */ private static final String HREF_REGEX = "[\\d\\p{Lower}!_#/\\.\\-:\\?=&,]+"; /** * regex fragment for exactly one quote character, double or single */ private static final String QUOTE_REGEX = "[\"']"; /** * scan for SUBMIT button on author.php sub page. * Matches href= followed by a quoted HREF_REGEX value, the rest of the tag up to its closing angle bracket, then the text submit/update. * Group 1 captures the href value. Compiled case-insensitive. */ private static final Pattern SUBMIT_SUB_FINDER = Pattern.compile( "href=" + QUOTE_REGEX + "(" + HREF_REGEX + ")" + QUOTE_REGEX + "[^>]*" + ">" + "submit/update", Pattern.CASE_INSENSITIVE ); /** * regex fragment for one character that is neither a digit nor a lower case letter; LOGIN_FINDER places it on both sides of its keywords */ private static final String SEPARATOR_REGEX = "[^\\d\\p{Lower}]"; // not digit or lower case letter /** * scan response for signs that a login is needed. * Matches any one of login, logon, account, a type=password attribute written with double quotes, Sign In or signin, with one SEPARATOR_REGEX character immediately before and after. * Compiled case-insensitive. */ private static final Pattern LOGIN_FINDER = Pattern.compile( SEPARATOR_REGEX + "(?:login" // non-capturing group + "|logon" + "|account" + "|type=\"password\"" + "|Sign In" + "|signin)" + SEPARATOR_REGEX, Pattern.CASE_INSENSITIVE ); /** * regex fragment for optional whitespace: zero or more spaces, carriage returns, line feeds or tabs */ private static final String SPACE_REGEX = "[ \\r\\n\\t]*"; /** * scan for SUBMIT button. Will only detect if href in quotes. 
*/ /* NOTE(review): the empty non-capturing group and the groups that start with a bare closing square bracket in this pattern look like HTML tag text that was lost in some extraction step, so the pattern probably does not match what was intended. TODO confirm against a pristine copy of the file. Group 1 captures the href value. */ private static final Pattern SUBMIT_FINDER = Pattern.compile( "href" + SPACE_REGEX + "=" + SPACE_REGEX + QUOTE_REGEX + "(" + HREF_REGEX + ")" + QUOTE_REGEX + "[^>]*" + ">" + SPACE_REGEX + "(?:)*" + SPACE_REGEX + "(?:]*>)*" + "(?:]*>)*" + "(?:)*" + "(?:]*>)*" + "(?:" + "Add a Software" + "|add program" + "|Add Software" + "|add" + "|Dodaj stron" + "|for authors" + "|L\u00e4gg till" + "|melden" + "|Neue" + "|Prefill" + "|Programm eintragen" + "|reichen" + "|Software addieren" + "|Software anmelden" + "|software\\s+submit" + "|soumettre" + "|Subir un Archivo" + "|Submit software" + "|Submit your software" + "|Submit/Update your software" + "|submit" + ")", Pattern.CASE_INSENSITIVE ); /** * look for Article submit: the word article preceded by a single space. Compiled case-insensitive. */ private static final Pattern ARTICLE_FINDER = Pattern.compile( " article", Pattern.CASE_INSENSITIVE ); /** * text that presumably overrides an ARTICLE_FINDER hit (its use is not visible in this part of the file, TODO confirm). * Matches the word pad with a non-word character on each side, or the sentence You can submit your security and system software here. Compiled case-insensitive. */ private static final Pattern ARTICLE_FINDER_EXCEPTION = Pattern.compile( "\\Wpad\\W|You can submit your security and system software here", Pattern.CASE_INSENSITIVE ); /** * scan response for signs you need a captcha to submit */ private static final Pattern CAPTCHA_FINDER = Pattern.compile( "captcha" + "|security code" + "|validation code" + "|verification code" + "|verification" + "|Pr\u00fcfsumme" + "|code you see on the picture", Pattern.CASE_INSENSITIVE ); /** * look for proprietary category request */ private static final Pattern CATEGORY_FINDER = Pattern.compile( "select category" + "|category:" + "|Kategorie:" + "|name=\"Category", Pattern.CASE_INSENSITIVE ); /** * text that presumably cancels a CATEGORY_FINDER hit (its use is not visible in this part of the file, TODO confirm). * Matches the phrase Detect category from PAD-File. Compiled case-insensitive. */ private static final Pattern CATEGORY_FINDER_EXCEPTION = Pattern.compile( "Detect category from PAD-File", Pattern.CASE_INSENSITIVE ); /** * look for some sort of form to submit to */ private static final Pattern FORM_FINDER = Pattern.compile( "