/* * [GrabAsp.java] * * Summary: Get Submitter sites from ASP website. * * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2011-05-19 initial version * 1.1 2012-02-13 ASP totally changed the layout of its files. * 1.2 2012-05-25 update number of ASP pages. * 1.3 2014-04-24 moved website to http://padsites.org/ */ package com.mindprod.submitter; import com.mindprod.common18.Build; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.csv.CSVWriter; import com.mindprod.http.Chase; import com.mindprod.http.Get; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Get Submitter sites from ASP website. *
* Slow, because does probes the Internet three times for each site.. * results appear in E:\com\mindprod\submitter\grabasp.csv * for further processing. * Adjust PAGES before use. * Must run GrabAsp itself inside IntelliJ. * Raw results end up in E:\com\mindprod\submitter\grabasp.csv * * Use filterasp.btm to clean up results. * It will sort then ask you to remove sites you handled previously. * tidied results will end up in E:\com\mindprod\submitter\candidates.cvs * replacing any candidates there already. * * @author Roedy Green, Canadian Mind Products * @version 1.3 2014-04-24 moved website to http://padsites.org/ * @since 2011-05-19 */ public final class GrabAsp { static final String RELEASE_DATE = "2014-04-24"; // I M P O R T A N T ! // update PAGES manually from http://padsites.org/ // adjust -fetch on command line. // last scanned // 2016-04-10 // 2016-04-30 // 2016-05-07 // 2016-05-16 // 2016-06-19 static final String VERSION_STRING = "1.3"; private static final int FIRST_COPYRIGHT_YEAR = 2011; /** * undisplayed copyright notice */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; private static final String PADSITE_CSV_NAME = Build.MINDPROD_SOURCE + "/submitter/grabasp.csv"; private static final String ASP_HOST = "http://padsites.org"; /** * rows that contain junk */ private static final String[] rowsToIgnore = { "OnOrderByChange", "Macromedia Flash", "Ukrainian", "pad/page0", "*/ private static final Pattern INDIRECT_HOME_URL = Pattern.compile( " | URL | Submission URL | */ private static final Pattern INDIRECT_SUBMIT_URL = Pattern.compile( " | Submission URL | Modified | 2012-01-19 13:02:17 | * on main page is shown as15-01-2014 | Modified(\\p{Digit}{4}\\-\\p{Digit}{2}\\-\\p{Digit}{2}) ", Pattern.CASE_INSENSITIVE ); /** * extracts site name from siteinfo page * CoolScreens */ private static final Pattern SITENAME = 
Pattern.compile( "([ !%'\\(\\)" + "\\-\\._\\p{Alnum}]+)", Pattern.CASE_INSENSITIVE );
    // As written above, group 1 captures a run of one or more characters drawn from:
    // space ! % ' ( ) - . _ and \p{Alnum}.

    /**
     * number of pages on the ASP website http://padsites.org. Change comment on grabasp.btm
     * Used as the inclusive upper bound of the fetch loop in getASPWebPages.
     * NOTE(review): assigned outside this part of the file; confirm it is set before getASPWebPages runs.
     */
    private static int pages;

    /**
     * get url on ASP site for given page 1 .. pages
     * Built as ASP_HOST + "/page" + ST.toLZ( pageNumber, 3 ) + "/".
     * NOTE(review): ST.toLZ presumably left-pads the number with zeros to 3 digits,
     * giving e.g. /page007/ -- confirm against ST.
     *
     * @param pageNumber page number
     *
     * @return URL on asp site of that page of database.
     * @throws java.net.MalformedURLException if bad URL generated
     */
    private static URL URLForPage( int pageNumber ) throws MalformedURLException
        {
        return new URL( ASP_HOST + "/page" + ST.toLZ( pageNumber, 3 ) + "/" );
        }

    /**
     * get local file for given page 1 .. pages
     * The name is Build.MINDPROD_SOURCE + "/submitter/" + ST.toLZ( pageNumber, 3 ) + ".html",
     * i.e. it uses the same page-number formatting as URLForPage.
     *
     * @param pageNumber page number
     *
     * @return local file where that page's html is stored.
     */
    private static File fileForPage( int pageNumber )
        {
        return new File( Build.MINDPROD_SOURCE + "/submitter/" + ST.toLZ( pageNumber, 3 ) + ".html" );
        }

    /**
     * fetch all ASP web pages from their site, page 1 through pages inclusive,
     * saving each one to the local file given by fileForPage.
     * NOTE(review): the result of g.send is written out without any check; if Get.send can
     * return null on a failed fetch, that case should be handled here -- confirm against Get.
     *
     * @throws java.io.IOException if cannot save page.
     */
    private static void getASPWebPages() throws IOException
        {
        // one Get instance is reused for every page
        Get g = new Get();
        for ( int pageNumber = 1; pageNumber <= pages; pageNumber++ )
            {
            // fetch the page, requesting Get.UTF8, then store it under its numbered local file name
            final String page = g.send( URLForPage( pageNumber ), Get.UTF8 );
            HunkIO.writeEntireFile( fileForPage( pageNumber ), page );
            }
        }

    /**
     * dump out regexes: prints SITENAME, INDIRECT_HOME_URL and INDIRECT_SUBMIT_URL
     * to System.out, one per line, so they can be checked by eye.
     */
    private static void proofreadRegexes()
        {
        out.println( SITENAME );
        out.println( INDIRECT_HOME_URL );
        out.println( INDIRECT_SUBMIT_URL );
        }

    /**
     * scan one page of the ASP website, extracting padsites. Slow because it chases each URL to resolve redirects.
     *
     * @param page HTML from one page from ASP site
     * @param w    CSVWriter to export found PADSites to
     *
     * @throws MalformedURLException if bad URL generated
     */
    private static void scanPage( String page, CSVWriter w ) throws MalformedURLException
        {
        // offset into page; starts at the beginning
        int from = 0;
        rowLoop:
        while ( true )
            {
            final int start = page.indexOf( " |