/* * [ParkedProbe.java] * * Summary: For thread to probe one site URL to see if it is parked. * * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2012-05-06 initial version * 1.1 2012-05-08 suppress some false alarms * 1.2 2012-05-15 add text-link-ads.com */ package com.mindprod.findparked; import com.mindprod.common18.Misc; import com.mindprod.fastcat.FastCat; import com.mindprod.http.Get; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * For thread to probe one site URL to see if it is parked. * * @author Roedy Green, Canadian Mind Products * @version 1.2 2012-05-15 add text-link-ads.com * @since 2012-05-06 */ class ParkedProbe implements Runnable { // declarations /** * number of probes to queue up ready to be executed */ static final int TASK_POOL_SIZE = 32; /** * number of threads in pool to do store probes */ static final int THREAD_POOL_SIZE = 30; /** * javascript cheesy relocation * e.g. * */ private static final Pattern CHEESY_REDIRECT_FINDER = Pattern.compile( "" ); /** * true if you want extra debug output */ private static final boolean DEBUGGING = false; /** * timetout to connect in seconds */ private static final int CONNECT_TIMEOUT_SECONDS = 100; /** * timeout to wait for read in seconds */ private static final int READ_TIMEOUT_SECONDS = 60; /** * markers that indicate this site domaim no longer exists. All must be pure lower case */ private static final String[] BROKEN_MARKERS = { "shawcaassist", }; /** * markers that indicate this page is parked. case-insentitive * We do not filter out sites that look parked but are not including: * godlovesfags.blogspot.ca * www.download-tipp.de * www.shareware-freeware-demo.com/ * see e:\mindprod\findparked/notparked.txt */ private static final String[] PARKED_MARKERS = { ".web.com", "/__media__/pics/7463/logo.gif", "/domainpark/", "/web.com", "1plus.net", "2258_Directi-Green-CellPhones1.jpg", "2freehosting.com", "account suspended", ">Future home of something quite cool.<", ">there is no website configured at this address.<", ">this account has been suspended<", ">Website Disabled<", "Account Suspended", "admedia.com", "afternic.com", "b00kmarks.com", "bluehost.com\">web hosting", "bodis.com", "bodisparking\\.com", "buydomains.com", "call buydomains.com", "cdnpark.com", "cybersync.com", "directnic.com", "dittodomains.com", "domain may be for sale", "domain name expired", "domain parked courtesy of 101domain.com", "domainapps.com", "domaindirect.com", "domainguru.com", "domainhop.com", "domaininformer.com", "domainnamesales.com", "domainpark.cgi", "domainrightnow.com", "domainspa.com", "domainsponsor.com", "domainsystems.com", "dotzup.com", "dsparking.com", "dsultra.com", "er geen hosting voor dit domein bestaat", "fabulous.com", "futurequest.net", "fwdservice.com", "godaddy.com/park/", "goldkey.com", "has been suspended", "has been terminated due to a violation of the directnic free hosting", "hdr_cshprkng_prkd.gif", "hdr_parked_ppc", "hdr_sorry.gif", "hostek.com", "hostindex.com", "hostmoster.com", "hover.com", "hslogo_site_found_customers_175x40.gif", "hugedomains.com", "id=\"parking_form\"", "imageHost=http%3a%2f%2fimages.sitesense\\-oo.com", "img.sedoparking.com", "ipage.com", "ist2_746781_female_student.jpg", "landingparent", "moniker.com", "MrHost - Web Hosting Engine", "namecheap.com", "namedrive.com", "netvisibility.com", "oversee.net", "Overview report - SEOMON.com", "pair Networks", "parked.com", "parking4income.com", "parking_form", "parkingcrew.net", "parkingdots.com", "parkingsite.com", "parkitnow.com", "parkmynames.com", "parkpage.com", "parkquick.com", "premiumtraffic.com", "readyhosting.com", "revenuedirect.com", "rmgserving.com", "searchportalinformation.com", "securesignup.net", "sedo.com", "sedo.de", "sedoparking.com/frmpark", "sedopro.com", "shawcaassist", "ssls.com", "siteparker.com", "Site Suspended - This site has stepped out for a bit", "skenzo.com", "smartname.com", "snapnames.com", "streamic.com", "style/style1_54.css", "text-link-ads.com", "the owner of this domain has not yet uploaded their website", "this domain may be for sale", "this web page is parked for free, courtesy of", "this web page is parked free, courtesy of godaddy.com", "This website has been suspended.", "This your website's holding page.", "trafficclub.com", "trafficz.com", "var parkingData", "verio.com", "webcom.com", "Webmaster please contact", "website is for sale", "whypark.com", }; /** * domains of parking site companies, case-insensitive. We don't consider the site itself parked. */ private static final String[] PARKING_DOMAINS = { "101domain.com", "1plus.net", "2freehosting.com", "admedia.com", "afternic.com", "archive.org", "bluehost.com", "bodis.com", "bodisparking.com", "b00kmarks.com", "buydomains.com", "cdnpark.com", "ctmdev.com", /* exception */ "cybersync.com", "devgalaxy.com", "directnic.com", "dittodomains.com", "domainapps.com", "domaindirect.com", "domainguru.com", "domainhop.com", "domaininformer.com", "domainnamesales.com", "domainpark.cgi", "domainrightnow.com", "domainspa.com", "domainsponsor.com", "domainsystems.com", "dotzup.com", "downloadarsivi.com", "dsnextgen.com", "dsparking.com", "dsultra.com", "fabulous.com", "fast64.com", "fwdservice.com", "futurequest.net", "fyxm.net", "godaddy.com", "goldkey.com", "hostek.com", "hostindex.com", "hostmonster.com", "hover.com", "howardforums.com", /* exception */ "hugedomains.com", "hwg.org", "info-zip.org", "ipage.com", "jpost.com", "moniker.com", "enamecheap.com", "namedrive.com", "netvisibility.com", "oversee.net", "pair.com", "parked.com", "parking4income.com", "parkingdots.com", "parkingcrew.net", "parkingsite.com", "parkitnow.com", "parkmynames.com", "parkpage.com", "parkquick.com", "patriotsquestion911.com", "premiumtraffic.com", "readyhosting.com", "revenuedirect.com", "rmgserving.com", "searchportalinformation.com", "sedo.com", "sedo.de", "sedoparking.com", "sedopro.com", "siteparker.com", "skenzo.com", "smartname.com", "snapnames.com", "ssls.com", "stats.directnic.com", "streamic.com", "text-link-ads.com", "This site is temporarily unavailable", /* iPage */ "This Web page is parked for FREE, courtesy of markerDomains = new HashSet<>( Arrays.asList( PARKING_DOMAINS ) ); /** * look for a a redirect of the form, not doen with meta or response code * window.top.location='http://ww2.mightyfiles.com/?folio=435329566&bkt=9657' */ private static final Pattern EL_CHEAPO_REDIRECT_FINDER = Pattern.compile( "window.top.location=\'([ \\w!#-\\&\\(-\\.\\+\\-/:=\\?@\\[-\\^\\|]+)\'" ); /** * get url of frame or iframe */ private static final Pattern FRAME_FINDER = Pattern.compile( " 0 ) { return page; } else { if ( DEBUGGING ) { final FastCat sb = new FastCat( 13 ); sb.append( "\n ", url ); sb.append( ", not responding properly" ); sb.append( "\n error code:", responseCode, " ", responseMessage ); if ( interruptResponseMessage.length() > 0 ) { sb.append( "\n ", interruptResponseMessage ); } sb.append( "\n page length: ", ( page == null ? 0 : page.length() ) ); err.println( sb.toString() ); } return null; } }// /method /** * does this age contain a marker? * * @param page contents of home page * @param originalURL human name of who parked * @param exception domain of the parking compnay, an exception * * @return true if had marker */ private static boolean hasMarker( final String page, final String originalURL, String exception ) { final String pagelc = page.toLowerCase(); for ( String marker : BROKEN_MARKERS ) { if ( pagelc.contains( marker ) && !markerDomains.contains( exception ) ) { out.println( "\n " + originalURL + ", no such domain, Marker: " + marker ); return true; } } // end for for ( String marker : PARKED_MARKERS ) { // don't report marker websites, they will have their own names somewhere. if ( pagelc.contains( marker ) && !markerDomains.contains( exception ) ) { // outdented from rest out.println( "\n>>>> " + originalURL + ", probably parked, Marker: " + marker ); return true; } } // end for return false; }// /method /** * method to run on separate thread */ public void run() { try { String originalURL = url.toString(); URL currentURL = url; String exception = Misc.getDomain( currentURL ); String page = fetchPage( currentURL ); URL redirectedURL; if ( page == null ) { return; } final Matcher r = EL_CHEAPO_REDIRECT_FINDER.matcher( page ); if ( r.find() && !r.group( 1 ).contains( ":" ) ) { try { redirectedURL = new URL( currentURL, r.group( 1 ) ); } catch ( MalformedURLException e ) { err.println( "\n url:[" + currentURL + "/" + r.group( 1 ) + "], trouble while processing" ); err.println( e ); e.printStackTrace( err ); return; } String redirectedPage = fetchPage( redirectedURL ); if ( redirectedPage != null ) { currentURL = redirectedURL; page = redirectedPage; } } final Matcher r2 = CHEESY_REDIRECT_FINDER.matcher( page ); if ( r2.find() && !r2.group( 1 ).contains( ":" ) ) { try { redirectedURL = new URL( currentURL, r2.group( 1 ) ); } catch ( MalformedURLException e ) { err.println( "\n url:[" + currentURL + "/" + r2.group( 1 ) + "], trouble while processing" ); err.println( e ); e.printStackTrace( err ); return; } String redirectedPage = fetchPage( redirectedURL ); if ( redirectedPage != null ) { currentURL = redirectedURL; page = redirectedPage; } } if ( hasMarker( page, originalURL, exception ) ) { return; } // look for a all the frames and iframes //