/*
 * [Scrape.java]
 *
 * Summary: Screenscrape information about sands from various websites.
 *
 * Copyright: (c) 2014-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2014-08-14 initial version
 */
package com.mindprod.sanddepth;

import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filetransfer.FileTransfer;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DecimalFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.*;

/**
 * Screenscrape information about sands from various websites.
 * <p>
 * Each product-line method downloads one CaribSea catalogue page to a local file, reads it back,
 * and hands the text to {@link #extract}, which prints one line per sand found and fetches the
 * sand's image.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2014-08-14 initial version
 * @since 2014-08-14
 */
public class Scrape {
    // declarations

    /**
     * one decimal place, used for grain sizes in mm
     */
    private static final DecimalFormat DF1 = new DecimalFormat( "0.0" );

    /**
     * two decimal places, used for specific gravity
     */
    private static final DecimalFormat DF2 = new DecimalFormat( "0.00" );

    /**
     * Multiplier converting pounds per cubic foot to grams per cubic centimetre
     * (1 lb/ft3 = 16.01846 kg/m3). Water is 1 g/cm3, so the product is also the specific gravity.
     */
    private static final double LB_PER_FT3_TO_G_PER_CM3 = 0.016_018_46d;

    /**
     * Finds one sand entry in a catalogue page. Compiled once because every page uses it.
     * <p>
     * NOTE(review): the literal fragments below are carried over unchanged from a copy of this file
     * in which the text inside several string literals appears to have been lost (line breaks that
     * sat inside the literals are written here as \n escapes). As written the pattern has only four
     * capturing groups, yet extract reads groups 1 to 5, so m.group( 5 ) would throw
     * IndexOutOfBoundsException on the first match. Restore the original fragments, including the
     * group that captures the image name, before relying on this class.
     */
    private static final Pattern SAND_PATTERN = Pattern.compile( ""
            + "\n\\s*"
            + "\n\n([ A-Za-z0-9&#;\\-_]+)\\s*"
            + "\\(([0-9\\.]+)\\s*\\-\\s*([0-9\\.]+)mm\\s*\\|\\s*([0-9]+)#/ft3\\)" );
    // /declarations

    // methods

    /**
     * Get info about CaribSea Arag-alive sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void aragalive() throws IOException {
        scrapePage( "aragalive",
                "http://www.caribsea.com/caribsea/itempage_marine_substrates_aragalive.html",
                "Marine Arag-Alive",
                "E:/temp/aragalive.html" );
    }// /method

    /**
     * Get info about CaribSea cichlid sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void cichlid() throws IOException {
        scrapePage( "cichlid",
                "http://www.caribsea.com/caribsea/itempage_freshwater_substrates_africancichlidmix.html",
                "African Cichlid Mix",
                "E:/temp/cichlid.html" );
    }// /method

    /**
     * Get info about CaribSea coraline sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void coraline() throws IOException {
        scrapePage( "coraline",
                "http://www.caribsea.com/caribsea/itempage_marine_substrates_coraline.html",
                "Marine, Coraline",
                "E:/temp/coraline.html" );
    }// /method

    /**
     * Get info about CaribSea dry aragonite sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void dryAragonite() throws IOException {
        scrapePage( "dryAragonite",
                "http://www.caribsea.com/caribsea/itempage_marine_substrates_aragonite.html",
                "Marine, dry Aragonite",
                "E:/temp/dryaragonite.html" );
    }// /method

    /**
     * Get info about CaribSea eco sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void eco() throws IOException {
        scrapePage( "eco",
                "http://www.caribsea.com/caribsea/itempage_freshwater_substrates_Ecoplanted.html",
                "Eco Complete Planted",
                "E:/temp/eco.html" );
    }// /method

    /**
     * Find every sand described in a page, print one line for each and fetch its image.
     * <p>
     * For each match of SAND_PATTERN: group 1 is the remote image name without extension, group 2
     * the product name, groups 3 and 4 the low and high grain size in mm, and group 5 the weight in
     * pounds per cubic foot, which is converted to specific gravity.
     * <p>
     * NOTE(review): the delimiter literals passed to the FastCat below are empty strings in this
     * copy of the file and are kept exactly as found; they look like they once held markup.
     *
     * @param url  address the page came from; not used by this method
     * @param type product line name to include in each printed line
     * @param big  entire text of the page
     *
     * @throws MalformedURLException if an image URL cannot be formed
     */
    private static void extract( final String url, final String type, final String big ) throws MalformedURLException {
        final Matcher m = SAND_PATTERN.matcher( big );
        while ( m.find() ) {
            final String lc = m.group( 1 );
            final String remoteImageName = lc + ".png";
            final String localImageName = tidyLocalImageName( remoteImageName );
            String camel = m.group( 2 );
            // remove duplicate of type
            camel = ST.chopLeadingString( camel, "Eco-Complete™ Planted " );
            // NOTE(review): the pattern's name group does not admit a literal trademark character,
            // only & # ; so these two literals presumably held an entity originally; verify.
            final int place = camel.indexOf( "™" );
            if ( place >= 0 ) {
                // cut out the tm entity
                camel = camel.substring( 0, place ) + camel.substring( place + "™".length() );
            }
            final double low = Double.parseDouble( m.group( 3 ) );
            final double high = Double.parseDouble( m.group( 4 ) );
            final double wt = Double.parseDouble( m.group( 5 ) );
            // lb/ft3 to g/cm3, which is numerically the specific gravity
            final double sg = wt * LB_PER_FT3_TO_G_PER_CM3;
            final FastCat sb = new FastCat( 25 );
            sb.append( "CaribSea, " );
            sb.append( type, ", ", camel, "" );
            sb.append( "" + DF2.format( sg ) + "" );
            sb.append( "" + DF1.format( low ) + " - " + DF1.format( high ) + "" );
            sb.append( "" );
            sb.append( "", wouldFloat( sg ), "" );
            out.println( sb.toString() );
            fetchImage( remoteImageName, localImageName );
        }
    }

    /**
     * download the image from Carib and save it on local hard disk in E:\mindprod\image\sand,
     * unless a file of that name is already there.
     *
     * @param remoteImageName name at Carib with .png, no dir
     * @param localImageName  name on Mindprod with .png, no dir
     *
     * @throws MalformedURLException if the image URL cannot be formed
     */
    private static void fetchImage( final String remoteImageName, final String localImageName ) throws MalformedURLException {
        final File image = new File( "E:/mindprod/image/sand/" + localImageName );
        if ( image.exists() ) {
            out.println( EIO.getCanOrAbsPath( image ) + " already exists" );
        } else {
            final URL u = new URL( "http://caribsea.com/images/substrate_" + remoteImageName );
            final boolean success = new FileTransfer().download( u, image, false );
            if ( !success ) {
                out.println( "download of " + u + " failed." );
            }
        }
    }

    /**
     * Get info about CaribSea floramax sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void floramax() throws IOException {
        scrapePage( "floramax",
                "http://www.caribsea.com/caribsea/itempage_freshwater_substrates_floramax.html",
                "Floramax",
                "E:/temp/floramax.html" );
    }// /method

    /**
     * Get info about CaribSea instant aquarium sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void instant() throws IOException {
        scrapePage( "instant",
                "http://www.caribsea.com/caribsea/itempage_freshwater_substrates_instantaquarium.html",
                "Instant Aquarium",
                "E:/temp/instant.html" );
    }// /method

    /**
     * Get info about CaribSea ocean direct sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void oceanDirect() throws IOException {
        scrapePage( "oceanDirect",
                "http://www.caribsea.com/caribsea/itempage_marine_substrates_ocean%20direct.html",
                "Marine, Ocean Direct",
                "E:/temp/oceandirect.html" );
    }// /method

    /**
     * Download one catalogue page to a local file, read it back and extract its sands.
     * <p>
     * The label is printed first. If the download reports failure a message is printed and
     * processing still continues with whatever is at the local path, as it did before the result
     * was checked; reading that file may then throw.
     *
     * @param label     progress label to print before starting
     * @param url       address of the catalogue page
     * @param type      product line name passed on to extract
     * @param localPath where to store the downloaded page
     *
     * @throws IOException if the URL is malformed or the local file cannot be read
     */
    private static void scrapePage( final String label, final String url, final String type, final String localPath ) throws IOException {
        out.println( label );
        final File page = new File( localPath );
        final boolean success = new FileTransfer().download( new URL( url ), page, false );
        if ( !success ) {
            out.println( "download of " + url + " failed." );
        }
        final String big = HunkIO.readEntireFile( page, EIO.UTF8 );
        extract( url, type, big );
    }// /method

    /**
     * Get info about CaribSea super natural sands
     *
     * @throws IOException if the downloaded page cannot be read
     */
    private static void superNatural() throws IOException {
        scrapePage( "supernatural",
                "http://www.caribsea.com/caribsea/itempage_freshwater_substrates_supernaturals.html",
                "Super Natural",
                "E:/temp/supernatural.html" );
    }// /method

    /**
     * get rid of any upper case _ - chars in local image name
     *
     * @param imageName eg. This_sand.png -> thissand.png
     *
     * @return tidied name
     */
    private static String tidyLocalImageName( final String imageName ) {
        // Locale.ROOT keeps the file name the same whatever the default locale of the machine
        return ST.stripNaughtyCharacters( imageName.toLowerCase( Locale.ROOT ), "-_ " );
    }// /method

    /**
     * Warn when a sand is no denser than water.
     *
     * @param sg specific gravity of the sand
     *
     * @return a warning sentence if sg is 1.0 or less, otherwise the empty string
     */
    private static String wouldFloat( double sg ) {
        if ( sg <= 1.0d ) {
            return "This sand would float. Perhaps it sinks after you let it soak.";
        } else {
            return "";
        }
    }// /method

    /**
     * main, scrapes every product line in turn
     *
     * @param args not used
     *
     * @throws IOException if a downloaded page cannot be read
     */
    public static void main( String[] args ) throws IOException {
        aragalive();
        cichlid();
        coraline();
        dryAragonite();
        eco();
        floramax();
        instant();
        oceanDirect();
        superNatural();
    }// /method
    // /methods
}