/* * [ImportGigabyte.java] * * Summary: Import all data about motherboards from the Gigabyte Website. * * Copyright: (c) 2011-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.7+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2011-02-14 initial version */ package com.mindprod.mother; import com.mindprod.common17.BigDate; import com.mindprod.csv.CSVReader; import com.mindprod.htmlmacros.support.Configuration; import com.mindprod.htmlmacros.support.ConfigurationForMindprod; import com.mindprod.http.Get; import com.mindprod.hunkio.HunkIO; import java.io.BufferedReader; import java.io.EOFException; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.net.URL; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Import all data about motherboards from the Gigabyte Website. 
* * @author Roedy Green, Canadian Mind Products * @version 1.0 2011-02-14 initial version * @since 2011-02-14 */ public class ImportGigabyte extends ImportManufacturer { /** * where master list of all Gigabyte motherboards are */ private static final String[] INDEX_URLS = { "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=3&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=4&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=5&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=6&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=7&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=8&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM3", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=9&par=2&val=2&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=3&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=4&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=5&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=6&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2+", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=7&par=2&val=3&lgc=&tp=0&ss=0", "AMD AM2", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?&p=1&par=2&val=4&lgc=&tp=0&ss=0", "AMD AM2", 
"http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=4&lgc=&tp=0&ss=0", "AMD AM2", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=3&par=2&val=4&lgc=&tp=0&ss=0", "AMD AM2", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=4&par=2&val=4&lgc=&tp=0&ss=0", "AMD AM2", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=5&par=2&val=4&lgc=&tp=0&ss=0", "AMD AM2", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?&p=6&par=2&val=4&lgc=&tp=0&ss=0", "AMD BGA FT1", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=25&lgc=&tp=0&ss=0", "Intel 1366", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=1&lgc=&tp=0&ss=0", "Intel 1366", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=1&lgc=&tp=0&ss=0", "Intel 1155", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=24&lgc=&tp=0&ss=0", "Intel 1155", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=24&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=11&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=2&par=2&val=11&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=3&par=2&val=11&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=4&par=2&val=11&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=5&par=2&val=11&lgc=&tp=0&ss=0", "Intel 1156", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=6&par=2&val=11&lgc=&tp=0&ss=0", "Intel BGA 559", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=23&lgc=&tp=0&ss=0", "Intel 479/437", "http://www.gigabyte.com/products/pl-mb-list_ajax.aspx?p=1&par=2&val=8&lgc=&tp=0&ss=0", }; /** * mindprod configuration */ private static final Configuration configuration; private static final File sourceDir; /** * looks for Audio channels */ private static 
final Pattern AUDIO_CHANNEL_FINDER = Pattern.compile( "([\\.\\d]+)[\\- ]channel", Pattern.CASE_INSENSITIVE ); /** * looks for info about the form factor */ private static final Pattern FORM_FACTOR_FINDER = Pattern.compile( ">Form Factor(?:
    \\s*
  1. )?([\\p{Alnum}\\- ]+)[;:] " + "([0-9\\.]+)\\s*cm x ([0-9\\.]+)\\s*cm", Pattern.CASE_INSENSITIVE ); /** * looks for count of IDE ports */ private static final Pattern IDE_FINDER = Pattern.compile( "(\\d{1,2}) x IDE[E]*", Pattern.CASE_INSENSITIVE ); // >Up to 12 USB 2.0/1.1 ports (8 on the back panel, 4 via the USB brackets connected to the internal USB headers) /** * looks for max ram capacity */ private static final Pattern MAX_GIG_FINDER = Pattern.compile( ">Memory
      [\\p{Print}\\s]+?([0-9]+)\\s*GB", Pattern.CASE_INSENSITIVE ); /** * look for links like this: * "name": "GA-880GMA-UD2H (rev. 2.0)", * "value": "3424", */ private static final Pattern MB_FINDER = Pattern.compile( "\"name\": \"([\\p{Alnum} \\-]+) \\(rev\\. ([\\p{Digit}\\.]+)\\)\"," + "\\s+\"value\": \"(\\p{Digit}+)\"", Pattern.CASE_INSENSITIVE ); /** * looks for info about RAM */ private static final Pattern MEMORY_TYPE_FINDER = Pattern.compile( ">Memory
        \\s*
      1. \\p{Print}+?(DDR2|DDR3)", Pattern.CASE_INSENSITIVE ); /** * looks for ram speed */ private static final Pattern RAM_SPEED_MHZ_FINDER = Pattern.compile( ">Memory
          [\\p{Print}\\s]+?(?:DDR2|DDR3) ((\\(Note " + "\\d\\)|\\d{3,4}\\s*\\(O\\.C\\.\\)|\\d{3,4}\\s*\\(OC\\)|\\d{3,4}|\\s*/\\s*)+)", Pattern.CASE_INSENSITIVE ); private static final Pattern RAM_SPEED_MHZ_SPLITTER = Pattern.compile( "\\s*/\\s*", Pattern.CASE_INSENSITIVE ); private static final Pattern SATA2_FINDER_1 = Pattern.compile( "(\\d{1,2})[ x]*SATA 3Gb/s connector", Pattern.CASE_INSENSITIVE ); private static final Pattern SATA2_FINDER_2 = Pattern.compile( "(\\d{1,2}) SATA connector", Pattern.CASE_INSENSITIVE ); private static final Pattern SATA3_FINDER = Pattern.compile( "(\\d{1,2})[ x]*SATA 6Gb/s connector", Pattern.CASE_INSENSITIVE ); /** * looks for USB2Internal ports capacity */ private static final Pattern USB2_INTERNAL_FINDER_1 = Pattern.compile( "USB 2.0\\p{Print}+?(\\d+) via the USB", Pattern.CASE_INSENSITIVE ); /** * looks for USB2Internal ports capacity */ private static final Pattern USB2_INTERNAL_FINDER_2 = Pattern.compile( "additional (\\d{1,2}) USB 2\\.0/(?:1\\.1)? ports by cables", Pattern.CASE_INSENSITIVE ); // Up to 8 USB 3.0/2.0 ports (4 on the back panel, 4 via the USB brackets /** * looks for USB2Rear ports capacity */ private static final Pattern USB2_REAR_FINDER_1 = Pattern.compile( "USB 2\\.0\\p{Print}+?\\((\\d+)(?: ports)? on the back panel", Pattern.CASE_INSENSITIVE ); /** * looks for USB2Rear ports capacity */ private static final Pattern USB2_REAR_FINDER_2 = Pattern.compile( "(\\d{1,2})USB 2\\.0(|:/1.1)? ports", Pattern.CASE_INSENSITIVE ); /** * looks for USB3Internal ports capacity */ private static final Pattern USB3_INTERNAL_FINDER = Pattern.compile( "USB 3\\.0\\p{Print}+?(\\d+) via the USB", Pattern.CASE_INSENSITIVE ); /** * looks for USB3Rear ports capacity */ private static final Pattern USB3_REAR_FINDER = Pattern.compile( "USB 3\\.0\\p{Print}+?(\\d+)(?: ports)? 
on the back panel", Pattern.CASE_INSENSITIVE ); /** * looks for USB3 ports capacity */ private static final Pattern USB3_TOTAL_FINDER = Pattern.compile( "(\\d+)[x ]*USB 3\\.0", Pattern.CASE_INSENSITIVE ); /** * looks for integrated Video */ private static final Pattern VIDEO_FINDER_1 = Pattern.compile( "GRAPHICSº (?:|On Chip|Integrated|)" + "*[\\( ]*(\\p{Alnum}+)", Pattern.CASE_INSENSITIVE ); /** * looks for integrated Video */ private static final Pattern VIDEO_FINDER_2 = Pattern.compile( "(H55|H57|H61|H67|945GC) Express", Pattern.CASE_INSENSITIVE ); /** * looks for integrated Video */ private static final Pattern VIDEO_FINDER_3 = Pattern.compile( "(630a|690G|740G|760G|780G|785G|790GX|880G|890GX|GeForce 8100|GeForce 8200|nForce 750a|AMD Radeon HD 6310)", Pattern.CASE_INSENSITIVE ); /** * looks for integrated Video */ private static final Pattern VIDEO_FINDER_4 = Pattern.compile( "(Multi-Graphics Technology)", Pattern.CASE_INSENSITIVE ); /** * looks for integrated Video */ private static final Pattern VIDEO_VALIDATOR = Pattern.compile( "(?!graphics (technology|slot|card))graphics|onboard graphics|video|HDMI", Pattern.CASE_INSENSITIVE ); static { configuration = new ConfigurationForMindprod(); // combine dirsWithMacros and dirsWithIncludes into dirsToProcess; sourceDir = new File( configuration.getSourceDirWithSlashes() ); } /** * collect all motherboard specs from the Gigabyte website * * @throws java.io.IOException if urls malformed or I/O trouble * @throws java.sql.SQLException if cannot write to database */ public static void fetchAllMBs() throws IOException, SQLException { manufacturer = Manufacturer.GIGABYTE; // no parms needed conn = connect(); for ( int i = 1; i < INDEX_URLS.length; i += 2 ) { final String indexURL = INDEX_URLS[ i ]; final Get get = new Get(); final String htmlListOfGigabyteMBs = get.send( new URL( indexURL ), Get.UTF8 ); final int responseCode = get.getResponseCode(); if ( responseCode >= 300 || htmlListOfGigabyteMBs == null ) { 
err.println( "Could not fetch a master index Gigabyte page " + indexURL ); System.exit( 1 ); } final Matcher m = MB_FINDER.matcher( htmlListOfGigabyteMBs ); // Matchers are used both for matching and // finding. while ( m.find() ) { assert m.groupCount() == 3 : "bug in regex"; model = m.group( 1 ); revision = m.group( 2 ); // links to individual mbs look like this // http://www.gigabyte.com/products/product-page.aspx?pid=3748#sp manufacturerPartNo = m.group( 3 ); // e.g. http://www.gigabyte.com/products/product-page.aspx?pid=2849#sp out.println( manufacturer.ordinal() + ", " + model + ", " + revision + ", " + manufacturerPartNo + "," + " false" ); // learning URL // "http://www.gigabyte.com/products/product-page.aspx?pid=" + 3748 + "#sp"; // mining URL // "http://www.gigabyte.com/products/product-page_ajax.aspx?&t=sp&pid=" + 3748 + "&dlt=&cg=2&ck=2"; final Get miningPage = new Get(); final String rawMBSpecs = miningPage.send( new URL( manufacturer.miningURL( manufacturerPartNo ) ), Get.UTF8 ); final int responseCode2 = get.getResponseCode(); if ( responseCode2 >= 300 || rawMBSpecs == null ) { err.println( "Could not fetch Gigabyte motherboard page " + model ); System.exit( 1 ); } HunkIO.writeEntireFile( new File( "E:/mb/gigabyte/" + model + ".html" ), rawMBSpecs, HunkIO.UTF8 ); } // end find loop } } /** * Put the skeleton info into the DB. 
* * @throws IOException if urls malformed or I/O trouble * @throws java.sql.SQLException if cannot write to database */ public static void skeleton() throws IOException, SQLException { manufacturer = Manufacturer.GIGABYTE; lastUpdated = BigDate.localToday(); conn = connect(); final PreparedStatement inserter = conn.prepareStatement( "INSERT INTO mboards( manufacturer, model, revision, manufacturerPartNo, lastUpdated) VALUES(?,?,?,?," + "?);" ); CSVReader r = new CSVReader( new BufferedReader( new FileReader( new File( sourceDir, "mother/gigabyte.csv" ) ) ) ); try { while ( true ) { // manufacturer, model, revision, mfr no, verified, last-update r.skip( 1 ); // mfr model = r.get(); revision = r.get(); manufacturerPartNo = r.get(); r.skipToNextLine(); // record our findings in SQL out.println( manufacturer.ordinal() + ", " + model + ", " + revision + ", " + manufacturerPartNo + "," + " false, " + lastUpdated ); inserter.setInt( 1, manufacturer.ordinal() ); inserter.setString( 2, model ); inserter.setString( 3, revision ); inserter.setString( 4, manufacturerPartNo ); inserter.setInt( 5, lastUpdated.ordinal() ); inserter.executeUpdate(); } } catch ( EOFException e ) { r.close(); conn.close(); } } /** * Put the socket info into the the DB. * * @throws IOException if urls malformed or I/O trouble * @throws java.sql.SQLException if cannot write to database */ private static void applySockets() throws IOException, SQLException { // the socket info is not not the spec page. It it depends on grouping of MBs. manufacturer = Manufacturer.GIGABYTE; lastUpdated = BigDate.localToday(); conn = connect(); final PreparedStatement updater = conn.prepareStatement( "UPDATE mboards " + "SET socket=?, lastUpdated=? " + "WHERE manufacturer=? 
AND model=?;" ); for ( int i = 0; i < INDEX_URLS.length; i += 2 ) { socket = SocketType.valueOfAlias( INDEX_URLS[ i ] ); final String indexURL = INDEX_URLS[ i + 1 ]; final Get get = new Get(); final String htmlListOfGigabyteMBs = get.send( new URL( indexURL ), Get.UTF8 ); final int responseCode = get.getResponseCode(); if ( responseCode >= 300 || htmlListOfGigabyteMBs == null ) { err.println( "Could not fetch a master index Gigabyte page " + indexURL ); System.exit( 1 ); } final Matcher m = MB_FINDER.matcher( htmlListOfGigabyteMBs ); // Matchers are used both for matching and // finding. while ( m.find() ) { assert m.groupCount() == 3 : "bug in regex"; model = m.group( 1 ); revision = m.group( 2 ); // links to individual mbs look like this // http://www.gigabyte.com/products/product-page.aspx?pid=3748#sp manufacturerPartNo = m.group( 3 ); // e.g. http://www.gigabyte.com/products/product-page.aspx?pid=2849#sp // record our findings in SQL out.println( manufacturer.ordinal() + ", " + model + ", " + revision + ", " + manufacturerPartNo + "," + " false, " + lastUpdated + ", " + socket ); updater.setInt( 1, socket.ordinal() ); updater.setInt( 2, lastUpdated.ordinal() ); updater.setInt( 3, manufacturer.ordinal() ); updater.setString( 4, model ); updater.executeUpdate(); } } conn.close(); } /** * extract specs from previously downloaded raw specs * * @throws java.sql.SQLException if cannot write to SQL * @throws java.io.IOException if cannot read mb page. */ private static void extractSpecs() throws SQLException, IOException { manufacturer = Manufacturer.GIGABYTE; lastUpdated = BigDate.localToday(); initDatabase(); final PreparedStatement socketFetcher = conn.prepareStatement( "SELECT socket FROM mboards WHERE model=? AND revision=?" 
); final File dir = new File( "E:/mb/gigabyte/" ); CSVReader r = new CSVReader( new BufferedReader( new FileReader( new File( sourceDir, "mother/gigabyte.csv" ) ) ) ); try { while ( true ) { r.skip( 1 ); // mfr model = r.get(); revision = r.get(); manufacturerPartNo = r.get(); r.skipToNextLine(); String rawMBSpecs = HunkIO.readEntireFile( new File( dir, model + ".html" ), HunkIO.UTF8 ); clearMBSpecs(); extractAudioChannels( rawMBSpecs, new Pattern[] { AUDIO_CHANNEL_FINDER } ); extractFormFactor( rawMBSpecs, FORM_FACTOR_FINDER, 1 ); extractIde( rawMBSpecs, new Pattern[] { IDE_FINDER } ); extractMemoryType( rawMBSpecs, MEMORY_TYPE_FINDER ); extractMaxGig( rawMBSpecs, MAX_GIG_FINDER ); extractRamSpeedMHz( rawMBSpecs, RAM_SPEED_MHZ_FINDER, RAM_SPEED_MHZ_SPLITTER ); extractSata2( rawMBSpecs, new Pattern[] { SATA2_FINDER_1, SATA2_FINDER_2 } ); extractSata3( rawMBSpecs, new Pattern[] { SATA3_FINDER } ); // get socket from database socketFetcher.setString( 1, model ); socketFetcher.setString( 2, revision ); socketFetcher.executeQuery(); final ResultSet rs = socketFetcher.getResultSet(); rs.next(); socket = SocketType.values()[ rs.getInt( 1 ) ]; rs.close(); extractUSB( rawMBSpecs, new Pattern[ 0 ], new Pattern[] { USB2_REAR_FINDER_1, USB2_REAR_FINDER_2 }, new Pattern[] { USB2_INTERNAL_FINDER_1, USB2_INTERNAL_FINDER_2 }, new Pattern[] { USB3_TOTAL_FINDER }, new Pattern[] { USB3_REAR_FINDER }, new Pattern[] { USB3_INTERNAL_FINDER } ); extractVideo( rawMBSpecs, new Pattern[] { VIDEO_FINDER_1, VIDEO_FINDER_2, VIDEO_FINDER_3, VIDEO_FINDER_4 } ); validateVideo( rawMBSpecs, VIDEO_VALIDATOR ); // no extractWatts since no info available. dumpExtracts(); updateMBFields(); } } catch ( EOFException e ) { out.println( incomplete + " incomplete records" ); r.close(); closeDatabase(); } } /** * handle oddities, typos on website etc. 
*
     * Applies hand-maintained corrections: fixed form factor and dimensions for the
     * GA-M750SLI-DS4, and the "Intel Atom" socket for the GA-CG330UD and GA-CG230D.
     *
     * @throws java.sql.SQLException if cannot write exceptions to database
     */
    private static void oddDucks() throws SQLException {
        manufacturer = Manufacturer.GIGABYTE;
        lastUpdated = BigDate.localToday();
        conn = connect();
        try {
            // Only constants and enum ordinals are concatenated here; no external input reaches the SQL.
            try ( Statement updater = conn.createStatement() ) {
                updater.executeUpdate( "UPDATE mboards "
                                       + "SET formFactor=" + FormFactor.ATX.ordinal()
                                       + ",widthInCm=30.5,heightInCm=22.4 "
                                       + "WHERE manufacturer=" + manufacturer.ordinal()
                                       + " AND model='GA-M750SLI-DS4'" );
            }
            try ( PreparedStatement socketUpdater = conn.prepareStatement( "UPDATE mboards "
                                                                           + "SET socket=?, lastUpdated=? "
                                                                           + "WHERE manufacturer=? AND model=?;" ) ) {
                // handle two socket exceptions.
                socket = SocketType.valueOfAlias( "Intel Atom" );
                socketUpdater.setInt( 1, socket.ordinal() );
                socketUpdater.setInt( 2, lastUpdated.ordinal() );
                socketUpdater.setInt( 3, manufacturer.ordinal() );
                socketUpdater.setString( 4, "GA-CG330UD" );
                socketUpdater.executeUpdate();
                // parameters 1-3 keep their values; only the model changes for the second board.
                socketUpdater.setString( 4, "GA-CG230D" );
                socketUpdater.executeUpdate();
            }
        } finally {
            // close even when an update throws.
            conn.close();
        }
    }

    /**
     * extract specs from Gigabyte website previously downloaded.
     * The earlier pipeline stages are kept as commented-out calls; only the spec
     * extraction and the manual corrections run.
     *
     * @param args not used
     *
     * @throws java.io.IOException   if cannot read mb pages.
     * @throws java.sql.SQLException if can't write to database
     */
    public static void main( final String[] args ) throws IOException, SQLException {
        // fetchAllMBs();
        // skeleton();
        // applySockets();
        extractSpecs();
        oddDucks();
    }
    // todo screenscrape the following fields. Export to database.
    // video
}