/* * [ScrapePopulation.java] * * Summary: Screenscrape population of each country from Wikipedia. * * Copyright: (c) 2015-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2015-09-28 initial release * 1.1 2016-06-23 all for direct or indirect input. */ package com.mindprod.repair; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import com.mindprod.csv.CSVWriter; import com.mindprod.entities.DeEntify; import com.mindprod.entities.Entify; import com.mindprod.http.Get; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Screenscrape population of each country from Wikipedia. * * @author Roedy Green, Canadian Mind Products * @version 1.1 2016-06-23 all for direct or indirect input. * @since 2015-09-28 */ public class ScrapePopulation { /** * when package released. * * @noinspection UnusedDeclaration */ private static final String RELEASE_DATE = "2016-06-23"; /** * EMBEDDED VERSION STRING. */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String VERSION_STRING = "1.1"; private static final int FIRST_COPYRIGHT_YEAR = 2015; /** * undisplayed copyright notice */ @SuppressWarnings( { "UnusedDeclaration" } ) private static final String EMBEDDED_COPYRIGHT = "Copyright: (c) 2015-2017 Roedy Green, Canadian Mind Products, http://mindprod.com"; private static final boolean DEBUGGING = false; /** * true probes wiki site. * false loads pre-downloaded wiki file E:\com\mindprod\repair\rawpop.html */ private static final boolean DIRECT = true; private static final String END_MARKER = ""; private static final String START_MARKER = "Source"; // search for // // 253 // Cocos (Keeling) Islands (Australia) // 550 // August 9, 2011 // 0.0000076% // 2011 census result // private static final Pattern EXTRACTOR = Pattern.compile( "\\s*" + ".+\\s*" + /* index */ "(.*)\\s*" + /* country */ "([,\\d]+)" /* pop */ ); /** * get a lot of junk out the way to simplify the regex * * @param big contents of file * * @return cleaned contents file. */ private static String preclean( String big ) { // clean out some of the trash to make regexes simpler. big = stripHead( big ); big = stripTail( big ); big = stripDt( big ); big = stripA( big ); big = stripSp( big ); big = stripImg( big ); big = stripI( big ); return big; } // Needs Globalsign sha 256 G2 ssl root cert private static String readWiki() { if ( DIRECT ) { final Get get = new Get(); get.setInstanceFollowRedirects( true ); get.setConnectTimeout( ( int ) TimeUnit.SECONDS.toMillis( 40 ) ); try { // https://en.m.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population final String result = get.send( new URL( "https://en.m.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population" ), Get.UTF8 ); if ( result == null ) { err.println( "read of population data in wikipedia failed" ); System.exit( 2 ); } return result; } catch ( MalformedURLException e ) { err.println( ">>> bad url" ); System.exit( 2 ); return ""; } } else { try { return HunkIO.readEntireFile( new File( "E:/com/mindprod/repair/rawpop.html" ), HunkIO.UTF8 ); } catch ( IOException e ) { err.println( ">>> failure to read rawpop.html" ); System.exit( 2 ); return ""; } } } /** * strip out xxx but not xxx. * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripA( String big ) { int resume = 0; // where to continue looking for ", a + " big = big.substring( 0, a ) + big.substring( a1 + 1 ); resume = a; // keep xxxx but drop int end = big.indexOf( "", resume ); if ( end < 0 ) { err.println( "missing " ); System.exit( 1 ); } big = big.substring( 0, end ) + big.substring( end + "".length() ); resume = end; if ( DEBUGGING ) { out.println( "b:" + big.substring( a, Math.min( a + 50, big.length() - 1 ) ) ); } } return big; } /** * strip out --> * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripDt( String big ) { int resume = 0; // where to continue looking for ", td + " -->
if ( DEBUGGING ) { out.println( "b:" + ST.last( big.substring( 0, td + "".length(); if ( DEBUGGING ) { out.println( "d:" + big.substring( td, Math.min( td + 50, big.length() - 1 ) ) ); } } return big; } /** * strip off lead junk ahead of table * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripHead( String big ) { int start = big.indexOf( START_MARKER ); if ( start < 0 ) { err.println( "warning: missing start marker: " + START_MARKER ); return big; } return big.substring( start ) + START_MARKER.length(); } /** * strip off * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripI( String big ) { return big.replaceAll( "|", "" ); } /** * strip out * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripImg( String big ) { int resume = 0; // where to continue looking for ", im + " big = big.substring( 0, im ) + big.substring( im1 + 1 ); resume = im; if ( DEBUGGING ) { out.println( "b:" + big.substring( im, Math.min( im + 50, big.length() - 1 ) ) ); } } return big; } /** * strip out xxx but not xxx. * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripSp( String big ) { int resume = 0; // where to continue looking for ", s + " big = big.substring( 0, s ) + big.substring( s1 + 1 ); resume = s; // keep xxxx but drop int end = big.indexOf( "", resume ); if ( end < 0 ) { err.println( "missing " ); System.exit( 1 ); } big = big.substring( 0, end ) + big.substring( end + "".length() ); resume = end; if ( DEBUGGING ) { out.println( "b:" + big.substring( s, Math.min( s + 50, big.length() - 1 ) ) ); } } return big; } /** * strip off lead junk after table * * @param big string to strip stuff out of * * @return stripped contents of file */ private static String stripTail( String big ) { int end = big.indexOf( END_MARKER ); if ( end < 0 ) { err.println( "warning: missing end marker: " + END_MARKER ); return big; } return big.substring( 0, end ); } /** * extracts populations and country, screenscraping Wikipedia. * from https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population * Result may be missing some contries, ones for which we have no population * * @param args none */ public static void main( String[] args ) throws IOException { final CSVWriter w = new CSVWriter( EIO.getPrintWriter( new File( "E:/com/mindprod/repair/pop.csv" ), 64 * 1024, EIO.UTF8 ) ); // must manually download contents of https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population to C:\temp\temp.html String big = readWiki(); // if readWiki fails, use HunkIO.readEntireFile( new File( "E:/com/mindprod/repair/popraw.html" ) ); big = preclean( big ); if ( DEBUGGING ) { out.println( "---------------" ); out.println( big ); } final Matcher m = EXTRACTOR.matcher( big ); while ( m.find() ) { String country = DeEntify.stripHTMLTags( m.group( 1 ) ); country = ST.chopLeadingString( country, " " ); country = ST.chopLeadingString( country, " " ); country = ST.chopLeadingString( country, " " ); country = ST.chopLeadingString( country, " " ); // remove [n] int place0 = country.indexOf( '[' ); if ( place0 >= 0 ) { int place1 = country.lastIndexOf( ']' ); if ( place1 >= 0 ) { country = country.substring( 0, place0 ) + country.substring( place1 + 1 ); } } // remove (nnn} int place2 = country.indexOf( '(' ); if ( place2 >= 0 ) { int place3 = country.lastIndexOf( ')' ); if ( place3 >= 0 ) { country = country.substring( 0, place2 ) + country.substring( place3 + 1 ); } } country = Entify.entifyHTML( country ); final String populationStr = ST.stripNaughtyCharacters( m.group( 2 ), "," ); final long population = Long.parseLong( populationStr ); w.put( country ); w.put( population ); w.put( country ); // put in twice so can convert to 3-letter code w.nl(); } out.println( w.getLineCount() + " country populations scraped" ); w.close(); } }