/*
* [ScrapePopulation.java]
*
* Summary: Screenscrape population of each country from Wikipedia.
*
* Copyright: (c) 2015-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2015-09-28 initial release
* 1.1 2016-06-23 allow for direct or indirect input.
*/
package com.mindprod.repair;
import com.mindprod.common18.EIO;
import com.mindprod.common18.ST;
import com.mindprod.csv.CSVWriter;
import com.mindprod.entities.DeEntify;
import com.mindprod.entities.Entify;
import com.mindprod.http.Get;
import com.mindprod.hunkio.HunkIO;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.System.*;
/**
* Screenscrape population of each country from Wikipedia.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.1 2016-06-23 allow for direct or indirect input.
* @since 2015-09-28
*/
public class ScrapePopulation
{
/**
* when package released.
*
* @noinspection UnusedDeclaration
*/
private static final String RELEASE_DATE = "2016-06-23";
/**
* EMBEDDED VERSION STRING.
*/
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String VERSION_STRING = "1.1";
/**
* first year of the copyright range, the 2015 in the notice below.
*/
private static final int FIRST_COPYRIGHT_YEAR = 2015;
/**
* undisplayed copyright notice
*/
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String EMBEDDED_COPYRIGHT =
"Copyright: (c) 2015-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";
/**
* true makes the stripping passes and main print intermediate text to System.out.
*/
private static final boolean DEBUGGING = false;
/**
* Selects where readWiki gets the page from.
* true probes wiki site.
* false loads pre-downloaded wiki file E:\com\mindprod\repair\rawpop.html
*/
private static final boolean DIRECT = true;
private static final String END_MARKER = "";
private static final String START_MARKER = "
\\s*" +
".+ | \\s*" + /* index */
"(.*) | \\s*" + /* country */
"([,\\d]+) | " /* pop */ );
/**
 * Runs every stripping pass over the page, in a fixed order, so that the
 * extraction regex has much less markup to cope with.
 *
 * @param big contents of file
 *
 * @return cleaned contents of file.
 */
private static String preclean( String big )
{
    // cut the text down with the head and tail passes before the tag-by-tag passes run.
    final String trimmed = stripTail( stripHead( big ) );
    // remaining passes, innermost call first: Dt, A, Sp, Img, I.
    final String afterDtA = stripA( stripDt( trimmed ) );
    final String afterSpImg = stripImg( stripSp( afterDtA ) );
    return stripI( afterSpImg );
}
/**
 * Fetches the raw HTML of the population page, either live from Wikipedia
 * (DIRECT true) or from the previously saved copy rawpop.html (DIRECT false).
 * Any failure is reported on System.err and ends the program with exit code 2.
 * The live fetch needs the Globalsign sha 256 G2 ssl root cert.
 *
 * @return contents of the page
 */
private static String readWiki()
{
    if ( !DIRECT )
    {
        // indirect: use the copy saved on disk.
        try
        {
            return HunkIO.readEntireFile( new File( "E:/com/mindprod/repair/rawpop.html" ), HunkIO.UTF8 );
        }
        catch ( IOException e )
        {
            err.println( ">>> failure to read rawpop.html" );
            System.exit( 2 );
            return "";
        }
    }
    // direct: ask the mobile Wikipedia site for the page.
    final Get get = new Get();
    get.setInstanceFollowRedirects( true );
    get.setConnectTimeout( ( int ) TimeUnit.SECONDS.toMillis( 40 ) );
    try
    {
        final URL wiki = new URL( "https://en.m.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population" );
        final String result = get.send( wiki, Get.UTF8 );
        if ( result == null )
        {
            err.println( "read of population data in wikipedia failed" );
            System.exit( 2 );
        }
        return result;
    }
    catch ( MalformedURLException e )
    {
        err.println( ">>> bad url" );
        System.exit( 2 );
        return "";
    }
}
/**
* strip out xxx but not xxx.
*
* @param big string to strip stuff out of
*
* @return stripped contents of file
*/
private static String stripA( String big )
{
int resume = 0;
// where to continue looking for ", a + "
big = big.substring( 0, a ) + big.substring( a1 + 1 );
resume = a;
// keep xxxx but drop
int end = big.indexOf( "", resume );
if ( end < 0 )
{
err.println( "missing " );
System.exit( 1 );
}
big = big.substring( 0, end ) + big.substring( end + "".length() );
resume = end;
if ( DEBUGGING )
{
out.println( "b:" + big.substring( a, Math.min( a + 50, big.length() - 1 ) ) );
}
}
return big;
}
/**
* strip out --> |
*
* @param big string to strip stuff out of
*
* @return stripped contents of file
*/
private static String stripDt( String big )
{
int resume = 0;
// where to continue looking for | ", td + " | -->
if ( DEBUGGING )
{
out.println( "b:" + ST.last( big.substring( 0, td + "".length();
if ( DEBUGGING )
{
out.println( "d:" + big.substring( td, Math.min( td + 50, big.length() - 1 ) ) );
}
}
return big;
}
/**
 * strip off lead junk ahead of table: everything up to and including START_MARKER.
 * When the marker is absent a warning goes to System.err and the text comes back unchanged.
 *
 * @param big string to strip stuff out of
 *
 * @return the text that follows the start marker, or big itself if the marker is missing
 */
private static String stripHead( String big )
{
    final int start = big.indexOf( START_MARKER );
    if ( start < 0 )
    {
        err.println( "warning: missing start marker: " + START_MARKER );
        return big;
    }
    // the marker length is added to the offset, inside substring();
    // added to the result instead, string + int would append the digits to the page text.
    return big.substring( start + START_MARKER.length() );
}
/**
 * strip off the italic tags {@code <i>} and {@code </i>}, keeping the text between them.
 *
 * @param big string to strip stuff out of
 *
 * @return stripped contents of file
 */
private static String stripI( String big )
{
    // literal replacement; a regex consisting only of "|" is an empty alternation,
    // which matches nothing but empty strings and so removes nothing.
    return big.replace( "<i>", "" ).replace( "</i>", "" );
}
/**
* strip out
*
* @param big string to strip stuff out of
*
* @return stripped contents of file
*/
private static String stripImg( String big )
{
int resume = 0;
// where to continue looking for | ", im + "
big = big.substring( 0, im ) + big.substring( im1 + 1 );
resume = im;
if ( DEBUGGING )
{
out.println( "b:" + big.substring( im, Math.min( im + 50, big.length() - 1 ) ) );
}
}
return big;
}
/**
* strip out xxx but not xxx.
*
* @param big string to strip stuff out of
*
* @return stripped contents of file
*/
private static String stripSp( String big )
{
int resume = 0;
// where to continue looking for ", s + "
big = big.substring( 0, s ) + big.substring( s1 + 1 );
resume = s;
// keep xxxx but drop
int end = big.indexOf( "", resume );
if ( end < 0 )
{
err.println( "missing " );
System.exit( 1 );
}
big = big.substring( 0, end ) + big.substring( end + "".length() );
resume = end;
if ( DEBUGGING )
{
out.println( "b:" + big.substring( s, Math.min( s + 50, big.length() - 1 ) ) );
}
}
return big;
}
/**
 * strip off trailing junk after table: END_MARKER and everything that follows it.
 * When the marker is absent a warning goes to System.err and the text comes back unchanged.
 *
 * @param big string to strip stuff out of
 *
 * @return stripped contents of file
 */
private static String stripTail( String big )
{
    final int cut = big.indexOf( END_MARKER );
    if ( cut >= 0 )
    {
        // keep only what precedes the marker.
        return big.substring( 0, cut );
    }
    err.println( "warning: missing end marker: " + END_MARKER );
    return big;
}
/**
 * extracts populations and country, screenscraping Wikipedia.
 * from https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
 * Result may be missing some countries, ones for which we have no population.
 * Each match of EXTRACTOR becomes one row of E:/com/mindprod/repair/pop.csv:
 * country, population, country.
 *
 * @param args none
 *
 * @throws IOException declared for the CSV output; presumably raised when pop.csv cannot be opened or written
 */
public static void main( String[] args ) throws IOException
{
    // fetch and clean before opening the output, so that a failed read
    // (readWiki ends the program) does not leave a freshly truncated pop.csv behind.
    final String big = preclean( readWiki() );
    if ( DEBUGGING )
    {
        out.println( "---------------" );
        out.println( big );
    }
    final CSVWriter w = new CSVWriter( EIO.getPrintWriter( new File( "E:/com/mindprod/repair/pop.csv" ), 64 * 1024, EIO.UTF8 ) );
    try
    {
        final Matcher m = EXTRACTOR.matcher( big );
        while ( m.find() )
        {
            String country = DeEntify.stripHTMLTags( m.group( 1 ) );
            // NOTE(review): four identical calls; presumably each removes at most one leading occurrence — confirm.
            country = ST.chopLeadingString( country, " " );
            country = ST.chopLeadingString( country, " " );
            country = ST.chopLeadingString( country, " " );
            country = ST.chopLeadingString( country, " " );
            // remove [n]
            country = removeEnclosed( country, '[', ']' );
            // remove (nnn)
            country = removeEnclosed( country, '(', ')' );
            country = Entify.entifyHTML( country );
            final String populationStr = ST.stripNaughtyCharacters( m.group( 2 ), "," );
            final long population = Long.parseLong( populationStr );
            w.put( country );
            w.put( population );
            w.put( country ); // put in twice so can convert to 3-letter code
            w.nl();
        }
        out.println( w.getLineCount() + " country populations scraped" );
    }
    finally
    {
        // close even when a row fails to parse, so rows already written are not lost.
        w.close();
    }
}

/**
 * Cuts out the span from the first opener through the last closer, both included.
 * The text is returned unchanged when either character is missing or the last closer
 * comes before the first opener.
 *
 * @param text  text to clean
 * @param open  opening character, e.g. '['
 * @param close closing character, e.g. ']'
 *
 * @return text without the enclosed span
 */
private static String removeEnclosed( String text, char open, char close )
{
    final int from = text.indexOf( open );
    if ( from < 0 )
    {
        return text;
    }
    final int to = text.lastIndexOf( close );
    if ( to < from )
    {
        // covers both "no closer" (-1) and a closer ahead of the opener,
        // where cutting would duplicate the text in between.
        return text;
    }
    return text.substring( 0, from ) + text.substring( to + 1 );
}
}
| |