/*
* [SiteMap.java]
*
* Summary: Prepares a google SiteMap for mindprod.com.
*
* Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2006-01-04 original
* 1.1 2006-01-04 add duplicate elimination
* 1.2 2006-01-04 add txt files.
* 1.3 2006-01-05 use properties and csv files
* 1.4 2006-01-05 check frequency types for validity, check for completeness.
* 1.5 2006-03-05 reformat with IntelliJ, add Javadoc.
* 1.6 2006-03-05
* 1.7 2007-09-26 change default sizes.
* 1.8 2009-05-10 add list of excludes, and includes, new header.
* 1.9 2010-12-01 improve error messages. schema validation in header.
*/
package com.mindprod.sitemap;
import com.mindprod.common18.Build;
import com.mindprod.common18.EIO;
import com.mindprod.csv.CSVReader;
import com.mindprod.fastcat.FastCat;
import com.mindprod.filter.StartAndEndsWithFilter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Properties;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
import static java.lang.System.*;
/**
 * How often a page is expected to change.
 *
 * The constant names are the literal strings accepted in the frequency column of
 * directories.csv and files.csv (they are parsed with Frequency.valueOf), and the
 * same names are written into the generated sitemap via string concatenation,
 * so they must stay lower case.
 *
 * @author Roedy Green, Canadian Mind Products
 */
@SuppressWarnings( { "ALL" } )
enum Frequency
{
/**
 * generated content, different on every fetch
 */
always,
/**
 * changes every day
 */
daily,
/**
 * changes every hour
 */
hourly,
/**
 * changes every month
 */
monthly,
/**
 * archived, not expected to change again
 */
never,
/**
 * changes every week
 */
weekly,
/**
 * changes every year
 */
yearly
}
/**
* Prepares a google SiteMap for mindprod.com.
*
* To customise for your own use, modify sitemapconfig.properties,
* and prepare a files.csv and a directories.csv file.
*
* @author Roedy Green, Canadian Mind Products
* @version 1.9 2010-12-01 improve error messages. schema validation in header.
* @since 2006-01-04
*/
public final class SiteMap
{
/**
 * Buffer size handed to the proof-file BufferedWriter and to the GZIPOutputStream in main.
 */
private static final int BUFFSIZE = 64 * 1024;
/**
 * First year of the copyright range. Not referenced anywhere else in this class.
 */
private static final int FIRST_COPYRIGHT_YEAR = 2006;
/**
 * filename of the CSV list of directories to catalog; read by appendDirectories.
 */
private static final String DIRECTORIES_FILE = "directories.csv";
/**
 * Template for integers with thousands separators and no decimal places; used for the summary in main.
 */
private static final DecimalFormat DF0 = new DecimalFormat( "###,##0" );
/**
 * undisplayed copyright notice
 */
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String EMBEDDED_COPYRIGHT =
"Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com";
/**
 * name of the file holding the patterns of files to exclude; read by readExcludes.
 */
private static final String EXCLUDES_FILE = "excludes.csv";
/**
 * filename of the list of extra files not in one of the usual directories; read by appendFiles.
 */
private static final String EXTRA_FILES_FILE = "files.csv";
/**
 * filename of the generated compressed sitemap, written under webRoot by main.
 */
private static final String GZ_FILE = "sitemap.gz";
/**
 * name of the file holding the patterns of files to include; read by readIncludes.
 */
private static final String INCLUDES_FILE = "includes.csv";
/**
 * filename of the uncompressed version of the sitemap, written to the current directory by main.
 */
private static final String PROOF_FILE = "sitemap.proof";
/**
 * filename of the configuration properties, read from the current directory.
 */
private static final String PROPERTIES_FILE = "sitemapconfig.properties";
/**
 * embedded release date string.
 * NOTE(review): the version history dates version 1.9 as 2010-12-01; confirm that 2019 is intended.
 */
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String RELEASE_DATE = "2019-12-01";
/**
 * embedded version string.
 */
@SuppressWarnings( { "UnusedDeclaration" } )
private static final String VERSION_STRING = "1.9";
/**
 * Formats dates in ISO yyyy-MM-dd form; oneFile uses it on each file's last-modified time.
 */
private static final SimpleDateFormat sdf =
new SimpleDateFormat( "yyyy-MM-dd" );
/**
 * filter to remove files we don't want Google to index; oneDir passes it to File.list.
 * This is the default setting: files ending with these extensions.
 * The settings will normally be overridden by readIncludes and readExcludes
 * from the data in the includes and excludes files.
 */
private static final StartAndEndsWithFilter fileFilter =
new StartAndEndsWithFilter( null, new String[] { ".html", ".txt", ".htm", ".pdf" }, null,
null, null, null );
/**
 * size in bytes of the compressed GZ_FILE, measured by main after writing it.
 */
private static int compressed_bytes_written;
/**
 * track files already written so we won't redo ones already handled.
 * Created in main; oneFile adds one "dir/file" key per cataloged file.
 */
private static HashSet doneFiles;
/**
 * High side estimate of how many files we will write entries for; replaced by properties.
 * Used as the initial capacity of doneFiles and echoed in the summary.
 */
private static int estimatedFilesToCatalog = 5500;
/**
 * How big the uncompressed generated file will be. Compression is about 25 to 1. Just an estimate that
 * should be on the high side; replaced by properties.
 * Used as the initial capacity of the StringBuilder in main and echoed in the summary.
 */
private static int estimatedUncompressedFileSize = 750000;
/**
 * how many files were cataloged; incremented by oneFile for each new file.
 */
private static int filesCataloged;
/**
 * where website files are generated before being uploaded, replaced by properties
 */
private static String webRoot = Build.MINDPROD_WEBROOT;
/**
 * name of website we are preparing files for, replaced by properties
 */
private static String websiteHost = "mindprod.com";
/**
 * Private constructor: every member of this class is static, so no instance is ever created.
 */
private SiteMap()
{
}
/**
 * Generates and appends the sitemap entries for the files in every directory listed in
 * directories.csv. Each record holds three fields: directory (relative to the web root),
 * change frequency (a Frequency constant name) and priority.
 *
 * The reader is always closed, whether the file is read to the end or a record is rejected.
 *
 * @param sb where to append to.
 *
 * @throws IllegalArgumentException if a record is incomplete, names an unknown frequency,
 *                                  or the file cannot be read.
 */
private static void appendDirectories( final StringBuilder sb )
{
    try
    {
        final CSVReader r = new CSVReader( new BufferedReader( new FileReader( DIRECTORIES_FILE ) ) );
        try
        {
            while ( true )
            {
                final String dir = r.get();
                final String frequencyString = r.get();
                final String priority = r.get();
                if ( dir == null
                     || frequencyString == null
                     || priority == null )
                {
                    throw new IllegalArgumentException(
                            "incomplete record (possible blank line) in directories.csv near directory "
                            + dir
                    );
                }
                final Frequency frequency;
                try
                {
                    frequency = Frequency.valueOf( frequencyString );
                }
                catch ( IllegalArgumentException e )
                {
                    throw new IllegalArgumentException( "bad frequency "
                                                        + frequencyString
                                                        + " near directory "
                                                        + dir, e );
                }
                sb.append( oneDir( dir, frequency, priority ) );
                r.skipToNextLine();
            }
        }
        catch ( EOFException endOfFile )
        {
            // the loop above only ends this way: the reader signals end of file by exception.
        }
        finally
        {
            // close on every exit path, including rejected records.
            r.close();
        }
    }
    catch ( IOException e )
    {
        throw new IllegalArgumentException( "trouble reading "
                                            + new File( DIRECTORIES_FILE ).getAbsolutePath()
                                            + " "
                                            + e.getMessage(), e );
    }
}
/**
 * Generates and appends the sitemap entries for the individual files listed in files.csv.
 * Each record holds four fields: directory (relative to the web root), file name,
 * change frequency (a Frequency constant name) and priority.
 *
 * The reader is always closed, whether the file is read to the end or a record is rejected.
 *
 * @param sb where to append to.
 *
 * @throws IllegalArgumentException if a record is incomplete, names an unknown frequency,
 *                                  or the file cannot be read.
 */
private static void appendFiles( final StringBuilder sb )
{
    try
    {
        final CSVReader r = new CSVReader( new BufferedReader( new FileReader( EXTRA_FILES_FILE ) ) );
        try
        {
            while ( true )
            {
                final String dir = r.get();
                final String file = r.get();
                final String frequencyString = r.get();
                final String priority = r.get();
                if ( dir == null
                     || file == null
                     || frequencyString == null
                     || priority == null )
                {
                    throw new IllegalArgumentException(
                            "incomplete record (possible blank line) in files.csv near file "
                            + dir
                            + "/"
                            + file
                    );
                }
                final Frequency frequency;
                try
                {
                    frequency = Frequency.valueOf( frequencyString );
                }
                catch ( IllegalArgumentException e )
                {
                    throw new IllegalArgumentException( "bad frequency "
                                                        + frequencyString
                                                        + " near file "
                                                        + dir
                                                        + "/"
                                                        + file, e );
                }
                sb.append( oneFile( dir, file, frequency, priority ) );
                r.skipToNextLine();
            }
        }
        catch ( EOFException endOfFile )
        {
            // the loop above only ends this way: the reader signals end of file by exception.
        }
        finally
        {
            // close on every exit path, including rejected records.
            r.close();
        }
    }
    catch ( IOException e )
    {
        throw new IllegalArgumentException( "trouble reading "
                                            + new File( EXTRA_FILES_FILE ).getAbsolutePath()
                                            + " "
                                            + e.getMessage(), e );
    }
}
/**
 * Footer for the whole file: closes the {@code urlset} root element that wraps
 * all the {@code url} entries of a sitemap.
 *
 * @return xml to finish the file
 */
private static String footer()
{
    return "</urlset>\n";
}
/**
 * Reads the sitemapconfig.properties file from the current directory and sets
 * webRoot, websiteHost, estimatedFilesToCatalog and estimatedUncompressedFileSize
 * from it. A property that is absent keeps its built-in default.
 *
 * @throws IllegalArgumentException if the file is missing or unreadable, or if one of the
 *                                  two numeric properties is not an integer.
 */
private static void getSiteMapConfigProperties()
{
    final Properties props = new Properties();
    // try-with-resources closes the stream even when load fails part way.
    try ( FileInputStream fis = new FileInputStream( PROPERTIES_FILE ) )
    {
        props.load( fis );
    }
    catch ( IOException e )
    {
        throw new IllegalArgumentException( "Missing or unreadable "
                                            + new File( PROPERTIES_FILE ).getAbsolutePath(), e );
    }
    // read the four properties.
    webRoot = props.getProperty( "webRoot", Build.MINDPROD_WEBROOT );
    websiteHost = props.getProperty( "websiteHost", "mindprod.com" );
    estimatedFilesToCatalog = intProperty( props, "estimatedFilesToCatalog", "5500" );
    estimatedUncompressedFileSize = intProperty( props, "estimatedUncompressedFileSize", "750000" );
}

/**
 * Fetches one integer-valued property, naming the offending key when it is not numeric.
 *
 * @param props        loaded properties.
 * @param key          property name.
 * @param defaultValue value to parse when the property is absent.
 *
 * @return the parsed value.
 *
 * @throws IllegalArgumentException if the value is not a valid integer.
 */
private static int intProperty( final Properties props, final String key, final String defaultValue )
{
    final String raw = props.getProperty( key, defaultValue );
    try
    {
        // Properties.load keeps trailing blanks on a value; they should not make a number invalid.
        return Integer.parseInt( raw.trim() );
    }
    catch ( NumberFormatException e )
    {
        throw new IllegalArgumentException( key + " must be numeric.", e );
    }
}
/**
 * Generates the XML header for the whole file: the XML declaration followed by the opening
 * {@code urlset} root element of the sitemaps.org 0.9 protocol, including the
 * schemaLocation attributes that let a validating parser check the file against the schema.
 * The declared encoding is UTF-8, matching the encoding main uses for the compressed file.
 *
 * @return xml to introduce the file
 */
private static String header()
{
    return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
           + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\""
           + " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
           + " xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9"
           + " http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n";
}
/**
 * Builds the sitemap text for one directory: one entry per file in it that passes fileFilter.
 * When the directory cannot be listed, a warning goes to stderr and nothing is generated.
 *
 * @param dir       directory relative to web root
 * @param frequency change frequency applied to every file in the directory
 * @param priority  relative importance 0..1 applied to every file, .5 = average
 *
 * @return concatenated entries for the directory, "" if it could not be listed
 */
private static String oneDir( String dir,
                              Frequency frequency,
                              String priority )
{
    final File folder = new File( webRoot, dir );
    // keep only the names the include/exclude filter accepts.
    final String[] names = folder.list( fileFilter );
    if ( names != null )
    {
        // roughly 200 chars per entry.
        final StringBuilder xml = new StringBuilder( names.length * 200 );
        for ( int i = 0; i < names.length; i++ )
        {
            xml.append( oneFile( dir, names[ i ], frequency, priority ) );
        }
        return xml.toString();
    }
    // File.list gives null when the path is not a listable directory.
    err.println( "Warning: directory " + EIO.getCanOrAbsPath( folder ) + " not found" );
    return "";
}
/**
 * Generates the sitemap {@code url} element for one file. Returns "" if this file has been done before.
 *
 * The element holds, in protocol order, {@code loc} (http://host/dir/file), {@code lastmod}
 * (the local file's last-modified date as yyyy-MM-dd in the default time zone),
 * {@code changefreq} and, unless it is the protocol default of 0.5, {@code priority}.
 *
 * @param dir       directory relative to web root, "" for the root itself
 * @param file      unqualified filename including .html extension.
 * @param frequency e.g. always, hourly, daily, weekly, monthly, yearly, never.
 * @param priority  relative importance 0..1, .5 = average
 *
 * @return xml for one URL, or "" for a file already cataloged
 *
 * @throws IllegalArgumentException if the file does not exist under the web root.
 */
private static String oneFile( String dir,
                               String file,
                               Frequency frequency,
                               String priority )
{
    // If we have done this file already, don't do it again.
    if ( !doneFiles.add( dir + "/" + file ) )
    {
        return "";
    }
    filesCataloged++;
    final File theFile = new File( webRoot + "/" + dir, file );
    if ( !theFile.exists() )
    {
        throw new IllegalArgumentException( "No such file: "
                                            + dir
                                            + "/"
                                            + file );
    }
    // Get the date from the file on local hard disk, formatted in the default TimeZone.
    final String dateString = sdf.format( new Date( theFile.lastModified() ) );
    // 0.5 is the protocol default, however it was spelled in the csv.
    final boolean defaultPriority = priority.equals( ".5" ) || priority.equals( "0.5" );
    return "<url>\n"
           + "<loc>http://"
           + websiteHost
           + "/"
           + dir
           + ( dir.length() > 0 ? "/" : "" )
           + file
           + "</loc>\n"
           + "<lastmod>"
           + dateString
           + "</lastmod>\n"
           + "<changefreq>"
           + frequency
           + "</changefreq>\n"
           // leave priority out if it is the default
           + ( defaultPriority
               ? ""
               : "<priority>" + priority + "</priority>\n" )
           + "</url>\n";
}
/**
 * Reads the list of patterns that exclude files from the sitemap and installs them on fileFilter.
 *
 * excludes.csv is optional. Each record holds two fields: a category and a pattern.
 * A category beginning with "start" or "end" adds the lower-cased pattern to the
 * starts-with or ends-with list; a category beginning with "regex" compiles the pattern as a
 * regular expression. The reader is closed on every exit path.
 *
 * @throws IllegalArgumentException if a record is incomplete, has an unknown category,
 *                                  or the file cannot be read.
 */
private static void readExcludes()
{
    final File excludes = new File( EXCLUDES_FILE );
    // excludes file is optional.
    if ( !excludes.exists() )
    {
        // warnings go to stderr, as the other warnings in this class do.
        err.println( "warning: no excludes.csv file. No excludes presumed." );
        return;
    }
    final ArrayList<String> starts = new ArrayList<>( 11 );
    final ArrayList<String> ends = new ArrayList<>( 11 );
    final ArrayList<Pattern> regexes = new ArrayList<>( 11 );
    try
    {
        final CSVReader r = new CSVReader( new BufferedReader( new FileReader( excludes ) ) );
        try
        {
            while ( true )
            {
                final String type = r.get();
                final String s = r.get();
                if ( type == null || s == null )
                {
                    throw new IllegalArgumentException(
                            "incomplete record (possible blank line) in excludes.csv near "
                            + type );
                }
                if ( type.startsWith( "start" ) )
                {
                    starts.add( s.toLowerCase() );
                }
                else if ( type.startsWith( "end" ) )
                {
                    ends.add( s.toLowerCase() );
                }
                else if ( type.startsWith( "regex" ) )
                {
                    regexes.add( Pattern.compile( s ) );
                }
                else
                {
                    // report the category that was rejected, not the pattern beside it.
                    throw new IllegalArgumentException( "invalid category: " + type + " It should be startsWith, " +
                                                        "endsWith or regex." );
                }
                r.skipToNextLine();
            }
        }
        catch ( EOFException endOfFile )
        {
            // the loop above only ends this way: the reader signals end of file by exception.
        }
        finally
        {
            r.close();
        }
        fileFilter.setExcludeStartsWith( starts.toArray( new String[ starts.size() ] ) );
        fileFilter.setExcludeEndsWith( ends.toArray( new String[ ends.size() ] ) );
        fileFilter.setExcludeRegexMatch( regexes.toArray( new Pattern[ regexes.size() ] ) );
    }
    catch ( IOException e )
    {
        throw new IllegalArgumentException( "trouble reading excludes.csv"
                                            + " "
                                            + e.getMessage(), e );
    }
}
/**
 * Reads the list of patterns that select files for the sitemap and installs them on fileFilter.
 *
 * includes.csv is optional; without it the filter keeps its default set. Each record holds
 * two fields: a category and a pattern. A category beginning with "start" or "end" adds the
 * lower-cased pattern to the starts-with or ends-with list; a category beginning with "regex"
 * compiles the pattern as a regular expression. The reader is closed on every exit path.
 *
 * @throws IllegalArgumentException if a record is incomplete, has an unknown category,
 *                                  or the file cannot be read.
 */
private static void readIncludes()
{
    final File includes = new File( INCLUDES_FILE );
    // includes file is optional.
    if ( !includes.exists() )
    {
        err.println( "warning: no includes.csv file, default set assumed." );
        return;
    }
    final ArrayList<String> starts = new ArrayList<>( 11 );
    final ArrayList<String> ends = new ArrayList<>( 11 );
    final ArrayList<Pattern> regexes = new ArrayList<>( 11 );
    try
    {
        final CSVReader r = new CSVReader( new BufferedReader( new FileReader( includes ) ) );
        try
        {
            while ( true )
            {
                final String type = r.get();
                final String s = r.get();
                if ( type == null || s == null )
                {
                    throw new IllegalArgumentException(
                            "incomplete record (possible blank line) in includes.csv near "
                            + type );
                }
                if ( type.startsWith( "start" ) )
                {
                    starts.add( s.toLowerCase() );
                }
                else if ( type.startsWith( "end" ) )
                {
                    ends.add( s.toLowerCase() );
                }
                else if ( type.startsWith( "regex" ) )
                {
                    regexes.add( Pattern.compile( s ) );
                }
                else
                {
                    // report the category that was rejected, not the pattern beside it.
                    throw new IllegalArgumentException( "invalid category: " + type + " It should be startsWith, " +
                                                        "endsWith or regex." );
                }
                r.skipToNextLine();
            }
        }
        catch ( EOFException endOfFile )
        {
            // the loop above only ends this way: the reader signals end of file by exception.
        }
        finally
        {
            r.close();
        }
        fileFilter.setIncludeStartsWith( starts.toArray( new String[ starts.size() ] ) );
        fileFilter.setIncludeEndsWith( ends.toArray( new String[ ends.size() ] ) );
        fileFilter.setIncludeRegexMatch( regexes.toArray( new Pattern[ regexes.size() ] ) );
    }
    catch ( IOException e )
    {
        throw new IllegalArgumentException( "trouble reading includes.csv"
                                            + " "
                                            + e.getMessage(), e );
    }
}
/**
 * Generates the Google Site Map. This is set up for mindprod.com; adjust sitemapconfig.properties
 * and the csv files to suit yourself.
 *
 * Steps: load the configuration and the include/exclude patterns, build the whole sitemap in
 * memory (individual files first, then directories), write an uncompressed proof copy to the
 * current directory and a gzipped UTF-8 copy under the web root, then print a summary.
 * On an I/O failure while writing, the stack trace is printed and the JVM exits with status 1.
 *
 * @param args not used.
 */
public static void main( String[] args )
{
    // get four configuration parameters
    getSiteMapConfigProperties();
    readIncludes();
    readExcludes();
    doneFiles = new HashSet<>( estimatedFilesToCatalog );
    final StringBuilder sb = new StringBuilder( estimatedUncompressedFileSize );
    sb.append( header() );
    // append files first so they will take priority over what directory
    // says about the file.
    appendFiles( sb ); // individual files list
    appendDirectories( sb ); // all files in specified directories.
    sb.append( footer() );
    final int uncompressedCharsWritten = sb.length();
    final String forGoogle = sb.toString();
    final File google = new File( webRoot, GZ_FILE );
    try
    {
        // write out everything we have generated.
        // proof file for us to have a look
        try ( BufferedWriter proof = new BufferedWriter( new FileWriter( PROOF_FILE ), BUFFSIZE ) )
        {
            proof.write( forGoogle );
        }
        // compressed google file; each layer is declared separately so that all of them
        // get closed even if a later constructor or the write fails.
        try ( FileOutputStream fos = new FileOutputStream( google );
              GZIPOutputStream gzos = new GZIPOutputStream( fos, BUFFSIZE );
              OutputStreamWriter eosw = new OutputStreamWriter( gzos, EIO.UTF8 ) )
        {
            eosw.write( forGoogle );
        }
        // measure only after the streams are closed, so the gzip trailer is included.
        compressed_bytes_written = ( int ) google.length();
    }
    catch ( IOException e )
    {
        err.println();
        e.printStackTrace( err );
        err.println( "Problems creating " + GZ_FILE );
        err.println();
        System.exit( 1 );
    }
    // 17 = total number of pieces appended below.
    final FastCat summary = new FastCat( 17 );
    summary.append( GZ_FILE, " successfully created for ", websiteHost, "\n" );
    summary.append( "  ", DF0.format( filesCataloged ), " files cataloged (" );
    summary.append( DF0.format( estimatedFilesToCatalog ), " estimated)\n" );
    summary.append( "  ", DF0.format( uncompressedCharsWritten ), " uncompressed chars written (" );
    summary.append( DF0.format( estimatedUncompressedFileSize ), " estimated)\n" );
    summary.append( "  ", DF0.format( compressed_bytes_written ), " compressed bytes written" );
    out.println( summary.toString() );
} // end main
}