/* * [HashBakeOff.java] * * Summary: Test hash functions for speed and ability to avoid collisions. * * Copyright: (c) 2014-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2014-04-26 initial version */ package com.mindprod.example; import com.mindprod.filter.OnlyDirectoriesFilter; import com.mindprod.filter.OnlyFilesFilter; import sun.misc.CRC16; import java.io.File; import java.io.FilenameFilter; import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.zip.Adler32; import java.util.zip.CRC32; import static java.lang.System.*; /* prints: Time to compute 25,000,000 iterations of 32-bit String.hashCode was 0.393 seconds. There were 0 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 32-bit FNV1a32 was 5.044 seconds. There were 1 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 64-bit FNV1a64 was 4.217 seconds. There were 0 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 32-bit SunCRC32 was 6.133 seconds. There were 1 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 32-bit Adler was 6.327 seconds. There were 946 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 128-bit MD5 was 12.967 seconds. There were 0 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 160-bit SHA-1 was 23.390 seconds. There were 0 collisions in 50,000 filenames ------ Time to compute 25,000,000 iterations of 160-bit SHA was 22.690 seconds. There were 0 collisions in 50,000 filenames ------ */ /** * interface that an algorithm must provide to be benchmarked */ interface HashAlgorithm { /** * identifying info */ String algorithmName(); /** * how many bits in the raw computed hash */ int bits(); /** * It need not include code to convert to string. * Fast version, used for timing */ void computeHash( String s ); /** * compute hash, must be converted to a String for collision checking. * Slow version, used for collision checking. */ String computeHashString( String s ); } /*** * [ public | protected | private ] * static * abstract * synchronized * [ transient | volatile ] * final * native */ /** * Test hash functions for speed and ability to avoid collisions. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2014-04-26 initial version * @since 2014-04-26 */ public final class HashBakeOff { /** * true if want extra progress messages */ private static final boolean DEBUGGING = false; /** * drive to look for filenames (will be unique) */ private static final char DRIVE_TO_LOOK_FOR_FILENAMES = 'E'; /** * how many filenames to computer hashes on */ private static final int COUNT_OF_FILES_TO_COLLECT = 50_000; /** * our collection of real world filenames to use in testing hashes */ private static final ArrayList COLLECTION = new ArrayList<>( COUNT_OF_FILES_TO_COLLECT ); /** * how many times to compute the hash of each file.. */ private static final int HEATS = 500; /** * how many hashes computer per algorithm. */ private static final int TRIALS = HEATS * COUNT_OF_FILES_TO_COLLECT; /** * format for times is seconds */ private static final DecimalFormat ELAPSED_TIME_FORMAT = new DecimalFormat( "###,##0.00" ); /** * format for integer counts */ private static final DecimalFormat INT_FORMAT = new DecimalFormat( "###,##0" ); /** * filter to get just dirs */ private static final FilenameFilter SAFE_DIR_FILTER = new OnlyDirectoriesFilter(); /** * filter to get just files */ private static final FilenameFilter SAFE_FILE_FILTER = new OnlyFilesFilter(); /** * how many filenames we have collected so far. */ private static int countOfFilesCollected = 0; /** * which algorith me are computing */ private static HashAlgorithm delegate = null; /** * collect some real world filename to test */ private static void collectFilenames() { out.println( "Collecting " + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT ) + " filenames from drive " + DRIVE_TO_LOOK_FOR_FILENAMES + "..." ); if ( !collectFilesFromDirTree( new File( DRIVE_TO_LOOK_FOR_FILENAMES + ":/" ) ) ) { err.println( "failed to collect sufficient filenames from drive " + DRIVE_TO_LOOK_FOR_FILENAMES + ":. wanted: " + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT ) + " collected: " + INT_FORMAT.format( countOfFilesCollected ) ); System.exit( 1 ); } assert countOfFilesCollected == COUNT_OF_FILES_TO_COLLECT : "bug: wrong number of filenames collected"; } // /method /** * recursively scan directory tree collecting filenames. * * @param theDir dir to scan * * @return true if collected enough files * @noinspection SameParameterValue, WeakerAccess */ private static boolean collectFilesFromDirTree( File theDir ) { if ( theDir == null ) { return true; } // clean out child dirs // all immediate child subdirs of this dir, excluding files in this immediate dir String[] allDirs = theDir.list( SAFE_DIR_FILTER ); if ( allDirs != null ) { for ( String subDir : allDirs ) { if ( collectFilesFromDirTree( new File( theDir, subDir ) ) ) { return true; } } } // now collect files in this immediate dir final String[] allFilesInDirToCollect = theDir.list( SAFE_FILE_FILTER ); if ( allFilesInDirToCollect != null ) { for ( String aFile : allFilesInDirToCollect ) { if ( collectOneFilename( new File( theDir, aFile ) ) ) { return true; } } } return false; } // /method /** * returns true when we gave collected enough files * * @param c filanem to collect * * @return true if we are done */ private static boolean collectOneFilename( File c ) { if ( ++countOfFilesCollected >= COUNT_OF_FILES_TO_COLLECT ) { return true; } COLLECTION.add( c.getAbsolutePath() ); return false; } // /method /** * compute and time the hashcodes for this algorithm. */ private static void computeHashes() { if ( DEBUGGING ) { out.println( "Computing " + INT_FORMAT.format( TRIALS ) + " hashes..." ); } long start = System.nanoTime(); for ( int heat = 0; heat < HEATS; heat++ ) { for ( String filename : COLLECTION ) { delegate.computeHash( filename ); } } long stop = System.nanoTime(); // nanoseconds are billionths, not millionths of seconds. // TimeUnit.NANSECONDS.toSeconds would not give us any decimal places. final double elapsedSeconds = ( double ) ( stop - start ) / 1_000_000_000d; out.println( "Time to compute " + INT_FORMAT.format( TRIALS ) + " iterations of " + delegate.bits() + "-bit " + delegate.algorithmName() + " was " + ELAPSED_TIME_FORMAT.format( elapsedSeconds ) + " seconds." ); } // /method /** * count numbmer of collision swith this algothithm. */ private static void countCollisions() { if ( DEBUGGING ) { out.println( "Checking " + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT ) + " filenames for collisions..." ); } // be generous. This is a one-shot program. HashSet collider = new HashSet<>( COUNT_OF_FILES_TO_COLLECT * 2 ); int collisionCount = 0; for ( String filename : COLLECTION ) { String hash = delegate.computeHashString( filename ); if ( !collider.add( hash ) ) { collisionCount++; } } out.println( "There were " + INT_FORMAT.format( collisionCount ) + " collisions in " + INT_FORMAT.format( COUNT_OF_FILES_TO_COLLECT ) + " filenames" ); } // /method /** * Collect a set of 20,000 real world filenames. * Compute hash of each one of the filenames, timing it. * In a second pass, record results in a bit map/HashMap looking for collisions. * * @param args not used */ public static void main( String[] args ) { // we collect a set of real filenames. collectFilenames(); HashAlgorithm[] algorithms = { /* best first, fastest, fewest collisions */ new SunHashCode(), new FNV1a32() /* has a collision */, new FNV1a64(), new CmpCRC16(), /* has many collisions */ new SunCRC32() /* has a collision */, new AdlerHash() /* has 946 collisions */, new MessageDigestHash( "MD5" ), new MessageDigestHash( "SHA-1" ), new MessageDigestHash( "SHA" ), new MessageDigestHash( "SHA-256" ), new MessageDigestHash( "SHA-384" ), new MessageDigestHash( "SHA-512" ), new SunCRC16() /* has collisions */, new MessageDigestHash( "MD2" ), }; out.println( "The best have the lowest elpased times and the fewest collisions." ); for ( HashAlgorithm algorithm : algorithms ) { delegate = algorithm; // make globally known if ( DEBUGGING ) { out.println( "\nBenchmarking " + delegate.bits() + "-bit " + delegate.algorithmName() + "..." ); } computeHashes(); countCollisions(); out.println( "------" ); } } // /method } // end class class SunHashCode implements HashAlgorithm { /** * identifying info */ public String algorithmName() { return "String.hashCode"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 32; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { // sun has the big advantage of working directly with a string. // it does not the the conversion to bytes first. @SuppressWarnings( "UnusedAssignment" ) int dummy = s.hashCode(); } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { return Integer.toString( s.hashCode() ); } // /method } // end class class AdlerHash implements HashAlgorithm { /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * don't allocate an new Adler32 for every calculation. */ private final Adler32 digester = new Adler32(); /** * identifying info */ public String algorithmName() { return "Adler"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 32; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); digester.getValue(); } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); return Long.toString( digester.getValue() ); } // /method } class SunCRC32 implements HashAlgorithm { /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * don't allocate an new Adler32 for every calculation. */ private final CRC32 digester = new CRC32(); /** * identifying info */ public String algorithmName() { return "SunCRC32"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 32; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); digester.getValue(); } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); return Long.toString( digester.getValue() ); } // /method } class SunCRC16 implements HashAlgorithm { /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * don't allocate an new Adler32 for every calculation. */ private final CRC16 digester = new CRC16(); /** * identifying info */ public String algorithmName() { return "SunCRC16"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 16; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); for ( byte b : theTextToDigestAsBytes ) { digester.update( b ); } @SuppressWarnings( "UnusedAssignment" ) final int dummy = digester.value; } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { // Note. There is no update byte[] method. // We get result with value, not getValue. digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); for ( byte b : theTextToDigestAsBytes ) { digester.update( b ); } return Integer.toString( digester.value ); // actually a short } // /method } class FNV1a32 implements HashAlgorithm { /** * prime specified to use in 32 bit hashes */ private static final int MAGIC_PRIME = 16_777_619; /* magic 32 bit FNV_prime = 224 + 28 + 0x93 = 16,777,619 */ /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * magic 32 bit initial value for the hash = 2,166,136,261 */ private static final int MAGIC_OFFSET = ( int ) ( 2_166_136_261L & 0xffffffffL ); /* fake unsigned 32 bit */ /** * Calc 34 bit FNV1a digest of an array of bytes. * * @param theTextToDigestAsBytes byte array to compute checksum on * * @return 64-bit signed fnv1a checksum */ private static long fnv1a64( byte[] theTextToDigestAsBytes ) { // you would think a 32-bit hash would work an int at a time, not a byte at a time. long hash = MAGIC_OFFSET; for ( byte b : theTextToDigestAsBytes ) { hash ^= b & 0xff; // account for strange signedness of byte hash *= MAGIC_PRIME; } return hash; } // /method /** * identifying info */ public String algorithmName() { return "FNV1a32"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 64; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); fnv1a64( theTextToDigestAsBytes ); } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); return Long.toString( fnv1a64( theTextToDigestAsBytes ) ); } // /method } class FNV1a64 implements HashAlgorithm { /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * magic 64 bit initial value for the hash = 14,695,981,039,346,656,037, needs to be unsigned to fit in long. */ @SuppressWarnings( "NumericOverflow" ) private static final long MAGIC_OFFSET = 14_695_981_039_346_656_03L * 10 + 7; /* 14_695_981_039_346_656_037L faked unsigned */ /** * prime specified to use in 64 bit hashes * magic 64 bit FNV_prime = 240 + 28 + 0xb3 = 1,099,511,628,211 */ private static final long MAGIC_PRIME = 1_099_511_628_211L; /** * Calc 64 bit FNV1a digest of an array of bytes. * * @param theTextToDigestAsBytes byte array to compute checksum on * * @return 64-bit signed fnv1a checksum */ private static long fnv1a64( byte[] theTextToDigestAsBytes ) { // you would think a 64 bit has work a long at a time, not a byte at a time. long hash = MAGIC_OFFSET; for ( byte b : theTextToDigestAsBytes ) { hash ^= b & 0xff; // account for strange signedness of byte hash *= MAGIC_PRIME; } return hash; } // /method /** * identifying info */ public String algorithmName() { return "FNV1a64"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 64; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); fnv1a64( theTextToDigestAsBytes ); } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); return Long.toString( fnv1a64( theTextToDigestAsBytes ) ); } // /method } class MessageDigestHash implements HashAlgorithm { /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); private final String algorithmName; private final MessageDigest digester; /** * constructor * * @param algorithmName name of algorithm to use MD2, MD5, SHA-1, SHA-256, SHA-384, SHA-512 */ MessageDigestHash( String algorithmName ) { this.algorithmName = algorithmName; // dodge to init final avoidng a missing/double assignment. MessageDigest init = null; try { init = MessageDigest.getInstance( algorithmName ); } catch ( NoSuchAlgorithmException e ) { err.println( "Missing " + algorithmName + "support " ); System.exit( 2 ); } digester = init; } // end constructor /** * identifying info */ public String algorithmName() { return algorithmName; } // /method /** * how many bits in the raw computed hash */ public int bits() { return digester.getDigestLength() * 8; } // /method /** * it need not include code to convert to string. USed for timing */ public void computeHash( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); @SuppressWarnings( "UnusedAssignment" ) byte[] dummy = digester.digest(); // ignore 16-byte result } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { digester.reset(); final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); digester.update( theTextToDigestAsBytes ); // 16-byte result, for MD5, 20 bytes for SHA byte[] digest = digester.digest(); return new String( digest, UTF8 ); // bytes are gibberish, not UTF-8 } // /method } // end class class CmpCRC16 implements HashAlgorithm { /** * generator polynomial */ private static final int POLY = 0x1021;/* x16 + x12 + x5 + 1 generator polynomial */ /* 0x8408 used in European X.25 */ /** * encoding for UTF-8 */ private static final Charset UTF8 = Charset.forName( "UTF-8" ); /** * scrambler lookup table for fast computation. */ private static final int[] CRC_TABLE = new int[ 256 ]; static { // initialise scrambler table for ( int i = 0; i < 256; i++ ) { int fcs = 0; int d = i << 8; for ( int k = 0; k < 8; k++ ) { if ( ( ( fcs ^ d ) & 0x8000 ) != 0 ) { fcs = ( fcs << 1 ) ^ POLY; } else { fcs = ( fcs << 1 ); } d <<= 1; fcs &= 0xffff; } CRC_TABLE[ i ] = fcs; } } /** * Calc 16-bit CRC with CCITT method. * * @param ba byte array to compute CRC on * * @return 16-bit CRC, unsigned */ private static int crc16( byte[] ba ) { // loop, calculating CRC for each byte of the string int work = 0xffff; for ( byte b : ba ) { // xor the next data byte with the high byte of what we have so far to // look up the scrambler. // xor that with the low byte of what we have so far. // Mask back to 16 bits. work = ( CRC_TABLE[ ( b ^ ( work >>> 8 ) ) & 0xff ] ^ ( work << 8 ) ) & 0xffff; } return work; } // /method /** * identifying info */ public String algorithmName() { return "CMP CRC16"; } // /method /** * how many bits in the raw computed hash */ public int bits() { return 16; } // /method /** * it need not include code to convert to string. Used for timing */ public void computeHash( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); int dummy = crc16( theTextToDigestAsBytes ); // ignore 16-byte result } // /method /** * compute hash, must be converted to a String for collision checking */ public String computeHashString( String s ) { final byte[] theTextToDigestAsBytes = s.getBytes( UTF8 ); return Integer.toString( crc16( theTextToDigestAsBytes ) ); } // /method }