/*
 * [TestUTF8.java]
 *
 * Summary: Discover how Java's use of UTF-8 conforms with Unicode standards.
 *
 * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.0 2006-02-25
 */
// TestUTF8
package com.mindprod.example;

import com.mindprod.common18.EIO;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import static java.lang.System.out;

/**
 * Discover how Java's use of UTF-8 conforms with Unicode standards.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.0 2006-02-25
 * @since 2006-02-25
 */
public final class TestUTF8
    {
    /**
     * byte order mark as a character.
     */
    private static final char BOM = ( char ) 0xfeff;

    /**
     * TEST strange use to check out how Sun encodes UTF-8.
     * Deliberately includes a BOM, NUL, chars needing 1/2/3-byte encodings,
     * and 32-bit code points needing surrogate pairs internally.
     */
    private static final String TEST = new StringBuilder().append( "bom:" )
            .append( BOM )
            .append( " text:echidna" )
            .append( " x0:" )
            .append( ( char ) 0x00 )
            .append( " xa1:" )
            .append( ( char ) 0xa1 )
            .append( " x100:" )
            .append( ( char ) 0x100 )
            .append( " x0911:" )
            .append( ( char ) 0x0911 )
            .append( " xffff:" )
            .append( ( char ) 0xffff )
            .append( " || 32 bit || x10000:" )
            .appendCodePoint( 0x10000 )
            .append( " x10302:" )
            .appendCodePoint( 0x10302 )
            .append( " x1ffff:" )
            .appendCodePoint( 0x1ffff )
            .append( " x100000:" )
            .appendCodePoint( 0x100000 )
            .append( " x10ffff:" )
            .appendCodePoint( 0x10ffff )
            .toString();

    /**
     * Dump contents of a ByteBuffer in hex, one byte per line.
     * Uses absolute gets so the buffer's position is left untouched.
     *
     * @param bb ByteBuffer as raw bytes, e.g. ByteBuffer or MappedByteBuffer.
     *           Bytes from position() to limit()-1 are dumped.
     */
    private static void examine( ByteBuffer bb )
        {
        out.println( "position: " + bb.position() );
        out.println( "limit: " + bb.limit() );
        out.println( "capacity: " + bb.capacity() );
        final int limit = bb.limit();
        for ( int offset = bb.position(); offset < limit; offset++ )
            {
            int c = bb.get( offset ) & 0xff;  // want to view unsigned
            // offset, hex, decimal, char
            out.printf( "%6d > %2x : %3d : %1c%n", offset, c, c, ( char ) c );
            }
        }

    /**
     * Memory-map the given file read-only and dump its bytes in hex.
     * Shared by the tests that round-trip TEST through a temp file.
     *
     * @param file file whose raw bytes to dump.
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void dumpFile( File file ) throws IOException
        {
        try (
                FileInputStream fis = new FileInputStream( file );
                FileChannel fc = fis.getChannel()
                )
            {
            ByteBuffer encodedBuffer = fc.map( FileChannel.MapMode.READ_ONLY, 0, file.length() );
            // mapped buffer arrives ready to read; no flip needed
            examine( encodedBuffer );
            }
        }

    /**
     * Test CharBuffer/ByteBuffer style encoding/decoding.
     * Using nio methods for encoding and decoding. These are more efficient
     * because there is less hidden copying of the data than when you work
     * with String and byte[].
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testCharBuffer() throws IOException
        {
        // for byte to char; StandardCharsets avoids the Charset.forName lookup
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
        // for char to byte
        CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();
        // effectively convert char[] to byte[]
        ByteBuffer encoded = encoder.encode( CharBuffer.wrap( TEST ) );
        // effectively convert byte[] to char[]
        CharBuffer charBuffer = decoder.decode( encoded );
        String reconstitutedTest = charBuffer.toString();
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: charBuffer differs from original" );
            }
        out.println( "<><> charBuffer <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + encoded.limit()
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
        encoded.flip();  // decode consumed the buffer; prepare to read again
        examine( encoded );
        /* From the output we make the following discoveries.
           This works just like OutputStreamWriter.
           There is no BOM on the front of the file, unless you write one there.
           It does not insert or remove any BOMs.
           0 is encoded in a single byte, as per UTF standard.
           There is no length on the front of the string. */
        }

    /**
     * Test String.getBytes round trip through UTF-8.
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testGetBytes() throws IOException
        {
        byte[] encoded = TEST.getBytes( EIO.UTF8 );
        String reconstitutedTest = new String( encoded, EIO.UTF8 );
        if ( !reconstitutedTest.equals( TEST ) )
            {
            out.println( "oops: getBytes differs from original" );
            }
        out.println( "<><> getBytes <><>" );
        out.println( "String length: "
                     + TEST.length()
                     + " UTF-8 length: "
                     + encoded.length
                     + " reconstituted length: "
                     + reconstitutedTest.length() );
        ByteBuffer encodedBuffer = ByteBuffer.wrap( encoded );
        // wrap leaves position 0, limit = array length; no flip needed
        examine( encodedBuffer );
        /* From the output we make the following discoveries.
           This works just like OutputStreamWriter.
           There is no BOM on the front of the file, unless you write one there.
           It does not insert or remove any BOMs.
           0 is encoded in a single byte, as per UTF standard.
           There is no length on the front of the string. */
        }

    /**
     * Test OutputStreamWriter / InputStreamReader round trip through a temp file.
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testOutputStreamWriter() throws IOException
        {
        File tempFile = File.createTempFile( "temp_", "tmp" );
        try
            {
            // W R I T E : try-with-resources closes even on exception
            try (
                    OutputStreamWriter osw = new OutputStreamWriter(
                            new FileOutputStream( tempFile, false /* no append */ ), EIO.UTF8 )
                    )
                {
                osw.write( TEST );
                }
            // R E A D : Reader.read may return fewer chars than requested,
            // so loop until the buffer is full or EOF.
            char[] cbuf = new char[ TEST.length() ];
            int charsRead = 0;
            try (
                    InputStreamReader isr = new InputStreamReader(
                            new FileInputStream( tempFile ), EIO.UTF8 )
                    )
                {
                int n;
                while ( charsRead < cbuf.length
                        && ( n = isr.read( cbuf, charsRead, cbuf.length - charsRead ) ) >= 0 )
                    {
                    charsRead += n;
                    }
                }
            String reconstitutedTest = new String( cbuf, 0, charsRead );
            if ( !reconstitutedTest.equals( TEST ) )
                {
                out.println( "oops: InputStreamReader differs from original" );
                }
            out.println( "<><> OutputStreamWriter <><>" );
            out.println( "String length: "
                         + TEST.length()
                         + " UTF-8 length: "
                         + tempFile.length()
                         + " reconstituted length: "
                         + reconstitutedTest.length() );
            dumpFile( tempFile );
            }
        finally
            {
            if ( !tempFile.delete() )
                {
                out.println( "warning: could not delete " + tempFile );
                }
            }
        /* From this output we make these discoveries.
           There is no BOM on the front of the file, unless you write one there.
           It does not insert or remove any BOMs.
           0 is encoded in a single byte, as per UTF standard.
           There is no length on the front of the string. */
        }

    /**
     * Test DataOutputStream.writeUTF / DataInputStream.readUTF round trip
     * through a temp file.
     *
     * @throws java.io.IOException on I/O failure.
     */
    private static void testWriteUTF() throws IOException
        {
        File tempFile = File.createTempFile( "temp_", "tmp" );
        try
            {
            // W R I T E
            try (
                    DataOutputStream dos = new DataOutputStream(
                            new FileOutputStream( tempFile, false /* no append */ ) )
                    )
                {
                dos.writeUTF( TEST );
                }
            // R E A D
            String reconstitutedTest;
            try (
                    DataInputStream dis = new DataInputStream(
                            new FileInputStream( tempFile ) )
                    )
                {
                reconstitutedTest = dis.readUTF();
                }
            if ( !reconstitutedTest.equals( TEST ) )
                {
                out.println( "oops: readUTF differs from original" );
                }
            out.println( "<><> DataOutputStream.writeUTF <><>" );
            out.println( "String length: "
                         + TEST.length()
                         + " UTF-8 length: "
                         + tempFile.length()
                         + " reconstituted length: "
                         + reconstitutedTest.length() );
            dumpFile( tempFile );
            }
        finally
            {
            if ( !tempFile.delete() )
                {
                out.println( "warning: could not delete " + tempFile );
                }
            }
        /* From this output we discover these differences from true UTF-8 encoding.
           There is a signed two-byte big-endian int length on the front, that counts
           the size of the following field in bytes not chars. It does not include
           itself. This means strings are limited to a mere 32767 bytes long which is
           even fewer characters!! Somebody goofed big-time here. There should be a
           scheme to write longer Strings.
           0x00 is encoded as c0 80 instead of 00, to help C from getting confused
           reading such a file and thinking the 00 meant end-of-string.
           The biggest difference is the handling of 32 bit code points. UTF-8 codes
           them as 4-byte sequences. Sun is coding them as 6-byte sequences. e.g.
           consider the encoding of 0x10302: standard UTF-8 gives f0 90 8c 82 whereas
           under Sun's writeUTF scheme it encodes as: ed a0 80 ed bc 82.
           What is going on? Internally Sun encodes 32-bit codepoints as a surrogate
           pair of 16-bit chars, effectively using UTF-16 encoding internally. Instead
           of undoing the UTF-16 encoding before applying the UTF-8 transform, Sun
           applies it directly on the surrogate pairs. Surrogate pairs are in the
           bands 0xd800-0xdbff and 0xdc00-0xdfff. Treated as ordinary characters,
           these take 3 bytes each to encode each character separately in UTF-8.
           It does not insert or remove any BOMs. */
        }

    /**
     * Examines Java's various UTF implementations for conformance with Unicode Standards.
     *
     * @param args not used
     *
     * @throws java.io.IOException on I/O failure
     */
    public static void main( String[] args ) throws IOException
        {
        testCharBuffer();
        testGetBytes();
        testWriteUTF();
        testOutputStreamWriter();
        }
    }