/* * [UTF8Encoder.java] * * Summary: write/encode String into UTF-8 encoded bytes, without using Java's built-in encoders. * * Copyright: (c) 2009-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2006-02-24 */ // UTF8Encoder package com.mindprod.example; import com.mindprod.common18.EIO; import java.io.UnsupportedEncodingException; import static java.lang.System.*; /** * write/encode String into UTF-8 encoded bytes, without using Java's built-in encoders. *

* Gives an 8-bit byte array. * * @author Roedy Green, Canadian Mind Products * @version 1.0 2006-02-24 * @since 2006-02-24 */ public final class UTF8Encoder { /** * true if you want the TEST harness to ensure this code works. */ private static final boolean DEBUGGING = false; /** * byte order mark as a character. */ private static final char BOM = ( char ) 0xfeff; /** * encode a String into UTF-8 bytes. We handle only 16-bit chars. *

*

* UTF-8 is normally encoded simply with String.getBytes( "UTF-8") or with an OutputStreamWriter but this is roughly * what goes on under the hood, if you ever need to write your own encoder for some non-Java platform, or you are * just curious how it works. *

* This works for 16-bit characters only. It does not handle 32-bit characters encoded with the contortionist use of * the low (0xdc00..0xdfff) and high(0xd800..0xdbff) bands of surrogate characters. * * @param input string to encoded with UTF-8. * * @return string encoded in UTF-8 byte string. */ private static byte[] encode( String input ) { // worst case, all chars could require 3-byte encodings. byte[] output = new byte[ input.length() * 3 ]; // index output[] int j = 0; for ( int i = 0; i < input.length(); i++ ) { int c = input.charAt( i ); if ( c < 0x80 ) { // 7-bits done in one byte. output[ j++ ] = ( byte ) c; } else if ( c < 0x800 ) { // 8-11 bits done in 2 bytes output[ j++ ] = ( byte ) ( 0xC0 | c >> 6 ); output[ j++ ] = ( byte ) ( 0x80 | c & 0x3F ); } else { // 12-16 bits done in 3 bytes output[ j++ ] = ( byte ) ( 0xE0 | c >> 12 ); output[ j++ ] = ( byte ) ( 0x80 | c >> 6 & 0x3F ); output[ j++ ] = ( byte ) ( 0x80 | c & 0x3F ); } } // end for // Prune back our byte array. For efficiency we could hand item back // partly filled, which is only a minor inconvenience to the caller // most of the time to save copying the array. byte[] chopped = new byte[ j ]; System.arraycopy( output, 0, chopped, 0, j/* length */ ); return chopped; } //end encode /** * TEST harness to ensure UTF8Decoder works as advertised * * @param args not used * * @throws java.io.UnsupportedEncodingException if no support for UTF-8, not likely. */ public static void main( String[] args ) throws UnsupportedEncodingException { if ( DEBUGGING ) { String test = BOM + "Hello World" + "\u0080\u007f\u0080\u0100\u0921\u30b0\u4e70\uffff"; char[] oneOfAlmostEverything = new char[ 0xffff + 1 ]; for ( int i = 0; i <= 0xffff; i++ ) { oneOfAlmostEverything[ i ] = ( char ) i; } // avoid testing low band surrogates for ( int i = 0xdc00; i <= 0xdfff; i++ ) { oneOfAlmostEverything[ i ] = 0; } // avoid testing high band surrogates for ( int i = 0xd800; i <= 0xdbff; i++ ) { oneOfAlmostEverything[ i ] = 0; } // put one of almost every possible 16-bit Unicode character in our TEST too. test += new String( oneOfAlmostEverything ); // convert to UTF-8 with built-in Java classes. byte[] encodedByJava = test.getBytes( EIO.UTF8 ); // convert to UTF-8 with UTF8Encoder. byte[] encodedByUs = UTF8Encoder.encode( test ); boolean allOk = true; if ( encodedByUs.length != encodedByJava.length ) { out.println( "oops, different lengths" ); allOk = false; } int safe = Math.min( encodedByJava.length, encodedByUs.length ); for ( int i = 0; i < safe; i++ ) { if ( encodedByUs[ i ] != encodedByJava[ i ] ) { out.println( "oops " + encodedByJava[ i ] + "[" + Integer.toHexString( encodedByJava[ i ] ) + "] " + encodedByUs[ i ] + "[" + Integer.toHexString( encodedByUs[ i ] ) + "]" ); allOk = false; } // end if } // end for out.println( "UTF8Encoder " + ( allOk ? "worked" : "failed" ) ); } } // end main } // end UTF8Encoder