/* * [TestAutocorrects.java] * * Summary: read MS Word autocorrects file. * * Copyright: (c) 2010-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2010-03-01 */ package com.mindprod.test; import com.mindprod.common18.EIO; import com.mindprod.common18.ST; import java.io.DataInputStream; import java.io.EOFException; import java.io.File; import java.io.IOException; import static java.lang.System.*; /** * read MS Word autocorrects file. *

* Outstanding puzzles:
* 1. What is the significance of the 0x2000 and 0x2100 spacers?
* 2. What are the bits in the mystery region used for?
* 3. How is the language encoded, if it is encoded at all?
* 4. Precisely how do all the options map onto bits?
*

*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2006-03-24
* @noinspection WeakerAccess
* @since 2006-03-24
*/
public class TestAutocorrects
    {
    /**
     * table to convert a nibble to a hex char.
     */
    static final char[] hexChar = {
            '0', '1', '2', '3', '4', '5', '6', '7',
            '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String AMERICAN_AUTOCORRECTS =
            "C:/Users/Roedy/AppData/roaming/Microsoft/Office/MSO1033.acl";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String BRITISH_AUTOCORRECTS =
            "C:/Users/Roedy/AppData/roaming/Microsoft/Office/MSO2057.acl";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String FRENCH_AUTOCORRECTS =
            "F:\\Program Files (x86)\\Microsoft Office\\Office\\1036\\mso" + ".acl";

    /**
     * number of bytes of unexplained data that follow the three header ints.
     */
    private static final int MYSTERY_LENGTH = 5;

    /**
     * largest abbreviation/expansion length, in chars, accepted as plausible.
     */
    private static final int MAX_STRING_LENGTH = 1024;

    /**
     * 16-bit value skipped like a zero spacer when looking for a length; significance unknown.
     */
    private static final int SPACER_2000 = 0x2000;

    /**
     * 16-bit value skipped like a zero spacer when looking for a length; significance unknown.
     */
    private static final int SPACER_2100 = 0x2100;

    /**
     * Dump the header and every abbreviation/expansion pair of the autocorrects file to System.out.
     *
     * @param args not used.
     *
     * @throws IOException              if the file cannot be opened, read or closed,
     *                                  or it ends inside the first three header fields.
     * @throws IllegalArgumentException if the mystery bytes are incomplete or
     *                                  a string length is outside 1..1024.
     */
    public static void main( String[] args ) throws IOException
        {
        // O P E N
        // try-with-resources closes the stream on every exit path:
        // normal end, truncated file and corrupt-file exceptions alike.
        try ( DataInputStream dis =
                      EIO.getDataInputStream( new File( "C:/temp/MSO1033.bak" ), 64 * 1024 ) )
            {
            int offset = 0;

            // 4 bytes : signature 04 01 96 00
            final int signature = dis.readInt();
            out.println( "signature: " + ST.toLZHexString( signature, 8 ) );
            offset += 4;

            // 4 bytes : option e.g. 22 c0 ef 05
            final int options = dis.readInt();
            out.println( "options: " + ST.toLZHexString( options, 8 ) );
            offset += 4;

            // 4 bytes: little endian length of file in bytes
            final int fileLength = Integer.reverseBytes( dis.readInt() );
            out.println( "file length: " + fileLength + " : " + ST.toLZHexString( fileLength, 4 ) );
            offset += 4;

            // 5 bytes mystery e.g 93 03 00 00 b9 or 9f 03 00 00 53
            final byte[] mystery = new byte[ MYSTERY_LENGTH ];
            try
                {
                // readFully, because a plain read may legally deliver fewer bytes than asked for.
                dis.readFully( mystery );
                }
            catch ( EOFException e )
                {
                throw new IllegalArgumentException( "trouble reading mystery bytes", e );
                }
            out.println( "mystery: " + toHexString( mystery ) );
            offset += MYSTERY_LENGTH;

            dumpEntries( dis, offset );
            }
        }

    /**
     * Print each abbreviation --> expansion pair, prefixed by the offset at which the entry starts.
     * Stops quietly at the end marker or at end of file.
     *
     * @param dis         stream positioned just past the header.
     * @param startOffset byte offset of the stream position, used only for display.
     *
     * @throws IOException              on a read failure other than end of file.
     * @throws IllegalArgumentException if a string length is outside 1..1024.
     */
    private static void dumpEntries( DataInputStream dis, int startOffset ) throws IOException
        {
        int offset = startOffset;
        int count = 0;
        try
            {
            while ( true )
                {
                out.print( ST.toLZHexString( offset, 4 ) );

                // bypass possibly multiple spacer 0s. usually 1.
                // Why bother with spacers? probably originally to make the file more
                // acceptable to C++ which likes 0-terminators on its strings.
                // Why variable numbers of them? unknown.
                int lena = 0;
                int zeroWords = 0;
                while ( isSpacer( lena ) )
                    {
                    // read length or spacer
                    lena = dis.readShort();
                    offset += 2;
                    if ( lena == 0 )
                        {
                        zeroWords++;
                        if ( zeroWords >= 2 )
                            {
                            // a second zero word in this spacer run marks the end of the data
                            return;
                            }
                        }
                    }

                // read abbreviation, 16-bit chars, high byte first
                if ( lena < 1 || lena > MAX_STRING_LENGTH )
                    {
                    throw new IllegalArgumentException( "corrupt file abbreviation length: " + lena );
                    }
                final char[] abbr = readChars( dis, lena );
                offset += lena * 2;

                // bypass possibly multiple spacer 0s. Usually 1.
                int lene = 0;
                while ( isSpacer( lene ) )
                    {
                    // read length or spacer
                    lene = dis.readShort();
                    offset += 2;
                    }

                // read expansion, 16-bit chars, high byte first
                if ( lene < 1 || lene > MAX_STRING_LENGTH )
                    {
                    throw new IllegalArgumentException( "corrupt file expansion length: " + lene );
                    }
                final char[] expansion = readChars( dis, lene );
                offset += lene * 2;

                count++;
                out.println( " " + count + " " + new String( abbr ) + " --> " + new String( expansion ) );
                }
            }
        catch ( EOFException ignored )
            {
            // The file ran out before an end marker was seen.
            // Deliberately treated as the end of the data; the caller closes the stream.
            }
        }

    /**
     * Is this 16-bit value one that is skipped while looking for a string length?
     *
     * @param value value read from the file.
     *
     * @return true for 0, 0x2000 and 0x2100.
     */
    private static boolean isSpacer( int value )
        {
        return value == 0 || value == SPACER_2000 || value == SPACER_2100;
        }

    /**
     * Read a fixed number of 16-bit chars from the stream.
     *
     * @param dis    stream to read from.
     * @param length number of chars to read.
     *
     * @return the chars read.
     *
     * @throws IOException if the stream fails or ends early.
     */
    private static char[] readChars( DataInputStream dis, int length ) throws IOException
        {
        final char[] chars = new char[ length ];
        for ( int i = 0; i < length; i++ )
            {
            chars[ i ] = dis.readChar();
            }
        return chars;
        }

    /**
     * Fast convert a byte array to a hex string
     * with possible leading zero.
     *
     * @param bs array of bytes to convert to string
     *
     * @return hex representation, two chars per byte.
     */
    public static String toHexString( byte[] bs )
        {
        final StringBuilder sb = new StringBuilder( bs.length * 2 );
        for ( byte b : bs )
            {
            // look up high nibble char
            sb.append( hexChar[ ( b & 0xf0 ) >>> 4 ] );
            // look up low nibble char
            sb.append( hexChar[ b & 0x0f ] );
            }
        return sb.toString();
        }
    }