/*
 * [CleanESEN.java]
 *
 * Summary: Tidy up Esperanto to English file exported by Ergane.
 *
 * Copyright: (c) 2000-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.2 2008-04-06 add build to title, tidy code, fix spelling errors.
 */
package com.mindprod.esper;

import com.mindprod.common18.EIO;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

import static java.lang.System.*;

/**
 * Tidy up Esperanto to English file exported by Ergane.
 * <p>
 * Accents cleaned manually, then : changed to = manually. This
 * version collapses duplicates. Not part of the online program. Only has to be run once.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 2.2 2008-04-06 add build to title, tidy code, fix spelling errors.
 * @since 2000
 */
public final class CleanESEN {
    /**
     * Name of the raw file exported by Ergane that this program reads.
     */
    private static final String INPUT_FILE = "ES-EN.TXT";

    /**
     * Name of the tidied properties-style file this program writes.
     */
    private static final String OUTPUT_FILE = "erganevortaro.dict";

    /**
     * Load a properties file, but not into a Property hashTable, into an array that does not disturb property order.
     * Returns array of string pairs. [0=left 1=right] [wordnumber] Closes the given Reader, even when reading fails.
     * Property file might look like something like this: ami=love #-30-
     * <p>
     * Only complete pairs (a non-blank left side followed by a right side) are stored. Damaged lines are reported
     * on System.err and skipped, so the returned arrays never contain null elements.
     *
     * @param ir where to read property pairs from
     *
     * @return pair of arrays {@code en -> es} or {@code es -> en}, both of the same length.
     * @throws IOException if the properties can't be read
     */
    private static String[][] loadProperties( Reader ir ) throws IOException {
        // growable lists, so the size of the dictionary is not capped.
        final List<String> left = new ArrayList<>( 20000 );
        final List<String> right = new ArrayList<>( 20000 );
        // try-with-resources closes the reader on both the normal and the exceptional path.
        try ( Reader in = ir ) {
            // we don't use Properties.load since that would scramble the order.
            final StreamTokenizer s = new StreamTokenizer( in );
            s.resetSyntax();
            s.ordinaryChars( 0, 255 );
            // treat space, alpha, numbers and most punctuation as word chars.
            // no quotes. = is token separator.
            s.slashSlashComments( false );
            s.slashStarComments( false );
            s.wordChars( ' ', '~' );
            s.commentChar( '#' );
            s.whitespaceChars( '=', '=' ); // ignore equal, just separates fields
            s.eolIsSignificant( true );
            loop:
            while ( true ) {
                s.nextToken();
                switch ( s.ttype ) {
                    case StreamTokenizer.TT_EOF:
                        break loop;

                    case 13: // carriage return
                    case StreamTokenizer.TT_EOL:
                        continue loop;

                    case StreamTokenizer.TT_WORD: {
                        final String key = s.sval.trim();
                        s.nextToken();
                        if ( s.ttype != StreamTokenizer.TT_WORD ) {
                            // key without a definition: report it, but do not store half a pair.
                            err.println( "right damage near " + left.size() + " " + s.ttype + " " + s.toString() );
                        } else if ( key.isEmpty() ) {
                            // definition without a key is useless in the output file.
                            err.println( "left damage near " + left.size() + " " + s.ttype + " " + s.toString() );
                        } else {
                            left.add( key );
                            right.add( s.sval.trim() );
                        }
                        break;
                    }

                    default:
                        // unexpected character where a key should start: report and skip it.
                        err.println( "left damage near " + left.size() + " " + s.ttype + " " + s.toString() );
                } // end switch
            } // end loop
        }
        return new String[][] {
                left.toArray( new String[ 0 ] ),
                right.toArray( new String[ 0 ] )
        };
    } // end loadProperties

    /**
     * Strip a trailing three-character [x] marker from a headword, if present, and convert it to lower case.
     *
     * @param headword left hand side of a dictionary entry, not null.
     *
     * @return the tidied headword.
     */
    private static String normaliseHeadword( String headword ) {
        final int len = headword.length();
        String result = headword;
        if ( len > 3 && headword.charAt( len - 3 ) == '[' && headword.charAt( len - 1 ) == ']' ) {
            result = headword.substring( 0, len - 3 );
        }
        // Locale.ROOT so the result does not depend on the default locale of the machine.
        return result.toLowerCase( Locale.ROOT );
    }

    /**
     * Merge runs of adjacent entries that share the same left side into the first entry of the run.
     * The definitions are joined with ", ". The absorbed entries have both sides set to null.
     *
     * @param pairs [0=left 1=right][wordnumber], modified in place.
     */
    private static void collapseDuplicateEntries( String[][] pairs ) {
        final String[] lefts = pairs[ 0 ];
        final String[] rights = pairs[ 1 ];
        // null never equals a real key, so the first entry can never be merged with itself.
        String previousLeft = null;
        int keeper = 0;
        for ( int i = 0; i < lefts.length; i++ ) {
            final String current = lefts[ i ];
            if ( current.equals( previousLeft ) ) {
                rights[ keeper ] = rights[ keeper ] + ", " + rights[ i ];
                lefts[ i ] = null;
                rights[ i ] = null;
            } else {
                previousLeft = current;
                keeper = i;
            }
        }
    }

    /**
     * Remove repeated phrases from a comma-separated definition, keeping the first occurrence of each
     * phrase and the original order.
     *
     * @param definition comma-separated phrases, not null.
     *
     * @return the distinct trimmed phrases rejoined with ", ".
     */
    private static String collapseDuplicatePhrases( String definition ) {
        // LinkedHashSet drops repeats, keeps first-seen order, and has no fixed capacity.
        final Set<String> phrases = new LinkedHashSet<>();
        final StringTokenizer s = new StringTokenizer( definition, "," );
        while ( s.hasMoreTokens() ) {
            phrases.add( s.nextToken().trim() );
        }
        return String.join( ", ", phrases );
    }

    /**
     * one shot program to clean up ES-EN.TXT to a standard properties file erganevortaro.dict.
     * Progress and problems are reported on System.err.
     *
     * @param args not used.
     */
    public static void main( String[] args ) {
        try {
            // loadProperties closes the reader it is given.
            final String[][] pairs = loadProperties( new FileReader( INPUT_FILE ) );
            final int words = pairs[ 0 ].length;
            err.println( "dictionary loaded with " + words + " words." );

            // get rid of trailing [x], convert to lower case
            for ( int i = 0; i < words; i++ ) {
                pairs[ 0 ][ i ] = normaliseHeadword( pairs[ 0 ][ i ] );
            }

            // collapse duplicate entries
            collapseDuplicateEntries( pairs );

            // collapse duplicate phrases in definitions; null marks an entry absorbed above.
            for ( int i = 0; i < words; i++ ) {
                if ( pairs[ 1 ][ i ] != null ) {
                    pairs[ 1 ][ i ] = collapseDuplicatePhrases( pairs[ 1 ][ i ] );
                }
            }
            err.println( "tidying complete." );

            try ( PrintWriter pw = EIO.getPrintWriter( new File( OUTPUT_FILE ), 64 * 1024, EIO.UTF8 ) ) {
                for ( int i = 0; i < words; i++ ) {
                    if ( pairs[ 0 ][ i ] != null ) {
                        pw.println( pairs[ 0 ][ i ] + "=" + pairs[ 1 ][ i ] );
                    }
                }
                // PrintWriter does not throw on write failure; checkError flushes and reports it.
                if ( pw.checkError() ) {
                    throw new IOException( "could not write " + OUTPUT_FILE );
                }
            }
            err.println( "dictionary saved." );
        } catch ( IOException e ) {
            err.println( "some problem with files." );
            // show what actually went wrong instead of discarding it.
            err.println( e );
        }
    } // end main
}