/*
 * [CleanENES.java]
 *
 * Summary: Tidy up English to Esperanto file exported by Ergane Accents.
 *
 * Copyright: (c) 2000-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  2.2 2008-04-06 add build to title, tidy code, fix spelling errors.
 */
package com.mindprod.esper;

import com.mindprod.common18.EIO;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

import static java.lang.System.*;

/**
 * Tidy up English to Esperanto file exported by Ergane Accents.
 * <p>
 * Cleaned manually, including é. This version collapses
 * duplicates.
 * <p>
 * Reads EN-ES.TXT producing the cleaned up erganedict.dict. Only needs to be run once.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 2.2 2008-04-06 add build to title, tidy code, fix spelling errors.
 * @since 2000
 */
public final class CleanENES {
    /**
     * Initial capacity hint for the entry list; the list grows past this if the input is larger.
     */
    private static final int EXPECTED_ENTRIES = 40000;

    /**
     * Merge the definitions of rows that share a headword into the first row of each run.
     * <p>
     * Only adjacent rows are compared, so the array must already be ordered so that equal
     * headwords sit next to each other. Rows that were merged away get both cells set to null.
     *
     * @param pairs rows of [0]=English headword, [1]=Esperanto definition; modified in place.
     */
    private static void collapseDuplicateHeadwords( String[][] pairs ) {
        // A null "previous row" marks the start; a "" sentinel would collide with an empty headword.
        String[] keeper = null;
        for ( final String[] pair : pairs ) {
            if ( keeper != null && pair[ 0 ].equals( keeper[ 0 ] ) ) {
                keeper[ 1 ] = keeper[ 1 ] + ", " + pair[ 1 ];
                pair[ 0 ] = null;
                pair[ 1 ] = null;
            } else {
                keeper = pair;
            }
        }
    }

    /**
     * Remove repeated comma-separated phrases from every surviving definition.
     *
     * @param pairs rows of [0]=English headword, [1]=Esperanto definition; modified in place.
     *              Rows whose definition is null (merged away) are left untouched.
     */
    private static void dedupeDefinitions( String[][] pairs ) {
        for ( final String[] pair : pairs ) {
            if ( pair[ 1 ] != null ) {
                pair[ 1 ] = dedupePhrases( pair[ 1 ] );
            }
        }
    }

    /**
     * Split a definition on commas, trim each phrase, drop repeats and rejoin with ", ".
     *
     * @param definition comma-separated list of phrases.
     *
     * @return the distinct phrases in order of first appearance, joined with ", ".
     */
    private static String dedupePhrases( String definition ) {
        // LinkedHashSet keeps first-occurrence order and has no fixed size limit.
        final Set<String> unique = new LinkedHashSet<>();
        final StringTokenizer st = new StringTokenizer( definition, "," );
        while ( st.hasMoreTokens() ) {
            unique.add( st.nextToken().trim() );
        }
        return String.join( ", ", unique );
    }

    /**
     * Load a dictionary laid out like this:
     * <pre>
     * ABC
     * 1. aboco
     * 2. aboco[1]
     * 3. elementoj
     * </pre>
     * A line without a '.' is taken as an English headword (trimmed, lower-cased); headwords
     * containing a space are phrases and their definitions are ignored. A line with a '.' is a
     * definition: the text after the first '.' is the Esperanto word, with a trailing
     * three-character marker such as [1] removed.
     * <p>
     * The reader is closed before returning, whether or not reading succeeds.
     *
     * @param ir where to read the English to Esperanto word list from.
     *
     * @return dictionary as array indexed [ entry number ] [ 0=English 1=Esperanto ].
     * @throws IOException if the dictionary cannot be read.
     */
    private static String[][] loadDict( Reader ir ) throws IOException {
        final List<String[]> entries = new ArrayList<>( EXPECTED_ENTRIES );
        try ( BufferedReader br = new BufferedReader( ir, 4 * 1024 ) ) {
            // current headword, or null while definitions are being skipped
            String english = null;
            String line;
            while ( ( line = br.readLine() ) != null ) {
                final int dot = line.indexOf( '.' );
                if ( dot >= 0 ) {
                    if ( english != null ) {
                        final String esperanto = stripSenseMarker( line.substring( dot + 1 ).trim() );
                        entries.add( new String[] { english, esperanto } );
                    }
                } else {
                    // fixed locale so the result does not depend on the machine's default locale
                    final String headword = line.trim().toLowerCase( Locale.ENGLISH );
                    // ignore phrases
                    english = headword.indexOf( ' ' ) >= 0 ? null : headword;
                }
            }
        }
        return entries.toArray( new String[ 0 ][] );
    } // end loadDict

    /**
     * Print to stdout every adjacent pair of headwords that is out of order by
     * String.compareTo after sorting.
     *
     * @param pairs sorted rows of [0]=English headword, [1]=Esperanto definition.
     */
    private static void reportMisordered( String[][] pairs ) {
        for ( int i = 1; i < pairs.length; i++ ) {
            final String before = pairs[ i - 1 ][ 0 ];
            final String after = pairs[ i ][ 0 ];
            if ( before.compareTo( after ) > 0 ) {
                out.println( before + " / " + after );
            }
        }
    }

    /**
     * Write the surviving rows to erganedict.dict, one headword=definition per line.
     *
     * @param pairs rows of [0]=English headword, [1]=Esperanto definition; rows with a null
     *              headword are skipped.
     *
     * @throws IOException if the file cannot be opened or the writer reports a write error.
     */
    private static void saveDict( String[][] pairs ) throws IOException {
        try ( PrintWriter pw = EIO.getPrintWriter( new File( "erganedict.dict" ), 64 * 1024, EIO.UTF8 ) ) {
            for ( final String[] pair : pairs ) {
                if ( pair[ 0 ] != null ) {
                    pw.println( pair[ 0 ] + "=" + pair[ 1 ] );
                }
            }
            // PrintWriter never throws on write failure; checkError flushes and reports it.
            if ( pw.checkError() ) {
                throw new IOException( "error writing erganedict.dict" );
            }
        }
    }

    /**
     * Remove a trailing three-character bracketed marker such as [1] from a word.
     *
     * @param word Esperanto word, possibly ending in a marker like [1].
     *
     * @return the word without the marker, or the word unchanged if it has none.
     */
    private static String stripSenseMarker( String word ) {
        final int len = word.length();
        if ( len > 3 && word.charAt( len - 3 ) == '[' && word.charAt( len - 1 ) == ']' ) {
            return word.substring( 0, len - 3 );
        }
        return word;
    }

    /**
     * Main method, converts EN-ES.TXT to tidied erganedict.dict
     *
     * @param args not used
     */
    public static void main( String[] args ) {
        try {
            // NOTE(review): FileReader decodes with the platform default charset while the
            // output is written via EIO.UTF8 -- confirm the encoding of EN-ES.TXT.
            final String[][] pairs = loadDict( new FileReader( "EN-ES.TXT" ) );
            err.println( "dictionary loaded with " + pairs.length + " words." );

            // sort alphabetically to better catch dups
            Arrays.sort( pairs, new StringArrayCompare() );

            // check for misordering
            reportMisordered( pairs );

            // collapse duplicate entries
            collapseDuplicateHeadwords( pairs );

            // collapse duplicate phrases in definitions
            dedupeDefinitions( pairs );
            err.println( "tidying complete." );

            saveDict( pairs );
            err.println( "dictionary saved." );
        } catch ( IOException e ) {
            err.println( "some problem with files." );
            // say what actually went wrong instead of discarding the cause
            err.println( e );
        }
    } // end main
}