/*
 * [FindAnchors.java]
 *
 * Summary: Find DT anchors and terms in a group of documents.
 *
 * Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
 *
 * Licence: This software may be copied and used freely for any purpose but military.
 *          http://mindprod.com/contact/nonmil.html
 *
 * Requires: JDK 1.8+
 *
 * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
 *
 * Version History:
 *  1.1 2006-01-19
 *  1.2 2014-04-18 use <dt id="..."> instead of <a name="...">
*/ package com.mindprod.qf; import com.mindprod.common18.ST; import com.mindprod.common18.Twirler; import com.mindprod.entities.DeEntifyStrings; import com.mindprod.hunkio.HunkIO; import java.io.File; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Find DT anchors and terms in a group of documents. *

* Parser is none too bright. Pays no attention to dl or dd nesting. *

* Handles misc embedded HTML tags in the <dt>.. </dt>. Demands lower case tags. *

* Searches for patterns of the form: *

* <a name="SELFJOIN"></a>self join</dt> * or *

* <dt><a name="SELFJOIN">self join</a></dt> * Invoked by IndexMindprod * * @author Roedy Green, Canadian Mind Products * @version 1.2 2014-04-18 use

instead of
* @since 2006 */ final class FindAnchors { /** * pattern to match inner guts of an alias comment __alias__ */ private static final Pattern aliasPattern = Pattern.compile( "\\s*alias\\s+" ); /** * pattern to match inner guts of an alias comment __cross__J__ */ private static final Pattern crossPattern = Pattern.compile( "\\s*cross\\s+([JBG])\\s+" ); /** * pattern to match inner guts of an alias comment __macro__Image__ or generated or /generated */ private static final Pattern imagePattern = Pattern.compile( "\\s*" + "(macro\\s+Image|generated" + "|/generated|cseignore|/cseignore)\\s+" ); /** * pattern find the anchor
*/ private static final Pattern anchorPattern = Pattern.compile( "" ); /** * rotation state of the twirling animation */ private static final Twirler twirler = new Twirler(); /** * parse the big file, searching out DT anchors and terms * * @param big A string representing the entire file to scan. * @param fileBeingProcessed file we are scanning, e.g. certificate.html * @param glossaryBeingProcessed Glossary we are scanning. */ private static void parseAllDTsOnPage( final String big, final File fileBeingProcessed, final Gloss glossaryBeingProcessed ) { // prune down filename to xxx.html, without target. final String fileNameBeingProcessed = fileBeingProcessed.getName(); // do a quick look to see if there are any plausible candidates, perhaps malformed int startLookingForNextDTAt = big.indexOf( "handle
// or //
// get htmlvalidtor // HTMLValidator
final String anchor = d.group( 1 ); final int where1 = d.end(); // points just after dt id="HANDLE"> final int where2 = big.indexOf( "", where1 ); if ( where2 < 0 ) { err.println( "WARNING: missing in " + fileBeingProcessed ); return; } startLookingForNextDTAt = where2 + "".length(); // what is inside ... String term = big.substring( where1, where2 ).trim(); // there might be some ", commentStart + " inside
in " + fileBeingProcessed ); return; } // see if innards of comment have the form __alias__xxxx xxxx xxxx final String aliasCandidate = term.substring( commentStart + " final String possibleTarget = ST.firstWord( alias ); assert IndexEntry.indexEntries.size() < IndexEntry.MAX_SIZE_INDEX_ENTRIES : "IndexEntries overflow"; if ( possibleTarget.length() >= 2 && possibleTarget.charAt( 0 ) == '#' ) { // alias IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed, alias.substring( possibleTarget.length() ).trim() /* without target */, true /* this is an alias */, glossaryBeingProcessed, fileBeingProcessed, possibleTarget.substring( 1 ) /* without # */ ) ); } else { // plain countOfDTs++; IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed, alias, true/* this is an alias */, glossaryBeingProcessed, fileBeingProcessed, anchor ) ); } } else if ( imagePattern.matcher( aliasCandidate ).lookingAt() ) { // ignore } else if ( ( m = crossPattern.matcher( aliasCandidate ) ).lookingAt() ) { final Gloss indexToPlaceEntry = Gloss.valueOfLetter( m.group( 1 ).charAt( 0 ) ); assert indexToPlaceEntry != null : "bad gloss letter"; // string starts with the pattern // stuff after the alias pattern is our alias. 
final String alias = DeEntifyStrings.flattenHTML( aliasCandidate.substring( m.end() ).trim(), ' ' ).trim(); // was it an final String possibleTarget = ST.firstWord( alias ); if ( possibleTarget.length() >= 2 && possibleTarget.charAt( 0 ) == '#' ) { IndexEntry.indexEntries.add( new IndexEntry( indexToPlaceEntry /* index where entry appears */, alias.substring( possibleTarget.length() ).trim() /* human name */, true /* this is an alias */, glossaryBeingProcessed /* glossary where jumps to */, fileBeingProcessed /* file to jump to */, possibleTarget.substring( 1 ) /* target in file to jump to without lead # */ ) ); } else { /* plain alias */ IndexEntry.indexEntries.add( new IndexEntry( indexToPlaceEntry /* index where entry appears */, alias /* human name */, true /* cross counts as an alias */, glossaryBeingProcessed /* glossary where jumps to */, fileBeingProcessed /* file to jump to */, anchor /* target in file to jump to without lead # */ ) ); } } else { if ( !aliasCandidate.equals( "cseignore" ) && !aliasCandidate.equals( "/cseignore" ) ) { err.println( "WARNING: unstructured comment inside
: " + fileBeingProcessed + "\n [" + aliasCandidate + "]\n" ); // keep going processing rest of file. } } // remove that comment from the term, and to prepare to look for another. term = term.substring( 0, commentStart ) + term.substring( commentEnd + "-->".length() ); } // end while // get rid of remaining etc. term = DeEntifyStrings.flattenHTML( term, ' ' ).trim(); // capture the primary anchor/term pair. countOfDTs++; IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed, term, false/* this is not an alias */, glossaryBeingProcessed, fileBeingProcessed, anchor ) ); // we are positioned past
ready to look for more
s. } // end while if ( countOfDTs == 0 ) { err.println( "WARNING: No valid
* unsorted *

* F = filename (where anchor was found ) e.g. F mglosssixteenbit.html *

* U = anchor (associated anchor) e.g. U SIXTEENBIT *

* N = term/name (term defined in English) e.g. N 16-bit *

* is represented by & in *.raw files)
     * <p>
     * Each file name supplied by {@code s.getNextInFilename()} is resolved against
     * {@code s.getInputDirName()}, read in one piece via {@code HunkIO.readEntireFile}
     * with {@code HunkIO.UTF8}, and handed to {@code parseAllDTsOnPage}.
     * Scanning stops when the customiser returns null or returns the same name twice in a row.
     * Any failure is reported on System.err and the JVM is terminated with exit status 1.
     *
     * @param s Customiser for the glossary we are to prepare the index for
     */
    public static void findAnchorsInGlossary( AbstractGlossCustomiser s ) {
        try {
            // where to look for files with defined terms
            final String inDir = s.getInputDirName();
            // remembered so that a customiser that keeps returning the same name ends the loop
            String prevFilename = null;
            String inFilename = s.getNextInFilename();
            while ( inFilename != null && !inFilename.equals( prevFilename ) ) {
                prevFilename = inFilename;
                // e.g. certificate.html no lead E:\mindprod\jgloss
                final File fileBeingProcessed = new File( inDir, inFilename );
                // read entire html file into RAM in one fell swoop.
                final String big = HunkIO.readEntireFile( fileBeingProcessed, HunkIO.UTF8 );
                try {
                    twirler.twirl( out );
                    parseAllDTsOnPage( big, fileBeingProcessed, s.getGlossEnum() );
                } catch ( Exception e ) {
                    // give up entirely on first error,
                    // but report the exception type and stack trace: getMessage() alone
                    // may be null and hides where the failure happened.
                    err.println( "\n" + inFilename + " " + e );
                    e.printStackTrace( err );
                    System.exit( 1 );
                }
                inFilename = s.getNextInFilename();
            } // end while
        } catch ( IOException e ) {
            err.println();
            e.printStackTrace( err );
            err.println();
            System.exit( 1 );
        }
    }
}