/*
* [FindAnchors.java]
*
* Summary: Find DT anchors and terms in a group of documents.
*
* Copyright: (c) 2006-2017 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.8+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.1 2006-01-19
* 1.2 2014-04-18 use <dt id="..."> instead of <a name="...">
*/
package com.mindprod.qf;
import com.mindprod.common18.ST;
import com.mindprod.common18.Twirler;
import com.mindprod.entities.DeEntifyStrings;
import com.mindprod.hunkio.HunkIO;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.System.*;
/**
* Find DT anchors and terms in a group of documents.
*
* Parser is none too bright. Pays no attention to dl or dd nesting.
*
* Handles misc embedded HTML tags in the <dt>.. </dt>. Demands lower case tags.
*
* Searches for patterns of the form:
*
* <a name="SELFJOIN"></a>self join</dt>
* or
*
* <dt><a name="SELFJOIN">self join</a></dt>
* Invoked by IndexMindprod
*
* @author Roedy Green, Canadian Mind Products
* @version 1.2 2014-04-18 use {@code <dt id="...">} instead of {@code <a name="...">}
* @since 2006
*/
final class FindAnchors
{
/**
* Matches the lead-in of an alias comment, the __alias__ part of __alias__xxxx:
* optional whitespace, the word alias, then at least one whitespace character.
*/
private static final Pattern aliasPattern = Pattern.compile( "\\s*alias\\s+" );
/**
* Matches the lead-in of a cross comment, __cross__J__:
* optional whitespace, the word cross, whitespace, one of the letters J, B or G
* (captured as group 1), then whitespace.
* The parser hands the captured letter to Gloss.valueOfLetter to choose the index
* in which the entry is placed.
*/
private static final Pattern crossPattern = Pattern.compile( "\\s*cross\\s+([JBG])\\s+" );
/**
* Matches the lead-in of comments that the parser skips without indexing:
* macro Image, generated, /generated, cseignore or /cseignore, each preceded by
* optional whitespace and followed by at least one whitespace character.
*/
private static final Pattern imagePattern = Pattern.compile( "\\s*" +
"(macro\\s+Image|generated" +
"|/generated|cseignore|/cseignore)\\s+" );
/**
* Pattern to find the anchor.
* NOTE(review): in this copy the regex is the empty string, which matches at every
* position and has no capture group. The real expression appears to have been lost;
* restore it from the original source before relying on this field.
*/
private static final Pattern anchorPattern = Pattern.compile( "" );
/**
* Rotation state of the twirling animation; twirled once for each file scanned.
*/
private static final Twirler twirler = new Twirler();
/**
* parse the big file, searching out DT anchors and terms
*
* @param big A string representing the entire file to scan.
* @param fileBeingProcessed file we are scanning, e.g. certificate.html
* @param glossaryBeingProcessed Glossary we are scanning.
*/
private static void parseAllDTsOnPage( final String big, final File fileBeingProcessed, final Gloss glossaryBeingProcessed )
{
// prune down filename to xxx.html, without target.
final String fileNameBeingProcessed = fileBeingProcessed.getName();
// do a quick look to see if there are any plausible candidates, perhaps malformed
int startLookingForNextDTAt = big.indexOf( "handle
// or
//
//
// HTMLValidator
final String anchor = d.group( 1 );
final int where1 = d.end(); // points just after dt id="HANDLE">
final int where2 = big.indexOf( "", where1 );
if ( where2 < 0 )
{
err.println( "WARNING: missing in " + fileBeingProcessed );
return;
}
startLookingForNextDTAt = where2 + "".length();
// what is inside ...
String term = big.substring( where1, where2 ).trim();
// there might be some ", commentStart + " inside in " + fileBeingProcessed );
return;
}
// see if innards of comment have the form __alias__xxxx xxxx xxxx
final String aliasCandidate = term.substring( commentStart + "
final String possibleTarget = ST.firstWord( alias );
assert IndexEntry.indexEntries.size() < IndexEntry.MAX_SIZE_INDEX_ENTRIES : "IndexEntries overflow";
if ( possibleTarget.length() >= 2 && possibleTarget.charAt( 0 ) == '#' )
{
// alias
IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed,
alias.substring( possibleTarget.length() ).trim()
/* without target */,
true
/* this is an alias */,
glossaryBeingProcessed,
fileBeingProcessed,
possibleTarget.substring( 1 )
/* without # */ ) );
}
else
{
// plain
countOfDTs++;
IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed,
alias,
true/* this is an alias */,
glossaryBeingProcessed,
fileBeingProcessed,
anchor ) );
}
}
else if ( imagePattern.matcher( aliasCandidate ).lookingAt() )
{
// ignore
}
else if ( ( m = crossPattern.matcher( aliasCandidate ) ).lookingAt() )
{
final Gloss indexToPlaceEntry = Gloss.valueOfLetter( m.group( 1 ).charAt( 0 ) );
assert indexToPlaceEntry != null : "bad gloss letter";
// string starts with the pattern
// stuff after the alias pattern is our alias.
final String alias = DeEntifyStrings.flattenHTML( aliasCandidate.substring( m.end() ).trim(),
' ' ).trim();
// was it an
final String possibleTarget = ST.firstWord( alias );
if ( possibleTarget.length() >= 2 && possibleTarget.charAt( 0 ) == '#' )
{
IndexEntry.indexEntries.add( new IndexEntry( indexToPlaceEntry /* index where entry appears */,
alias.substring( possibleTarget.length() ).trim() /* human name */,
true /* this is an alias */,
glossaryBeingProcessed /* glossary where jumps to */,
fileBeingProcessed /* file to jump to */,
possibleTarget.substring( 1 ) /* target in file to jump to without lead # */
) );
}
else
{
/* plain alias */
IndexEntry.indexEntries.add( new IndexEntry( indexToPlaceEntry /* index where entry appears */,
alias /* human name */,
true /* cross counts as an alias */,
glossaryBeingProcessed /* glossary where jumps to */,
fileBeingProcessed /* file to jump to */,
anchor /* target in file to jump to without lead # */ ) );
}
}
else
{
if ( !aliasCandidate.equals( "cseignore" ) && !aliasCandidate.equals( "/cseignore" ) )
{
err.println( "WARNING: unstructured comment inside : "
+ fileBeingProcessed
+ "\n [" + aliasCandidate + "]\n" );
// keep going processing rest of file.
}
}
// remove that comment from the term, and to prepare to look for another.
term = term.substring( 0, commentStart ) + term.substring( commentEnd + "-->".length() );
} // end while
// get rid of remaining etc.
term = DeEntifyStrings.flattenHTML( term, ' ' ).trim();
// capture the primary anchor/term pair.
countOfDTs++;
IndexEntry.indexEntries.add( new IndexEntry( glossaryBeingProcessed,
term,
false/* this is not an alias */,
glossaryBeingProcessed,
fileBeingProcessed, anchor ) );
// we are positioned past ready to look for more s.
} // end while
if ( countOfDTs == 0 )
{
err.println( "WARNING: No valid
* unsorted
*
* F = filename (where anchor was found ) e.g. F mglosssixteenbit.html
*
* U = anchor (associated anchor) e.g. U SIXTEENBIT
*
* N = term/name (term defined in English) e.g. N 16-bit
*
* is represented by & in *.raw files)
*
* @param s Customiser for the glossary we are to prepare the index for
*/
/**
 * Scan every input file of one glossary for DT anchors and terms.
 *
 * File names are pulled one at a time from {@code s.getNextInFilename()}. Scanning stops when it
 * returns null or hands back the same name twice in a row. Each file is read whole as UTF-8 and
 * passed to parseAllDTsOnPage together with the glossary from {@code s.getGlossEnum()}.
 *
 * Any failure is reported on stderr and terminates the JVM with exit status 1.
 *
 * @param s Customiser for the glossary we are to prepare the index for
 */
public static void findAnchorsInGlossary( AbstractGlossCustomiser s )
{
    try
    {
        // where to look for files with defined terms
        final String inDir = s.getInputDirName();
        String prevFilename = null;
        String inFilename = s.getNextInFilename();
        // stop on null, or when the customiser hands back the same name again.
        while ( inFilename != null && !inFilename.equals( prevFilename ) )
        {
            prevFilename = inFilename;
            // e.g. certificate.html no lead E:\mindprod\jgloss
            final File fileBeingProcessed = new File( inDir, inFilename );
            // read entire html file into RAM in one fell swoop.
            final String big = HunkIO.readEntireFile( fileBeingProcessed, HunkIO.UTF8 );
            try
            {
                twirler.twirl( out );
                parseAllDTsOnPage( big, fileBeingProcessed, s.getGlossEnum() );
            }
            catch ( Exception e )
            {
                // give up entirely on first error.
                // Report the exception itself rather than e.getMessage(): the message is null for
                // many runtime exceptions, and on its own it hides the exception type.
                err.println( "\n"
                             + inFilename
                             + " "
                             + e );
                // stack trace, as the IOException handler below also prints one.
                e.printStackTrace( err );
                System.exit( 1 );
            }
            inFilename = s.getNextInFilename();
        } // end while
    }
    catch ( IOException e )
    {
        err.println();
        e.printStackTrace( err );
        err.println();
        System.exit( 1 );
    }
}
}