gate.html
Class NekoHtmlDocumentHandler

java.lang.Object
  extended by gate.html.NekoHtmlDocumentHandler
All Implemented Interfaces:
org.apache.xerces.xni.parser.XMLErrorHandler, org.apache.xerces.xni.XMLDocumentHandler

public class NekoHtmlDocumentHandler
extends Object
implements org.apache.xerces.xni.XMLDocumentHandler, org.apache.xerces.xni.parser.XMLErrorHandler

The XNI document handler used with NekoHTML to parse HTML documents. We use XNI rather than SAX as XNI can distinguish between empty elements (<element/>) and elements with an empty span (<element></element>), whereas SAX just treats both cases the same.


Nested Class Summary
(package private)  class NekoHtmlDocumentHandler.CustomObject
          The objects belonging to this class are used inside the stack.
 
Field Summary
protected  boolean addSpaceOnUnpack
          Initialised from the user config, stores whether to add extra space characters to separate words that would otherwise be run together, e.g. "...foo</td><td>bar...".
private  RepositioningInfo ampCodingInfo
          Keep the reference to this structure
static String AUGMENTATIONS
           
private  AnnotationSet basicAS
           
private  int charactersStartOffset
          The start offset of the current block of character content.
private  LinkedList<NekoHtmlDocumentHandler.CustomObject> colector
           
private  StringBuilder contentBuffer
          This is used to capture all data within two tags before calling the actual characters method
protected  int customObjectsId
           
private static boolean DEBUG
           
private static boolean DEBUG_CHARACTERS
           
private static boolean DEBUG_ELEMENTS
           
private static boolean DEBUG_GENERAL
           
private static boolean DEBUG_UNUSED
           
private  Document doc
           
private  int elements
           
(package private) static int ELEMENTS_RATE
           
(package private)  int ignorableTagLevels
           
private  Set<String> ignorableTags
          The HTML tag names (lower case) whose text content should be ignored completely by this handler.
private  int[] lineOffsets
          Array holding the character offset of the start of each line in the document.
protected  List<StatusListener> myStatusListeners
           
private static Comparator<Object> POSITION_INFO_COMPARATOR
          A comparator that compares two RepositioningInfo.PositionInfo records by their originalPosition values.
protected  boolean previousChunkEndedWithWS
          During parsing, keeps track of whether the previous chunk of character data ended with a whitespace character.
private  boolean readCharacterStatus
          This is a variable that shows if characters have been read
private  RepositioningInfo reposInfo
          Keep the reference to this structure
private  Stack<NekoHtmlDocumentHandler.CustomObject> stack
           
private  StringBuilder tmpDocContent
           
 
Constructor Summary
NekoHtmlDocumentHandler(Document aDocument, AnnotationSet anAnnotationSet, Set<String> ignorableTags)
          Constructor initialises all the private member data
 
Method Summary
 void addRepositioningInfo(int contentLength, int pos, int extractedPos)
          For given content the list with shrink position information is searched and on the corresponding positions the correct repositioning information is calculated and generated.
 void addStatusListener(StatusListener listener)
           
 void characters(org.apache.xerces.xni.XMLString text, org.apache.xerces.xni.Augmentations augs)
          Called when the parser encounters character or CDATA content.
 void charactersAction()
          Called when all text between two tags has been processed.
 void comment(org.apache.xerces.xni.XMLString content, org.apache.xerces.xni.Augmentations augs)
           
protected  void customizeAppearanceOfDocumentWithEndTag(String tagName)
          This method analyzes the tag tagName and adds some \n chars and spaces to the tmpDocContent. The reason is that we need to have a readable form for the final document.
protected  void customizeAppearanceOfDocumentWithStartTag(String tagName)
          This method analyzes the tag tagName and adds some \n chars and spaces to the tmpDocContent. The reason is that we need to have a readable form for the final document.
 void doctypeDecl(String arg0, String arg1, String arg2, org.apache.xerces.xni.Augmentations arg3)
           
 void emptyElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs)
          Called to signal an empty element.
 void endCDATA(org.apache.xerces.xni.Augmentations augs)
           
 void endDocument(org.apache.xerces.xni.Augmentations augs)
          Called when the parser reaches the end of the document.
 void endElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.Augmentations augs)
          Called when the parser encounters the end of an element.
 void endElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.Augmentations augs, boolean wasEmptyElement)
          Called when the parser encounters the end of an HTML element.
 void endGeneralEntity(String arg0, org.apache.xerces.xni.Augmentations arg1)
           
 void error(String domain, String key, org.apache.xerces.xni.parser.XMLParseException e)
          Non-fatal error, print the stack trace but continue processing.
 void fatalError(String domain, String key, org.apache.xerces.xni.parser.XMLParseException e)
           
protected  void fireStatusChangedEvent(String text)
           
private  long fixStartOffsetForWhitespace(long wsOffset)
          Correct for whitespace.
 RepositioningInfo getAmpCodingInfo()
          Return current RepositioningInfo object for ampersand coding.
 int getCustomObjectsId()
           
 org.apache.xerces.xni.parser.XMLDocumentSource getDocumentSource()
           
 Set<String> getIgnorableTags()
          Get the set of tag names whose content is ignored by this handler.
 RepositioningInfo getRepositioningInfo()
          Return current RepositioningInfo object
 void ignorableWhitespace(org.apache.xerces.xni.XMLString arg0, org.apache.xerces.xni.Augmentations arg1)
           
 void processingInstruction(String target, org.apache.xerces.xni.XMLString data, org.apache.xerces.xni.Augmentations augs)
           
 void removeStatusListener(StatusListener listener)
           
 void setAmpCodingInfo(RepositioningInfo info)
          Set repositioning information structure reference for ampersand coding.
 void setDocumentSource(org.apache.xerces.xni.parser.XMLDocumentSource arg0)
           
 void setIgnorableTags(Set<String> newTags)
          Set the set of tag names whose text content will be ignored.
 void setLineOffsets(int[] lineOffsets)
          Set the array of line offsets.
 void setRepositioningInfo(RepositioningInfo info)
          Set repositioning information structure reference.
 void startCDATA(org.apache.xerces.xni.Augmentations augs)
           
 void startDocument(org.apache.xerces.xni.XMLLocator arg0, String arg1, org.apache.xerces.xni.NamespaceContext arg2, org.apache.xerces.xni.Augmentations arg3)
           
 void startElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs)
          Called when the parser encounters the start of an HTML element.
 void startGeneralEntity(String arg0, org.apache.xerces.xni.XMLResourceIdentifier arg1, String arg2, org.apache.xerces.xni.Augmentations arg3)
           
 void textDecl(String arg0, String arg1, org.apache.xerces.xni.Augmentations arg2)
           
 void warning(String arg0, String arg1, org.apache.xerces.xni.parser.XMLParseException arg2)
           
 void xmlDecl(String arg0, String arg1, String arg2, org.apache.xerces.xni.Augmentations arg3)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

DEBUG

private static final boolean DEBUG
See Also:
Constant Field Values

DEBUG_GENERAL

private static final boolean DEBUG_GENERAL
See Also:
Constant Field Values

DEBUG_ELEMENTS

private static final boolean DEBUG_ELEMENTS
See Also:
Constant Field Values

DEBUG_CHARACTERS

private static final boolean DEBUG_CHARACTERS
See Also:
Constant Field Values

DEBUG_UNUSED

private static final boolean DEBUG_UNUSED
See Also:
Constant Field Values

AUGMENTATIONS

public static final String AUGMENTATIONS
See Also:
Constant Field Values

POSITION_INFO_COMPARATOR

private static final Comparator<Object> POSITION_INFO_COMPARATOR
A comparator that compares two RepositioningInfo.PositionInfo records by their originalPosition values. It also supports either or both arguments being a Long, in which case the Long value is used directly. This allows you to binarySearch for an offset rather than having to construct a PositionInfo record with the target value.


reposInfo

private RepositioningInfo reposInfo
Keep the reference to this structure


ampCodingInfo

private RepositioningInfo ampCodingInfo
Keep the reference to this structure


ignorableTags

private Set<String> ignorableTags
The HTML tag names (lower case) whose text content should be ignored completely by this handler. Typically this is just script and style tags.


ignorableTagLevels

int ignorableTagLevels

ELEMENTS_RATE

static final int ELEMENTS_RATE
See Also:
Constant Field Values

lineOffsets

private int[] lineOffsets
Array holding the character offset of the start of each line in the document.


tmpDocContent

private StringBuilder tmpDocContent

contentBuffer

private StringBuilder contentBuffer
This is used to capture all data within two tags before calling the actual characters method


readCharacterStatus

private boolean readCharacterStatus
This is a variable that shows if characters have been read


charactersStartOffset

private int charactersStartOffset
The start offset of the current block of character content.


stack

private Stack<NekoHtmlDocumentHandler.CustomObject> stack

doc

private Document doc

basicAS

private AnnotationSet basicAS

myStatusListeners

protected List<StatusListener> myStatusListeners

elements

private int elements

customObjectsId

protected int customObjectsId

colector

private LinkedList<NekoHtmlDocumentHandler.CustomObject> colector

addSpaceOnUnpack

protected boolean addSpaceOnUnpack
Initialised from the user config, stores whether to add extra space characters to separate words that would otherwise be run together, e.g. "...foo</td><td>bar...". If true, this becomes "foo bar", if false it is "foobar".


previousChunkEndedWithWS

protected boolean previousChunkEndedWithWS
During parsing, keeps track of whether the previous chunk of character data ended with a whitespace character.

Constructor Detail

NekoHtmlDocumentHandler

public NekoHtmlDocumentHandler(Document aDocument,
                               AnnotationSet anAnnotationSet,
                               Set<String> ignorableTags)
Constructor initialises all the private member data

Parameters:
aDocument - The gate document that will be processed
anAnnotationSet - The annotation set that will contain annotations resulting from the processing of the gate document
ignorableTags - HTML tag names (lower case) whose text content should be ignored by this handler.
Method Detail

setLineOffsets

public void setLineOffsets(int[] lineOffsets)
Set the array of line offsets. This array holds the starting character offset in the document of the beginning of each line of text, to allow us to convert the NekoHTML location information (line and column number) into offsets from the beginning of the document for repositioning info.


startElement

public void startElement(org.apache.xerces.xni.QName element,
                         org.apache.xerces.xni.XMLAttributes attributes,
                         org.apache.xerces.xni.Augmentations augs)
                  throws org.apache.xerces.xni.XNIException
Called when the parser encounters the start of an HTML element. Empty elements also trigger this method, followed immediately by an endElement(org.apache.xerces.xni.QName, org.apache.xerces.xni.Augmentations).

Specified by:
startElement in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

characters

public void characters(org.apache.xerces.xni.XMLString text,
                       org.apache.xerces.xni.Augmentations augs)
                throws org.apache.xerces.xni.XNIException
Called when the parser encounters character or CDATA content. Characters may be reported in more than one chunk, so we gather all contiguous chunks together and process them in one block.

Specified by:
characters in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

charactersAction

public void charactersAction()
                      throws org.apache.xerces.xni.XNIException
Called when all text between two tags has been processed.

Throws:
org.apache.xerces.xni.XNIException

endElement

public void endElement(org.apache.xerces.xni.QName element,
                       org.apache.xerces.xni.Augmentations augs)
                throws org.apache.xerces.xni.XNIException
Called when the parser encounters the end of an element.

Specified by:
endElement in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

emptyElement

public void emptyElement(org.apache.xerces.xni.QName element,
                         org.apache.xerces.xni.XMLAttributes attributes,
                         org.apache.xerces.xni.Augmentations augs)
                  throws org.apache.xerces.xni.XNIException
Called to signal an empty element. This simply synthesizes a startElement followed by an endElement event.

Specified by:
emptyElement in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

endElement

public void endElement(org.apache.xerces.xni.QName element,
                       org.apache.xerces.xni.Augmentations augs,
                       boolean wasEmptyElement)
                throws org.apache.xerces.xni.XNIException
Called when the parser encounters the end of an HTML element.

Throws:
org.apache.xerces.xni.XNIException

endDocument

public void endDocument(org.apache.xerces.xni.Augmentations augs)
                 throws org.apache.xerces.xni.XNIException
Called when the parser reaches the end of the document. Here we store the new content and construct the Original markups annotations.

Specified by:
endDocument in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

error

public void error(String domain,
                  String key,
                  org.apache.xerces.xni.parser.XMLParseException e)
Non-fatal error, print the stack trace but continue processing.

Specified by:
error in interface org.apache.xerces.xni.parser.XMLErrorHandler

fatalError

public void fatalError(String domain,
                       String key,
                       org.apache.xerces.xni.parser.XMLParseException e)
                throws org.apache.xerces.xni.XNIException
Specified by:
fatalError in interface org.apache.xerces.xni.parser.XMLErrorHandler
Throws:
org.apache.xerces.xni.XNIException

processingInstruction

public void processingInstruction(String target,
                                  org.apache.xerces.xni.XMLString data,
                                  org.apache.xerces.xni.Augmentations augs)
                           throws org.apache.xerces.xni.XNIException
Specified by:
processingInstruction in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

comment

public void comment(org.apache.xerces.xni.XMLString content,
                    org.apache.xerces.xni.Augmentations augs)
             throws org.apache.xerces.xni.XNIException
Specified by:
comment in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

startCDATA

public void startCDATA(org.apache.xerces.xni.Augmentations augs)
                throws org.apache.xerces.xni.XNIException
Specified by:
startCDATA in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

endCDATA

public void endCDATA(org.apache.xerces.xni.Augmentations augs)
              throws org.apache.xerces.xni.XNIException
Specified by:
endCDATA in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

fixStartOffsetForWhitespace

private long fixStartOffsetForWhitespace(long wsOffset)
Correct for whitespace. Given the offset of the start of a block of whitespace in the original content, this method calculates the offset of the first following non-whitespace character. If wsOffset points to the start of a run of whitespace then there will be a PositionInfo record in the ampCodingInfo that represents this run of whitespace, from which we can find the end of the run. If there is no PositionInfo record for this offset then it must point to a single whitespace character, so we simply return wsOffset+1.


addRepositioningInfo

public void addRepositioningInfo(int contentLength,
                                 int pos,
                                 int extractedPos)
For given content the list with shrink position information is searched and on the corresponding positions the correct repositioning information is calculated and generated.


customizeAppearanceOfDocumentWithStartTag

protected void customizeAppearanceOfDocumentWithStartTag(String tagName)
This method analyzes the tag tagName and adds some \n chars and spaces to the tmpDocContent. The reason is that we need to have a readable form for the final document. This method modifies the content of tmpDocContent.

Parameters:
tagName - the HTML tag encountered by the HTML parser

customizeAppearanceOfDocumentWithEndTag

protected void customizeAppearanceOfDocumentWithEndTag(String tagName)
This method analyzes the tag tagName and adds some \n chars and spaces to the tmpDocContent. The reason is that we need to have a readable form for the final document. This method modifies the content of tmpDocContent.

Parameters:
tagName - the HTML tag encountered by the HTML parser

setRepositioningInfo

public void setRepositioningInfo(RepositioningInfo info)
Set repositioning information structure reference. If you set this reference to null, information will not be collected.


getRepositioningInfo

public RepositioningInfo getRepositioningInfo()
Return current RepositioningInfo object


setAmpCodingInfo

public void setAmpCodingInfo(RepositioningInfo info)
Set repositioning information structure reference for ampersand coding. If you set this reference to null, the information will not be used.


getAmpCodingInfo

public RepositioningInfo getAmpCodingInfo()
Return current RepositioningInfo object for ampersand coding.


setIgnorableTags

public void setIgnorableTags(Set<String> newTags)
Set the set of tag names whose text content will be ignored.

Parameters:
newTags - a set of lower-case tag names

getIgnorableTags

public Set<String> getIgnorableTags()
Get the set of tag names whose content is ignored by this handler.


getCustomObjectsId

public int getCustomObjectsId()

addStatusListener

public void addStatusListener(StatusListener listener)

removeStatusListener

public void removeStatusListener(StatusListener listener)

fireStatusChangedEvent

protected void fireStatusChangedEvent(String text)

doctypeDecl

public void doctypeDecl(String arg0,
                        String arg1,
                        String arg2,
                        org.apache.xerces.xni.Augmentations arg3)
                 throws org.apache.xerces.xni.XNIException
Specified by:
doctypeDecl in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

endGeneralEntity

public void endGeneralEntity(String arg0,
                             org.apache.xerces.xni.Augmentations arg1)
                      throws org.apache.xerces.xni.XNIException
Specified by:
endGeneralEntity in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

getDocumentSource

public org.apache.xerces.xni.parser.XMLDocumentSource getDocumentSource()
Specified by:
getDocumentSource in interface org.apache.xerces.xni.XMLDocumentHandler

ignorableWhitespace

public void ignorableWhitespace(org.apache.xerces.xni.XMLString arg0,
                                org.apache.xerces.xni.Augmentations arg1)
                         throws org.apache.xerces.xni.XNIException
Specified by:
ignorableWhitespace in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

setDocumentSource

public void setDocumentSource(org.apache.xerces.xni.parser.XMLDocumentSource arg0)
Specified by:
setDocumentSource in interface org.apache.xerces.xni.XMLDocumentHandler

startDocument

public void startDocument(org.apache.xerces.xni.XMLLocator arg0,
                          String arg1,
                          org.apache.xerces.xni.NamespaceContext arg2,
                          org.apache.xerces.xni.Augmentations arg3)
                   throws org.apache.xerces.xni.XNIException
Specified by:
startDocument in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

startGeneralEntity

public void startGeneralEntity(String arg0,
                               org.apache.xerces.xni.XMLResourceIdentifier arg1,
                               String arg2,
                               org.apache.xerces.xni.Augmentations arg3)
                        throws org.apache.xerces.xni.XNIException
Specified by:
startGeneralEntity in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

textDecl

public void textDecl(String arg0,
                     String arg1,
                     org.apache.xerces.xni.Augmentations arg2)
              throws org.apache.xerces.xni.XNIException
Specified by:
textDecl in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

xmlDecl

public void xmlDecl(String arg0,
                    String arg1,
                    String arg2,
                    org.apache.xerces.xni.Augmentations arg3)
             throws org.apache.xerces.xni.XNIException
Specified by:
xmlDecl in interface org.apache.xerces.xni.XMLDocumentHandler
Throws:
org.apache.xerces.xni.XNIException

warning

public void warning(String arg0,
                    String arg1,
                    org.apache.xerces.xni.parser.XMLParseException arg2)
             throws org.apache.xerces.xni.XNIException
Specified by:
warning in interface org.apache.xerces.xni.parser.XMLErrorHandler
Throws:
org.apache.xerces.xni.XNIException