0001 /*
0002 * NekoHtmlDocumentHandler.java
0003 *
0004 * Copyright (c) 2006, The University of Sheffield.
0005 *
0006 * This file is part of GATE (see http://gate.ac.uk/), and is free
0007 * software, licenced under the GNU Library General Public License,
0008 * Version 2, June 1991 (in the distribution as file licence.html,
0009 * and also available at http://gate.ac.uk/gate/licence.html).
0010 *
0011 * Ian Roberts, 17/Dec/2006
0012 *
0013 * $Id: NekoHtmlDocumentHandler.java 9795 2008-08-06 09:43:18Z ian_roberts $
0014 */
0015
0016 package gate.html;
0017
0018 import gate.Factory;
0019 import gate.FeatureMap;
0020 import gate.Gate;
0021 import gate.GateConstants;
0022 import gate.corpora.DocumentContentImpl;
0023 import gate.corpora.RepositioningInfo;
0024 import gate.event.StatusListener;
0025 import gate.util.Err;
0026 import gate.util.InvalidOffsetException;
0027 import gate.util.Out;
0028
0029 import java.util.Collections;
0030 import java.util.Comparator;
0031 import java.util.HashSet;
0032 import java.util.Iterator;
0033 import java.util.LinkedList;
0034 import java.util.List;
0035 import java.util.Set;
0036
0037 import org.apache.xerces.xni.Augmentations;
0038 import org.apache.xerces.xni.NamespaceContext;
0039 import org.apache.xerces.xni.QName;
0040 import org.apache.xerces.xni.XMLAttributes;
0041 import org.apache.xerces.xni.XMLLocator;
0042 import org.apache.xerces.xni.XMLResourceIdentifier;
0043 import org.apache.xerces.xni.XMLString;
0044 import org.apache.xerces.xni.XNIException;
0045 import org.apache.xerces.xni.parser.XMLDocumentSource;
0046 import org.apache.xerces.xni.parser.XMLParseException;
0047 import org.cyberneko.html.HTMLEventInfo;
0048
0049 /**
0050 * The XNI document handler used with NekoHTML to parse HTML documents.
0051 * We use XNI rather than SAX as XNI can distinguish between empty
0052 * elements (<element/>) and elements with an empty span
0053 * (<element></element>), whereas SAX just treats both cases
0054 * the same.
0055 */
0056 public class NekoHtmlDocumentHandler
0057 implements
0058 org.apache.xerces.xni.XMLDocumentHandler,
0059 org.apache.xerces.xni.parser.XMLErrorHandler {
0060 private static final boolean DEBUG = false;
0061
0062 private static final boolean DEBUG_GENERAL = DEBUG;
0063
0064 private static final boolean DEBUG_ELEMENTS = DEBUG;
0065
0066 private static final boolean DEBUG_CHARACTERS = DEBUG;
0067
0068 private static final boolean DEBUG_UNUSED = DEBUG;
0069
0070 public static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
0071
/**
 * Creates a handler that rebuilds the plain text content of a GATE
 * document and records its markup for later conversion to annotations.
 *
 * @param aDocument the GATE document being processed; the size of its
 *          current content is used to pre-size the plain text buffer
 * @param anAnnotationSet the annotation set that will receive the
 *          annotations created from the markup; if <code>null</code>,
 *          the original markups set of the document is fetched in
 *          {@link #endDocument}
 * @param ignorableTags HTML tag names (lower case) whose text content
 *          should be ignored by this handler; <code>null</code> is
 *          treated as an empty set
 */
public NekoHtmlDocumentHandler(gate.Document aDocument,
        gate.AnnotationSet anAnnotationSet, Set<String> ignorableTags) {
  // a missing set simply means "ignore nothing"
  this.ignorableTags = (ignorableTags != null)
          ? ignorableTags
          : new HashSet<String>();
  if(DEBUG_GENERAL) {
    Out.println("Created NekoHtmlDocumentHandler. ignorableTags = "
            + this.ignorableTags);
  }

  // the GATE document and the set that will hold the markup annotations
  doc = aDocument;
  basicAS = anAnnotationSet;

  // elements that are currently open, innermost on top
  stack = new java.util.Stack<CustomObject>();

  // closed elements waiting to be transformed into annotations
  colector = new LinkedList<CustomObject>();

  // first ID to hand out to a custom object
  customObjectsId = 0;

  // the plain text (the text without markup)
  tmpDocContent = new StringBuilder(aDocument.getContent().size().intValue());

  // honour the user's "add space on unpack" option when it has been set
  if(null != Gate.getUserConfig().get(
          GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)) {
    addSpaceOnUnpack = Gate.getUserConfig().getBoolean(
            GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)
            .booleanValue();
  }
}
0120
0121 /**
0122 * Set the array of line offsets. This array holds the starting
0123 * character offset in the document of the beginning of each line of
0124 * text, to allow us to convert the NekoHTML location information
0125 * (line and column number) into offsets from the beginning of the
0126 * document for repositioning info.
0127 */
0128 public void setLineOffsets(int[] lineOffsets) {
0129 this.lineOffsets = lineOffsets;
0130 }
0131
0132 /**
0133 * Called when the parser encounters the start of an HTML element.
0134 * Empty elements also trigger this method, followed immediately by an
0135 * {@link #endElement}.
0136 */
0137 public void startElement(QName element, XMLAttributes attributes,
0138 Augmentations augs) throws XNIException {
0139 // deal with any outstanding character content
0140 charactersAction();
0141
0142 if(DEBUG_ELEMENTS) {
0143 Out.println("startElement: " + element.localpart);
0144 }
0145 // Fire the status listener if the elements processed exceded the
0146 // rate
0147 if(0 == (++elements % ELEMENTS_RATE))
0148 fireStatusChangedEvent("Processed elements : " + elements);
0149
0150 // Start of ignorable tag
0151 if(ignorableTags.contains(element.localpart)) {
0152 ignorableTagLevels++;
0153 if(DEBUG_ELEMENTS) {
0154 Out.println(" ignorable tag: levels = " + ignorableTagLevels);
0155 }
0156 } // if
0157
0158 // Construct a feature map from the attributes list
0159 FeatureMap fm = Factory.newFeatureMap();
0160
0161 // Take all the attributes an put them into the feature map
0162 for(int i = 0; i < attributes.getLength(); i++) {
0163 if(DEBUG_ELEMENTS) {
0164 Out.println(" attribute: " + attributes.getLocalName(i) + " = "
0165 + attributes.getValue(i));
0166 }
0167 fm.put(attributes.getLocalName(i), attributes.getValue(i));
0168 }
0169
0170 // Just analize the tag and add some\n chars and spaces to the
0171 // tmpDocContent.The reason behind is that we need to have a
0172 // readable form
0173 // for the final document.
0174 customizeAppearanceOfDocumentWithStartTag(element.localpart);
0175
0176 // create the start index of the annotation
0177 Long startIndex = new Long(tmpDocContent.length());
0178
0179 // initialy the start index is equal with the End index
0180 CustomObject obj = new CustomObject(element.localpart, fm, startIndex,
0181 startIndex);
0182
0183 // put it into the stack
0184 stack.push(obj);
0185
0186 }
0187
0188 /**
0189 * Called when the parser encounters character or CDATA content.
0190 * Characters may be reported in more than one chunk, so we gather all
0191 * contiguous chunks together and process them in one block.
0192 */
0193 public void characters(XMLString text, Augmentations augs)
0194 throws XNIException {
0195 if(!readCharacterStatus) {
0196 if(reposInfo != null) {
0197 HTMLEventInfo evInfo = (augs == null) ? null : (HTMLEventInfo)augs
0198 .getItem(AUGMENTATIONS);
0199 if(evInfo == null) {
0200 Err.println("Warning: could not determine proper repositioning "
0201 + "info for character chunk \""
0202 + new String(text.ch, text.offset, text.length)
0203 + "\" near offset " + charactersStartOffset
0204 + ". Save preserving format may give incorret results.");
0205 }
0206 else {
0207 // NekoHTML numbers lines and columns from 1, not 0
0208 int line = evInfo.getBeginLineNumber() - 1;
0209 int col = evInfo.getBeginColumnNumber() - 1;
0210 charactersStartOffset = lineOffsets[line] + col;
0211 if(DEBUG_CHARACTERS) {
0212 Out.println("characters: line = " + line + " (offset " +
0213 lineOffsets[line] + "), col = " + col + " : file offset = " +
0214 charactersStartOffset);
0215 }
0216 }
0217 }
0218
0219 contentBuffer = new StringBuilder();
0220 }
0221 readCharacterStatus = true;
0222
0223 boolean canAppendWS = (contentBuffer.length() == 0 || !Character
0224 .isWhitespace(contentBuffer.charAt(contentBuffer.length() - 1)));
0225 // we must collapse
0226 // whitespace down to a single space, to mirror the normal
0227 // HtmlDocumentFormat.
0228 for(int i = text.offset; i < text.offset + text.length; ++i) {
0229 if(!Character.isWhitespace(text.ch[i])) {
0230 contentBuffer.append(text.ch[i]);
0231 canAppendWS = true;
0232 }
0233 else {
0234 if(canAppendWS) {
0235 contentBuffer.append(' ');
0236 canAppendWS = false;
0237 }
0238 }
0239 }
0240 }
0241
0242 /**
0243 * Called when all text between two tags has been processed.
0244 */
0245 public void charactersAction() throws XNIException {
0246 // check whether there are actually any characters to process
0247 if(!readCharacterStatus) {
0248 return;
0249 }
0250 readCharacterStatus = false;
0251
0252 if(DEBUG_CHARACTERS) {
0253 Out.println("charactersAction: offset = " + charactersStartOffset);
0254 }
0255
0256 if(contentBuffer.length() == 0) return;
0257
0258 // Skip ignorable tag content
0259 if(ignorableTagLevels > 0) {
0260 if(DEBUG_CHARACTERS) {
0261 Out.println(" inside ignorable tag, skipping");
0262 }
0263 return;
0264 }
0265
0266 // the number of whitespace characters trimmed off the front of this
0267 // chunk of characters
0268 boolean thisChunkStartsWithWS = Character.isWhitespace(contentBuffer.charAt(0));
0269
0270 // trim leading whitespace
0271 if(thisChunkStartsWithWS) {
0272 contentBuffer.deleteCharAt(0);
0273 }
0274
0275 if(contentBuffer.length() == 0) {
0276 if(DEBUG_CHARACTERS) {
0277 Out.println(" whitespace only: ignoring");
0278 }
0279 // if this chunk starts with whitespace and is whitespace only, then
0280 // it ended with whitespace too
0281 previousChunkEndedWithWS = thisChunkStartsWithWS;
0282 return;
0283 } // if
0284
0285 // trim trailing whitespace
0286 boolean trailingWhitespace = Character.isWhitespace(contentBuffer.charAt(contentBuffer.length() - 1));
0287 if(trailingWhitespace) {
0288 contentBuffer.setLength(contentBuffer.length() - 1);
0289 }
0290
0291 if(DEBUG_CHARACTERS) {
0292 Out.println(" content = \"" + contentBuffer + "\"");
0293 }
0294
0295 int tmpDocContentSize = tmpDocContent.length();
0296 boolean incrementStartIndex = false;
0297 // correct for whitespace. Since charactersAction never leaves
0298 // tmpDocContent with a trailing whitespace character, we may
0299 // need to add space before we append the current chunk to prevent
0300 // two chunks either side of a tag from running into one. We need
0301 // to do this if there is whitespace in the original content on
0302 // one side or other of the tag (i.e. the previous chunk ended
0303 // with space or the current chunk starts with space). Also, if
0304 // the user's "add space on markup unpack" option is true, we add
0305 // space anyway so as not to run things like
0306 // "...foo</td><td>bar..." together into "foobar".
0307 if(tmpDocContentSize != 0
0308 && !Character.isWhitespace(tmpDocContent
0309 .charAt(tmpDocContentSize - 1))
0310 && (previousChunkEndedWithWS || thisChunkStartsWithWS || addSpaceOnUnpack)) {
0311 if(DEBUG_CHARACTERS) {
0312 Out
0313 .println(String
0314 .format(
0315 " non-whitespace character %1$x (%1$c) found at end of content, adding space",
0316 (int)tmpDocContent
0317 .charAt(tmpDocContentSize - 1)));
0318 }
0319 tmpDocContent.append(' ');
0320 incrementStartIndex = true;
0321 }// End if
0322 // update the document content
0323
0324 tmpDocContent.append(contentBuffer);
0325
0326 // put the repositioning information
0327 if(reposInfo != null) {
0328 long actualStartOffset = charactersStartOffset;
0329 if(thisChunkStartsWithWS) {
0330 actualStartOffset = fixStartOffsetForWhitespace(actualStartOffset);
0331 }
0332 int extractedPos = tmpDocContentSize;
0333 if(incrementStartIndex) extractedPos++;
0334 addRepositioningInfo(contentBuffer.length(), (int)actualStartOffset,
0335 extractedPos);
0336 } // if
0337
0338 // calculate the End index for all the elements of the stack
0339 // the expression is : End index = Current doc length + text length
0340 Long end = new Long(tmpDocContent.length());
0341
0342 CustomObject obj = null;
0343 // Iterate through stack to modify the End index of the existing
0344 // elements
0345
0346 java.util.Iterator<CustomObject> anIterator = stack.iterator();
0347 while(anIterator.hasNext()) {
0348 // get the object and move to the next one
0349 obj = anIterator.next();
0350 if(incrementStartIndex && obj.getStart().equals(obj.getEnd())) {
0351 obj.setStart(new Long(obj.getStart().longValue() + 1));
0352 }// End if
0353 // sets its End index
0354 obj.setEnd(end);
0355 }// End while
0356
0357 // remember whether this chunk ended with whitespace for next time
0358 previousChunkEndedWithWS = trailingWhitespace;
0359 }
0360
0361 /**
0362 * Called when the parser encounters the end of an element.
0363 */
0364 public void endElement(QName element, Augmentations augs) throws XNIException {
0365 endElement(element, augs, false);
0366 }
0367
0368 /**
0369 * Called to signal an empty element. This simply synthesizes a
0370 * startElement followed by an endElement event.
0371 */
0372 public void emptyElement(QName element, XMLAttributes attributes,
0373 Augmentations augs) throws XNIException {
0374 this.startElement(element, attributes, augs);
0375 this.endElement(element, augs, true);
0376 }
0377
0378 /**
0379 * Called when the parser encounters the end of an HTML element.
0380 */
0381 public void endElement(QName element, Augmentations augs,
0382 boolean wasEmptyElement) throws XNIException {
0383 charactersAction();
0384
0385 // localName = localName.toLowerCase();
0386 if(DEBUG_ELEMENTS) {
0387 Out.println("endElement: " + element.localpart + " (was "
0388 + (wasEmptyElement ? "" : "not ") + "empty)");
0389 }
0390
0391 // obj is for internal use
0392 CustomObject obj = null;
0393
0394 // end of ignorable tag
0395 if(ignorableTags.contains(element.localpart)) {
0396 ignorableTagLevels--;
0397 if(DEBUG_ELEMENTS) {
0398 Out.println(" end of ignorable tag. levels = " + ignorableTagLevels);
0399 }
0400 } // if
0401
0402 // If the stack is not empty then we get the object from the stack
0403 if(!stack.isEmpty()) {
0404 obj = (CustomObject)stack.pop();
0405 // Before adding it to the colector, we need to check if is an
0406 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
0407 // We only set isEmptyAndSpan if this endElement was NOT generated
0408 // from an empty element in the HTML.
0409 if(obj.getStart().equals(obj.getEnd()) && !wasEmptyElement) {
0410 // The element had an end tag and its start was equal to its
0411 // end. Hence it is anEmptyAndSpan one.
0412 obj.getFM().put("isEmptyAndSpan", "true");
0413 }// End iff
0414 // we add it to the colector
0415 colector.add(obj);
0416 }// End if
0417
0418 // If element has text between, then customize its apearance
0419 if(obj != null && obj.getStart().longValue() != obj.getEnd().longValue())
0420 // Customize the appearance of the document
0421 customizeAppearanceOfDocumentWithEndTag(element.localpart);
0422 }
0423
0424 /**
0425 * Called when the parser reaches the end of the document. Here we
0426 * store the new content and construct the Original markups
0427 * annotations.
0428 */
0429 public void endDocument(Augmentations augs) throws XNIException {
0430 if(DEBUG_GENERAL) {
0431 Out.println("endDocument");
0432 }
0433 CustomObject obj = null;
0434 // replace the old content with the new one
0435 doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
0436
0437 // If basicAs is null then get the default annotation
0438 // set from this gate document
0439 if(basicAS == null)
0440 basicAS = doc
0441 .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
0442
0443 // sort colector ascending on its id
0444 Collections.sort(colector);
0445 // iterate through colector and construct annotations
0446 while(!colector.isEmpty()) {
0447 obj = colector.getFirst();
0448 colector.remove(obj);
0449 // Construct an annotation from this obj
0450 try {
0451 basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj
0452 .getFM());
0453 }
0454 catch(InvalidOffsetException e) {
0455 Err.prln("Error creating an annot :" + obj + " Discarded...");
0456 }// end try
0457 // }// end if
0458 }// while
0459
0460 // notify the listener about the total amount of elements that
0461 // has been processed
0462 fireStatusChangedEvent("Total elements : " + elements);
0463 }
0464
0465 /**
0466 * Non-fatal error, print the stack trace but continue processing.
0467 */
0468 public void error(String domain, String key, XMLParseException e) {
0469 e.printStackTrace(Err.getPrintWriter());
0470 }
0471
0472 public void fatalError(String domain, String key, XMLParseException e)
0473 throws XNIException {
0474 throw e;
0475 }
0476
0477 // we don't do anything with processing instructions, comments or CDATA
0478 // markers, but if we encounter them they interrupt the flow of text. Thus
0479 // we must call charactersAction so the repositioning info is correctly
0480 // generated.
0481
0482 public void processingInstruction(String target, XMLString data,
0483 Augmentations augs) throws XNIException {
0484 charactersAction();
0485 }
0486
0487 public void comment(XMLString content,
0488 Augmentations augs) throws XNIException {
0489 charactersAction();
0490 }
0491
0492 public void startCDATA(Augmentations augs) throws XNIException {
0493 charactersAction();
0494 }
0495
0496 public void endCDATA(Augmentations augs) throws XNIException {
0497 charactersAction();
0498 }
0499
0500
0501 /**
0502 * A comparator that compares two RepositioningInfo.PositionInfo
0503 * records by their originalPosition values. It also supports either
0504 * or both argument being a Long, in which case the Long value is used
0505 * directly. This allows you to binarySearch for an offset rather than
0506 * having to construct a PositionInfo record with the target value.
0507 */
0508 private static final Comparator<Object> POSITION_INFO_COMPARATOR = new Comparator<Object>() {
0509 public int compare(Object a, Object b) {
0510 Long offA = null;
0511 if(a instanceof Long) {
0512 offA = (Long)a;
0513 }
0514 else if(a instanceof RepositioningInfo.PositionInfo) {
0515 offA = ((RepositioningInfo.PositionInfo)a).getOriginalPosition();
0516 }
0517
0518 Long offB = null;
0519 if(b instanceof Long) {
0520 offB = (Long)b;
0521 }
0522 else if(b instanceof RepositioningInfo.PositionInfo) {
0523 offB = ((RepositioningInfo.PositionInfo)a).getOriginalPosition();
0524 }
0525
0526 return offA.compareTo(offB);
0527 }
0528 };
0529
0530 /**
0531 * Correct for whitespace. Given the offset of the start of a block of
0532 * whitespace in the original content, this method calculates the
0533 * offset of the first following non-whitespace character. If wsOffset
0534 * points to the start of a run of whitespace then there will be a
0535 * PositionInfo record in the ampCodingInfo that represents this run
0536 * of whitespace, from which we can find the end of the run. If there
0537 * is no PositionInfo record for this offset then it must point to a
0538 * single whitespace character, so we simply return wsOffset+1.
0539 */
0540 private long fixStartOffsetForWhitespace(long wsOffset) {
0541 // see whether we have a repositioning record in ampCodingInfo for
0542 // the whitespace starting at wsOffset
0543 int wsPosInfoIndex = Collections.binarySearch(ampCodingInfo, wsOffset,
0544 POSITION_INFO_COMPARATOR);
0545
0546 // if we don't find a repos record it means that the whitespace
0547 // really is a single space in the original content
0548 if(wsPosInfoIndex < 0) {
0549 return wsOffset + 1;
0550 }
0551 // if there is a repos record we move by the record's originalLength
0552 else {
0553 return wsOffset
0554 + ((RepositioningInfo.PositionInfo)ampCodingInfo
0555 .get(wsPosInfoIndex)).getOriginalLength();
0556 }
0557 }
0558
0559 /**
0560 * For given content the list with shrink position information is
0561 * searched and on the corresponding positions the correct
0562 * repositioning information is calculated and generated.
0563 */
0564 public void addRepositioningInfo(int contentLength, int pos, int extractedPos) {
0565 // wrong way (without correction and analysing)
0566 // reposInfo.addPositionInfo(pos, contentLength, extractedPos,
0567 // contentLength);
0568
0569 RepositioningInfo.PositionInfo pi = null;
0570 long startPos = pos;
0571 long correction = 0;
0572 long substituteStart;
0573 long remainingLen;
0574 long offsetInExtracted;
0575
0576 for(int i = 0; i < ampCodingInfo.size(); ++i) {
0577 pi = (RepositioningInfo.PositionInfo)ampCodingInfo.get(i);
0578 substituteStart = pi.getOriginalPosition();
0579
0580 if(substituteStart >= startPos) {
0581 if(substituteStart > pos + contentLength + correction) {
0582 break; // outside the current text
0583 } // if
0584
0585 // should create two repositioning information records
0586 remainingLen = substituteStart - (startPos + correction);
0587 offsetInExtracted = startPos - pos;
0588 if(remainingLen > 0) {
0589 reposInfo.addPositionInfo(startPos + correction, remainingLen,
0590 extractedPos + offsetInExtracted, remainingLen);
0591 } // if
0592 // record for shrank text
0593 reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
0594 extractedPos + offsetInExtracted + remainingLen, pi
0595 .getCurrentLength());
0596 startPos = startPos + remainingLen + pi.getCurrentLength();
0597 correction += pi.getOriginalLength() - pi.getCurrentLength();
0598 } // if
0599 } // for
0600
0601 // there is some text remaining for repositioning
0602 offsetInExtracted = startPos - pos;
0603 remainingLen = contentLength - offsetInExtracted;
0604 if(remainingLen > 0) {
0605 reposInfo.addPositionInfo(startPos + correction, remainingLen,
0606 extractedPos + offsetInExtracted, remainingLen);
0607 } // if
0608 } // addRepositioningInfo
0609
0610 /**
0611 * This method analizes the tag t and adds some \n chars and spaces to
0612 * the tmpDocContent.The reason behind is that we need to have a
0613 * readable form for the final document. This method modifies the
0614 * content of tmpDocContent.
0615 *
0616 * @param t the Html tag encounted by the HTML parser
0617 */
0618 protected void customizeAppearanceOfDocumentWithStartTag(String tagName) {
0619 boolean modification = false;
0620 int tmpDocContentSize = tmpDocContent.length();
0621 if("p".equals(tagName)) {
0622 if(tmpDocContentSize >= 2
0623 && '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)) {
0624 tmpDocContent.append("\n");
0625 modification = true;
0626 }
0627 }// End if
0628 // if the HTML tag is BR then we add a new line character to the
0629 // document
0630 if("br".equals(tagName)) {
0631 tmpDocContent.append("\n");
0632 modification = true;
0633 }// End if
0634
0635 // only add a newline at the start of a div if there isn't already a
0636 // newline induced by something else
0637 if("div".equals(tagName) && tmpDocContentSize > 0
0638 && tmpDocContent.charAt(tmpDocContentSize - 1) != '\n') {
0639 tmpDocContent.append("\n");
0640 modification = true;
0641 }
0642
0643 if(modification == true) {
0644 Long end = new Long(tmpDocContent.length());
0645 java.util.Iterator<CustomObject> anIterator = stack.iterator();
0646 while(anIterator.hasNext()) {
0647 // get the object and move to the next one, and set its end
0648 // index
0649 anIterator.next().setEnd(end);
0650 }// End while
0651 }// End if
0652 }// customizeAppearanceOfDocumentWithStartTag
0653
0654 /**
0655 * This method analizes the tag t and adds some \n chars and spaces to
0656 * the tmpDocContent.The reason behind is that we need to have a
0657 * readable form for the final document. This method modifies the
0658 * content of tmpDocContent.
0659 *
0660 * @param t the Html tag encounted by the HTML parser
0661 */
0662 protected void customizeAppearanceOfDocumentWithEndTag(String tagName) {
0663 boolean modification = false;
0664 // if the HTML tag is BR then we add a new line character to the
0665 // document
0666 if(("p".equals(tagName)) || ("h1".equals(tagName))
0667 || ("h2".equals(tagName)) || ("h3".equals(tagName))
0668 || ("h4".equals(tagName)) || ("h5".equals(tagName))
0669 || ("h6".equals(tagName)) || ("tr".equals(tagName))
0670 || ("center".equals(tagName)) || ("li".equals(tagName))) {
0671 tmpDocContent.append("\n");
0672 modification = true;
0673 }
0674 // only add a newline at the end of a div if there isn't already a
0675 // newline induced by something else
0676 if("div".equals(tagName) && tmpDocContent.length() > 0
0677 && tmpDocContent.charAt(tmpDocContent.length() - 1) != '\n') {
0678 tmpDocContent.append("\n");
0679 modification = true;
0680 }
0681
0682 if("title".equals(tagName)) {
0683 tmpDocContent.append("\n\n");
0684 modification = true;
0685 }// End if
0686
0687 if(modification == true) {
0688 Long end = new Long(tmpDocContent.length());
0689 java.util.Iterator anIterator = stack.iterator();
0690 while(anIterator.hasNext()) {
0691 // get the object and move to the next one
0692 CustomObject obj = (CustomObject)anIterator.next();
0693 // sets its End index
0694 obj.setEnd(end);
0695 }// End while
0696 }// End if
0697 }// customizeAppearanceOfDocumentWithEndTag
0698
/** Keeps the reference to the repositioning information structure. */
0700 private RepositioningInfo reposInfo = null;
0701
/** Keeps the reference to the repositioning information for ampersand coding. */
0703 private RepositioningInfo ampCodingInfo = null;
0704
0705 /**
0706 * Set repositioning information structure refference. If you set this
0707 * refference to <B>null</B> information wouldn't be collected.
0708 */
0709 public void setRepositioningInfo(RepositioningInfo info) {
0710 reposInfo = info;
0711 } // setRepositioningInfo
0712
0713 /** Return current RepositioningInfo object */
0714 public RepositioningInfo getRepositioningInfo() {
0715 return reposInfo;
0716 } // getRepositioningInfo
0717
0718 /**
0719 * Set repositioning information structure refference for ampersand
0720 * coding. If you set this refference to <B>null</B> information
0721 * wouldn't be used.
0722 */
0723 public void setAmpCodingInfo(RepositioningInfo info) {
0724 ampCodingInfo = info;
0725 } // setRepositioningInfo
0726
0727 /** Return current RepositioningInfo object for ampersand coding. */
0728 public RepositioningInfo getAmpCodingInfo() {
0729 return ampCodingInfo;
0730 } // getRepositioningInfo
0731
0732 /**
0733 * The HTML tag names (lower case) whose text content should be
0734 * ignored completely by this handler. Typically this is just script
0735 * and style tags.
0736 */
0737 private Set<String> ignorableTags = null;
0738
0739 /**
0740 * Set the set of tag names whose text content will be ignored.
0741 *
0742 * @param newTags a set of lower-case tag names
0743 */
0744 public void setIgnorableTags(Set<String> newTags) {
0745 ignorableTags = newTags;
0746 }
0747
0748 /**
0749 * Get the set of tag names whose content is ignored by this handler.
0750 */
0751 public Set<String> getIgnorableTags() {
0752 return ignorableTags;
0753 }
0754
0755 // HtmlDocumentHandler member data
0756
0757 // counter for the number of levels of ignorable tag we are inside.
0758 // For example, if we configured "ul" as an ignorable tag name then
0759 // this variable would have the following values:
0760 //
0761 // 0: <p>
0762 // 0: This is some text
0763 // 1: <ul>
0764 // 1: <li>
0765 // 1: some more text
0766 // 2: <ul> ...
0767 // 1: </ul>
0768 // 1: </li>
0769 // 0: </ul>
0770 //
0771 // this allows us to support nested ignorables
0772 int ignorableTagLevels = 0;
0773
// This constant indicates how often the status listeners are fired.
// Notifying the listeners adds overhead, which we want to keep small,
// so they are called only once every ELEMENTS_RATE elements.
0777 final static int ELEMENTS_RATE = 128;
0778
0779 /**
0780 * Array holding the character offset of the start of each line in the
0781 * document.
0782 */
0783 private int[] lineOffsets;
0784
0785 // the content of the HTML document, without any tag
0786 // for internal use
0787 private StringBuilder tmpDocContent = null;
0788
0789 /**
0790 * This is used to capture all data within two tags before calling the
0791 * actual characters method
0792 */
0793 private StringBuilder contentBuffer = new StringBuilder("");
0794
0795 /** This is a variable that shows if characters have been read */
0796 private boolean readCharacterStatus = false;
0797
0798 /**
0799 * The start offset of the current block of character content.
0800 */
0801 private int charactersStartOffset;
0802
0803 // a stack used to remember elements and to keep the order
0804 private java.util.Stack<CustomObject> stack = null;
0805
0806 // a gate document
0807 private gate.Document doc = null;
0808
// an annotation set used for creating annotations referring to the doc
0810 private gate.AnnotationSet basicAS;
0811
0812 // listeners for status report
0813 protected List<StatusListener> myStatusListeners = new LinkedList<StatusListener>();
0814
// the number of elements that have been processed so far
0817 private int elements = 0;
0818
0819 protected int customObjectsId = 0;
0820
0821 public int getCustomObjectsId() {
0822 return customObjectsId;
0823 }
0824
// we need a collection to retain all the CustomObjects that will be
// transformed into annotations over the gate document...
// the transformation takes place inside the endDocument() method
0828 private LinkedList<CustomObject> colector = null;
0829
0830 /**
0831 * Initialised from the user config, stores whether to add extra space
0832 * characters to separate words that would otherwise be run together,
0833 * e.g. "...foo</td><td>bar...". If true, this becomes
0834 * "foo bar", if false it is "foobar".
0835 */
0836 protected boolean addSpaceOnUnpack = true;
0837
0838 /**
0839 * During parsing, keeps track of whether the previous chunk of
0840 * character data ended with a whitespace character.
0841 */
0842 protected boolean previousChunkEndedWithWS = false;
0843
0844 // Inner class
0845 /**
0846 * The objects belonging to this class are used inside the stack. This
0847 * class is for internal needs
0848 */
0849 class CustomObject implements Comparable<CustomObject> {
0850
0851 // constructor
0852 public CustomObject(String anElemName, FeatureMap aFm, Long aStart,
0853 Long anEnd) {
0854 elemName = anElemName;
0855 fm = aFm;
0856 start = aStart;
0857 end = anEnd;
0858 id = new Long(customObjectsId++);
0859 }// End CustomObject()
0860
0861 // Methos implemented as required by Comparable interface
0862 public int compareTo(CustomObject obj) {
0863 return this.id.compareTo(obj.getId());
0864 }// compareTo();
0865
0866 // accesor
0867 public String getElemName() {
0868 return elemName;
0869 }// getElemName()
0870
0871 public FeatureMap getFM() {
0872 return fm;
0873 }// getFM()
0874
0875 public Long getStart() {
0876 return start;
0877 }// getStart()
0878
0879 public Long getEnd() {
0880 return end;
0881 }// getEnd()
0882
0883 public Long getId() {
0884 return id;
0885 }
0886
0887 // mutator
0888 public void setElemName(String anElemName) {
0889 elemName = anElemName;
0890 }// getElemName()
0891
0892 public void setFM(FeatureMap aFm) {
0893 fm = aFm;
0894 }// setFM();
0895
0896 public void setStart(Long aStart) {
0897 start = aStart;
0898 }// setStart();
0899
0900 public void setEnd(Long anEnd) {
0901 end = anEnd;
0902 }// setEnd();
0903
0904 // data fields
0905 private String elemName = null;
0906
0907 private FeatureMap fm = null;
0908
0909 private Long start = null;
0910
0911 private Long end = null;
0912
0913 private Long id = null;
0914
0915 } // End inner class CustomObject
0916
0917 // StatusReporter Implementation
0918
0919 public void addStatusListener(StatusListener listener) {
0920 myStatusListeners.add(listener);
0921 }
0922
0923 public void removeStatusListener(StatusListener listener) {
0924 myStatusListeners.remove(listener);
0925 }
0926
0927 protected void fireStatusChangedEvent(String text) {
0928 Iterator<StatusListener> listenersIter = myStatusListeners.iterator();
0929 while(listenersIter.hasNext())
0930 listenersIter.next().statusChanged(text);
0931 }
0932
0933 // //// Unused methods from XNI interfaces //////
0934
0935 public void doctypeDecl(String arg0, String arg1, String arg2,
0936 Augmentations arg3) throws XNIException {
0937 if(DEBUG_UNUSED) {
0938 Out.println("doctypeDecl");
0939 }
0940 }
0941
0942 public void endGeneralEntity(String arg0, Augmentations arg1)
0943 throws XNIException {
0944 if(DEBUG_UNUSED) {
0945 Out.println("endGeneralEntity");
0946 }
0947 }
0948
0949 public XMLDocumentSource getDocumentSource() {
0950 if(DEBUG_UNUSED) {
0951 Out.println("getDocumentSource");
0952 }
0953 return null;
0954 }
0955
0956 public void ignorableWhitespace(XMLString arg0, Augmentations arg1)
0957 throws XNIException {
0958 if(DEBUG_UNUSED) {
0959 Out.println("ignorableWhitespace: " + arg0);
0960 }
0961 }
0962
0963 public void setDocumentSource(XMLDocumentSource arg0) {
0964 if(DEBUG_UNUSED) {
0965 Out.println("setDocumentSource");
0966 }
0967 }
0968
0969 public void startDocument(XMLLocator arg0, String arg1,
0970 NamespaceContext arg2, Augmentations arg3) throws XNIException {
0971 if(DEBUG_UNUSED) {
0972 Out.println("startDocument");
0973 }
0974 }
0975
0976 public void startGeneralEntity(String arg0, XMLResourceIdentifier arg1,
0977 String arg2, Augmentations arg3) throws XNIException {
0978 if(DEBUG_UNUSED) {
0979 Out.println("startGeneralEntity");
0980 }
0981 }
0982
0983 public void textDecl(String arg0, String arg1, Augmentations arg2)
0984 throws XNIException {
0985 if(DEBUG_UNUSED) {
0986 Out.println("textDecl");
0987 }
0988 }
0989
0990 public void xmlDecl(String arg0, String arg1, String arg2, Augmentations arg3)
0991 throws XNIException {
0992 if(DEBUG_UNUSED) {
0993 Out.println("xmlDecl");
0994 }
0995 }
0996
0997 public void warning(String arg0, String arg1, XMLParseException arg2)
0998 throws XNIException {
0999 if(DEBUG_GENERAL) {
1000 Out.println("warning:");
1001 arg2.printStackTrace(Err.getPrintWriter());
1002 }
1003 }
1004
1005 }
|