001 /*
002 * HtmlDocumentHandler.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 12/June/2000
013 *
014 * $Id: HtmlDocumentHandler.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.html;
018
019 import java.util.*;
020
021 import javax.swing.text.BadLocationException;
022 import javax.swing.text.MutableAttributeSet;
023 import javax.swing.text.html.HTML;
024 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
025
026 import gate.*;
027 import gate.corpora.DocumentContentImpl;
028 import gate.corpora.RepositioningInfo;
029 import gate.event.StatusListener;
030 import gate.util.Err;
031 import gate.util.InvalidOffsetException;
032
033
034 /** Implements the behaviour of the HTML reader.
035 * Methods of an object of this class are called by the HTML parser when
036 * events will appear.
037 * The idea is to parse the HTML document and construct Gate annotations
038 * objects.
039 * This class also will replace the content of the Gate document with a
040 * new one containing anly text from the HTML document.
041 */
042 public class HtmlDocumentHandler extends ParserCallback {
043
044 /** Debug flag */
045 private static final boolean DEBUG = false;
046
047 /** Constructor initialises all the private memeber data.
048 * This will use the default annotation set taken from the gate document.
049 * @param aDocument The gate document that will be processed
050 * @param aMarkupElementsMap The map containing the elements that will
051 * transform into annotations
052 */
053 public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
054 this(aDocument,aMarkupElementsMap,null);
055 }
056
057 /** Constructor initialises all the private memeber data
058 * @param aDocument The gate document that will be processed
059 * @param aMarkupElementsMap The map containing the elements that will
060 * transform into annotations
061 * @param anAnnotationSet The annotation set that will contain annotations
062 * resulted from the processing of the gate document
063 */
064 public HtmlDocumentHandler(gate.Document aDocument,
065 Map aMarkupElementsMap,
066 gate.AnnotationSet anAnnotationSet) {
067 // init stack
068 stack = new java.util.Stack();
069
070 // this string contains the plain text (the text without markup)
071 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
072
073 // colector is used later to transform all custom objects into
074 // annotation objects
075 colector = new LinkedList();
076
077 // the Gate document
078 doc = aDocument;
079
080 // this map contains the elements name that we want to create
081 // if it's null all the elements from the XML documents will be transformed
082 // into Gate annotation objects
083 markupElementsMap = aMarkupElementsMap;
084
085 // init an annotation set for this gate document
086 basicAS = anAnnotationSet;
087
088 customObjectsId = 0;
089 }//HtmlDocumentHandler
090
091 /** Keep the refference to this structure */
092 private RepositioningInfo reposInfo = null;
093
094 /** Keep the refference to this structure */
095 private RepositioningInfo ampCodingInfo = null;
096
097 /** Set repositioning information structure refference. If you set this
098 * refference to <B>null</B> information wouldn't be collected.
099 */
100 public void setRepositioningInfo(RepositioningInfo info) {
101 reposInfo = info;
102 } // setRepositioningInfo
103
104 /** Return current RepositioningInfo object */
105 public RepositioningInfo getRepositioningInfo() {
106 return reposInfo;
107 } // getRepositioningInfo
108
109 /** Set repositioning information structure refference for ampersand coding.
110 * If you set this refference to <B>null</B> information wouldn't be used.
111 */
112 public void setAmpCodingInfo(RepositioningInfo info) {
113 ampCodingInfo = info;
114 } // setRepositioningInfo
115
116 /** Return current RepositioningInfo object for ampersand coding. */
117 public RepositioningInfo getAmpCodingInfo() {
118 return ampCodingInfo;
119 } // getRepositioningInfo
120
121 /** The text inside the STYLE tag is processed with <code>handleText()</code>.
122 * We should skip inserting of this text in the document. */
123 private boolean isInsideStyleTag = false;
124
125 /** This method is called when the HTML parser encounts the beginning
126 * of a tag that means that the tag is paired by an end tag and it's
127 * not an empty one.
128 */
129 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
130 // Fire the status listener if the elements processed exceded the rate
131 if (0 == (++elements % ELEMENTS_RATE))
132 fireStatusChangedEvent("Processed elements : " + elements);
133
134 // Start of STYLE tag
135 if(HTML.Tag.STYLE.equals(t)) {
136 isInsideStyleTag = true;
137 } // if
138
139 // Construct a feature map from the attributes list
140 FeatureMap fm = Factory.newFeatureMap();
141
142 // Take all the attributes an put them into the feature map
143 if (0 != a.getAttributeCount()){
144 Enumeration enumeration = a.getAttributeNames();
145 while (enumeration.hasMoreElements()){
146 Object attribute = enumeration.nextElement();
147 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
148 }// while
149 }// if
150
151 // Just analize the tag t and add some\n chars and spaces to the
152 // tmpDocContent.The reason behind is that we need to have a readable form
153 // for the final document.
154 customizeAppearanceOfDocumentWithStartTag(t);
155
156 // If until here the "tmpDocContent" ends with a NON whitespace char,
157 // then we add a space char before calculating the START index of this
158 // tag.
159 // This is done in order not to concatenate the content of two separate tags
160 // and obtain a different NEW word.
161 int tmpDocContentSize = tmpDocContent.length();
162 if ( tmpDocContentSize != 0 &&
163 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
164 ) tmpDocContent.append(" ");
165
166 // create the start index of the annotation
167 Long startIndex = new Long(tmpDocContent.length());
168
169 // initialy the start index is equal with the End index
170 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
171
172 // put it into the stack
173 stack.push (obj);
174
175 }//handleStartTag
176
177 /** This method is called when the HTML parser encounts the end of a tag
178 * that means that the tag is paired by a beginning tag
179 */
180 public void handleEndTag(HTML.Tag t, int pos){
181 // obj is for internal use
182 CustomObject obj = null;
183
184 // end of STYLE tag
185 if(HTML.Tag.STYLE.equals(t)) {
186 isInsideStyleTag = false;
187 } // if
188
189 // If the stack is not empty then we get the object from the stack
190 if (!stack.isEmpty()){
191 obj = (CustomObject) stack.pop();
192 // Before adding it to the colector, we need to check if is an
193 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
194 if (obj.getStart().equals(obj.getEnd())){
195 // The element had an end tag and its start was equal to its end. Hence
196 // it is anEmptyAndSpan one.
197 obj.getFM().put("isEmptyAndSpan","true");
198 }// End iff
199 // we add it to the colector
200 colector.add(obj);
201 }// End if
202
203 // If element has text between, then customize its apearance
204 if ( obj != null &&
205 obj.getStart().longValue() != obj.getEnd().longValue()
206 )
207 // Customize the appearance of the document
208 customizeAppearanceOfDocumentWithEndTag(t);
209
210 // if t is the </HTML> tag then we reached the end of theHTMLdocument
211 if (t == HTML.Tag.HTML){
212 // replace the old content with the new one
213 doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
214
215 // If basicAs is null then get the default annotation
216 // set from this gate document
217 if (basicAS == null)
218 basicAS = doc.getAnnotations(
219 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
220
221 // sort colector ascending on its id
222 Collections.sort(colector);
223 // iterate through colector and construct annotations
224 while (!colector.isEmpty()){
225 obj = (CustomObject) colector.getFirst();
226 colector.remove(obj);
227 // Construct an annotation from this obj
228 try{
229 if (markupElementsMap == null){
230 basicAS.add( obj.getStart(),
231 obj.getEnd(),
232 obj.getElemName(),
233 obj.getFM()
234 );
235 }else{
236 String annotationType =
237 (String) markupElementsMap.get(obj.getElemName());
238 if (annotationType != null)
239 basicAS.add( obj.getStart(),
240 obj.getEnd(),
241 annotationType,
242 obj.getFM()
243 );
244 }
245 }catch (InvalidOffsetException e){
246 Err.prln("Error creating an annot :" + obj + " Discarded...");
247 }// end try
248 // }// end if
249 }//while
250
251 // notify the listener about the total amount of elements that
252 // has been processed
253 fireStatusChangedEvent("Total elements : " + elements);
254
255 }//else
256
257 }//handleEndTag
258
259 /** This method is called when the HTML parser encounts an empty tag
260 */
261 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
262 // fire the status listener if the elements processed exceded the rate
263 if ((++elements % ELEMENTS_RATE) == 0)
264 fireStatusChangedEvent("Processed elements : " + elements);
265
266 // construct a feature map from the attributes list
267 // these are empty elements
268 FeatureMap fm = Factory.newFeatureMap();
269
270 // take all the attributes an put them into the feature map
271 if (0 != a.getAttributeCount ()){
272
273 // Out.println("HAS attributes = " + a.getAttributeCount ());
274 Enumeration enumeration = a.getAttributeNames ();
275 while (enumeration.hasMoreElements ()){
276 Object attribute = enumeration.nextElement ();
277 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
278
279 }//while
280
281 }//if
282
283 // create the start index of the annotation
284 Long startIndex = new Long(tmpDocContent.length());
285
286 // initialy the start index is equal with the End index
287 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
288
289 // we add the object directly into the colector
290 // we don't add it to the stack because this is an empty tag
291 colector.add(obj);
292
293 // Just analize the tag t and add some\n chars and spaces to the
294 // tmpDocContent.The reason behind is that we need to have a readable form
295 // for the final document.
296 customizeAppearanceOfDocumentWithSimpleTag(t);
297
298 } // handleSimpleTag
299
300 /** This method is called when the HTML parser encounts text (PCDATA)
301 */
302 public void handleText(char[] text, int pos){
303
304 // Skip the STYLE tag content
305 if(isInsideStyleTag) return;
306
307 // create a string object based on the reported text
308 String content = new String(text);
309
310 // remove the difference between JDK 1.3 and JDK 1.4
311 String trimContent = content.trim();
312 if(trimContent.length() == 0) {
313 return;
314 } // if
315
316 int trimCorrection = content.indexOf(trimContent.charAt(0));
317 content = trimContent;
318
319 StringBuffer contentBuffer = new StringBuffer("");
320 int tmpDocContentSize = tmpDocContent.length();
321 boolean incrementStartIndex = false;
322 // If the first char of the text just read "text[0]" is NOT whitespace AND
323 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
324 // concatenation "tmpDocContent + content" will result into a new different
325 // word... and we want to avoid that...
326 if ( tmpDocContentSize != 0 &&
327 content.length() != 0 &&
328 !Character.isWhitespace(content.charAt(0)) &&
329 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
330
331 contentBuffer.append(" ");
332 incrementStartIndex = true;
333 }// End if
334 // update the document content
335
336 // put the repositioning information
337 if(reposInfo != null) {
338 int extractedPos = tmpDocContent.length() + contentBuffer.length();
339 addRepositioningInfo(content, pos + trimCorrection, extractedPos);
340 } // if
341
342 contentBuffer.append(content);
343 // calculate the End index for all the elements of the stack
344 // the expression is : End index = Current doc length + text length
345 Long end = new Long(tmpDocContent.length() + contentBuffer.length());
346
347 CustomObject obj = null;
348 // Iterate through stack to modify the End index of the existing elements
349
350 java.util.Iterator anIterator = stack.iterator();
351 while (anIterator.hasNext ()){
352 // get the object and move to the next one
353 obj = (CustomObject) anIterator.next ();
354 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
355 obj.setStart(new Long(obj.getStart().longValue() + 1));
356 }// End if
357 // sets its End index
358 obj.setEnd(end);
359 }// End while
360
361 tmpDocContent.append(contentBuffer.toString());
362 }// end handleText();
363
364 /** For given content the list with shrink position information is searched
365 * and on the corresponding positions the correct repositioning information
366 * is calculated and generated.
367 */
368 public void addRepositioningInfo(String content, int pos, int extractedPos) {
369 int contentLength = content.length();
370
371 // wrong way (without correction and analysing)
372 //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength);
373
374 RepositioningInfo.PositionInfo pi = null;
375 long startPos = pos;
376 long correction = 0;
377 long substituteStart;
378 long remainingLen;
379 long offsetInExtracted;
380
381 for(int i = 0; i < ampCodingInfo.size(); ++i) {
382 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
383 substituteStart = pi.getOriginalPosition();
384
385 if(substituteStart >= startPos) {
386 if(substituteStart > pos + contentLength + correction) {
387 break; // outside the current text
388 } // if
389
390 // should create two repositioning information records
391 remainingLen = substituteStart - (startPos + correction);
392 offsetInExtracted = startPos - pos;
393 if(remainingLen > 0) {
394 reposInfo.addPositionInfo(startPos + correction, remainingLen,
395 extractedPos + offsetInExtracted, remainingLen);
396 } // if
397 // record for shrank text
398 reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
399 extractedPos + offsetInExtracted + remainingLen,
400 pi.getCurrentLength());
401 startPos = startPos + remainingLen + pi.getCurrentLength();
402 correction += pi.getOriginalLength() - pi.getCurrentLength();
403 } // if
404 } // for
405
406 // there is some text remaining for repositioning
407 offsetInExtracted = startPos - pos;
408 remainingLen = contentLength - offsetInExtracted;
409 if(remainingLen > 0) {
410 reposInfo.addPositionInfo(startPos + correction, remainingLen,
411 extractedPos + offsetInExtracted, remainingLen);
412 } // if
413 } // addRepositioningInfo
414
415 /** This method analizes the tag t and adds some \n chars and spaces to the
416 * tmpDocContent.The reason behind is that we need to have a readable form
417 * for the final document. This method modifies the content of tmpDocContent.
418 * @param t the Html tag encounted by the HTML parser
419 */
420 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
421 boolean modification = false;
422 // if the HTML tag is BR then we add a new line character to the document
423 if (HTML.Tag.BR == t){
424 tmpDocContent.append("\n");
425 modification = true;
426 }// End if
427 if (modification == true){
428 Long end = new Long (tmpDocContent.length());
429 java.util.Iterator anIterator = stack.iterator();
430 while (anIterator.hasNext ()){
431 // get the object and move to the next one
432 CustomObject obj = (CustomObject) anIterator.next();
433 // sets its End index
434 obj.setEnd(end);
435 }// End while
436 }//End if
437 }// customizeAppearanceOfDocumentWithSimpleTag
438
439 /** This method analizes the tag t and adds some \n chars and spaces to the
440 * tmpDocContent.The reason behind is that we need to have a readable form
441 * for the final document. This method modifies the content of tmpDocContent.
442 * @param t the Html tag encounted by the HTML parser
443 */
444 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
445 boolean modification = false;
446 if (HTML.Tag.P == t){
447 int tmpDocContentSize = tmpDocContent.length();
448 if ( tmpDocContentSize >= 2 &&
449 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
450 ) { tmpDocContent.append("\n"); modification = true;}
451 }// End if
452 if (modification == true){
453 Long end = new Long (tmpDocContent.length());
454 java.util.Iterator anIterator = stack.iterator();
455 while (anIterator.hasNext ()){
456 // get the object and move to the next one
457 CustomObject obj = (CustomObject) anIterator.next();
458 // sets its End index
459 obj.setEnd(end);
460 }// End while
461 }//End if
462 }// customizeAppearanceOfDocumentWithStartTag
463
464 /** This method analizes the tag t and adds some \n chars and spaces to the
465 * tmpDocContent.The reason behind is that we need to have a readable form
466 * for the final document. This method modifies the content of tmpDocContent.
467 * @param t the Html tag encounted by the HTML parser
468 */
469 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
470 boolean modification = false;
471 // if the HTML tag is BR then we add a new line character to the document
472 if ( (HTML.Tag.P == t) ||
473
474 (HTML.Tag.H1 == t) ||
475 (HTML.Tag.H2 == t) ||
476 (HTML.Tag.H3 == t) ||
477 (HTML.Tag.H4 == t) ||
478 (HTML.Tag.H5 == t) ||
479 (HTML.Tag.H6 == t) ||
480 (HTML.Tag.TR == t) ||
481 (HTML.Tag.CENTER == t) ||
482 (HTML.Tag.LI == t)
483 ){ tmpDocContent.append("\n"); modification = true;}
484
485 if (HTML.Tag.TITLE == t){
486 tmpDocContent.append("\n\n");
487 modification = true;
488 }// End if
489
490 if (modification == true){
491 Long end = new Long (tmpDocContent.length());
492 java.util.Iterator anIterator = stack.iterator();
493 while (anIterator.hasNext ()){
494 // get the object and move to the next one
495 CustomObject obj = (CustomObject) anIterator.next();
496 // sets its End index
497 obj.setEnd(end);
498 }// End while
499 }//End if
500 }// customizeAppearanceOfDocumentWithEndTag
501
502 /**
503 * This method is called when the HTML parser encounts an error
504 * it depends on the programmer if he wants to deal with that error
505 */
506 public void handleError(String errorMsg, int pos) {
507 //Out.println ("ERROR CALLED : " + errorMsg);
508 }
509
510 /** This method is called once, when the HTML parser reaches the end
511 * of its input streamin order to notify the parserCallback that there
512 * is nothing more to parse.
513 */
514 public void flush() throws BadLocationException{
515 }// flush
516
517 /** This method is called when the HTML parser encounts a comment
518 */
519 public void handleComment(char[] text, int pos) {
520 }
521
522 //StatusReporter Implementation
523
524 public void addStatusListener(StatusListener listener) {
525 myStatusListeners.add(listener);
526 }
527
528 public void removeStatusListener(StatusListener listener) {
529 myStatusListeners.remove(listener);
530 }
531
532 protected void fireStatusChangedEvent(String text) {
533 Iterator listenersIter = myStatusListeners.iterator();
534 while(listenersIter.hasNext())
535 ((StatusListener)listenersIter.next()).statusChanged(text);
536 }
537
538 /**
539 * This method verifies if data contained by the CustomObject can be used
540 * to create a GATE annotation.
541 */
542 /* private boolean canCreateAnnotation(CustomObject aCustomObject){
543 long start = aCustomObject.getStart().longValue();
544 long end = aCustomObject.getEnd().longValue();
545 long gateDocumentSize = doc.getContent().size().longValue();
546
547 if (start < 0 || end < 0 ) return false;
548 if (start > end ) return false;
549 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
550 return true;
551 }// canCreateAnnotation
552 */
553
554 // HtmlDocumentHandler member data
555
556 // this constant indicates when to fire the status listener
557 // this listener will add an overhead and we don't want a big overhead
558 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
559 final static int ELEMENTS_RATE = 128;
560
561 // this map contains the elements name that we want to create
562 // if it's null all the elements from the HTML documents will be transformed
563 // into Gate annotation objects otherwise only the elements it contains will
564 // be transformed
565 private Map markupElementsMap = null;
566
567 // the content of the HTML document, without any tag
568 // for internal use
569 private StringBuffer tmpDocContent = null;
570
571 // a stack used to remember elements and to keep the order
572 private java.util.Stack stack = null;
573
574 // a gate document
575 private gate.Document doc = null;
576
577 // an annotation set used for creating annotation reffering the doc
578 private gate.AnnotationSet basicAS;
579
580 // listeners for status report
581 protected List myStatusListeners = new LinkedList();
582
583 // this reports the the number of elements that have beed processed so far
584 private int elements = 0;
585
586 protected long customObjectsId = 0;
587 // we need a colection to retain all the CustomObjects that will be
588 // transformed into annotation over the gate document...
589 // the transformation will take place inside onDocumentEnd() method
590 private LinkedList colector = null;
591
592 // Inner class
593 /**
594 * The objects belonging to this class are used inside the stack.
595 * This class is for internal needs
596 */
597 class CustomObject implements Comparable {
598
599 // constructor
600 public CustomObject(String anElemName, FeatureMap aFm,
601 Long aStart, Long anEnd) {
602 elemName = anElemName;
603 fm = aFm;
604 start = aStart;
605 end = anEnd;
606 id = new Long(customObjectsId ++);
607 }// End CustomObject()
608
609 // Methos implemented as required by Comparable interface
610 public int compareTo(Object o){
611 CustomObject obj = (CustomObject) o;
612 return this.id.compareTo(obj.getId());
613 }// compareTo();
614
615 // accesor
616 public String getElemName() {
617 return elemName;
618 }// getElemName()
619
620 public FeatureMap getFM() {
621 return fm;
622 }// getFM()
623
624 public Long getStart() {
625 return start;
626 }// getStart()
627
628 public Long getEnd() {
629 return end;
630 }// getEnd()
631
632 public Long getId(){ return id;}
633
634 // mutator
635 public void setElemName(String anElemName) {
636 elemName = anElemName;
637 }// getElemName()
638
639 public void setFM(FeatureMap aFm) {
640 fm = aFm;
641 }// setFM();
642
643 public void setStart(Long aStart) {
644 start = aStart;
645 }// setStart();
646
647 public void setEnd(Long anEnd) {
648 end = anEnd;
649 }// setEnd();
650
651 // data fields
652 private String elemName = null;
653 private FeatureMap fm = null;
654 private Long start = null;
655 private Long end = null;
656 private Long id = null;
657
658 } // End inner class CustomObject
659
660 }//End class HtmlDocumentHandler
661
662
|