001 /*
002 * HtmlDocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 26/May/2000
013 *
014 * $Id: HtmlDocumentFormat.java 12919 2010-08-03 10:31:37Z valyt $
015 */
016
017 package gate.corpora;
018
019 import java.io.*;
020 import java.net.URLConnection;
021
022 import javax.swing.text.html.HTMLEditorKit;
023 import javax.swing.text.html.parser.ParserDelegator;
024
025 import gate.Document;
026 import gate.Resource;
027 import gate.creole.ResourceInstantiationException;
028 import gate.event.StatusListener;
029 import gate.html.HtmlDocumentHandler;
030 import gate.util.DocumentFormatException;
031
032 //import org.w3c.www.mime.*;
033
034 /** The format of Documents. Subclasses of DocumentFormat know about
035 * particular MIME types and how to unpack the information in any
036 * markup or formatting they contain into GATE annotations. Each MIME
037 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
038 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
039 * with a static index residing here when they are constructed. Static
040 * getDocumentFormat methods can then be used to get the appropriate
041 * format class for a particular document.
042 */
043 public class HtmlDocumentFormat extends TextualDocumentFormat
044 {
045
046 /** Debug flag */
047 private static final boolean DEBUG = false;
048
049 /** Default construction */
050 public HtmlDocumentFormat() { super(); }
051
052 /** We could collect repositioning information during XML parsing */
053 public Boolean supportsRepositioning() {
054 return new Boolean(true);
055 } // supportsRepositioning
056
057 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
058 public void unpackMarkup(Document doc) throws DocumentFormatException {
059 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
060 } // unpackMarkup
061
062 /** Unpack the markup in the document. This converts markup from the
063 * native format (e.g. HTML) into annotations in GATE format.
064 * Uses the markupElementsMap to determine which elements to convert, and
065 * what annotation type names to use.
066 * It always tryes to parse te doc's content. It doesn't matter if the
067 * sourceUrl is null or not.
068 *
069 * @param doc The gate document you want to parse.
070 *
071 */
072 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
073 RepositioningInfo ampCodingInfo) throws DocumentFormatException{
074 Reader reader = null;
075 URLConnection conn = null;
076 PrintWriter out = null;
077 HTMLEditorKit.Parser parser = new ParserDelegator();
078
079 if ( doc == null || doc.getContent() == null ){
080 throw new DocumentFormatException(
081 "GATE document is null or no content found. Nothing to parse!");
082 }// End if
083
084 reader = new StringReader(doc.getContent().toString());
085
086 // create a new Htmldocument handler
087 HtmlDocumentHandler htmlDocHandler = new
088 HtmlDocumentHandler(doc, this.markupElementsMap);
089 // Create a Status Listener
090 StatusListener statusListener = new StatusListener(){
091 public void statusChanged(String text){
092 fireStatusChanged(text);
093 }
094 };
095 // Register the listener with htmlDocHandler
096 htmlDocHandler.addStatusListener(statusListener);
097 // set repositioning object
098 htmlDocHandler.setRepositioningInfo(repInfo);
099 // set the object with ampersand coding positions
100 htmlDocHandler.setAmpCodingInfo(ampCodingInfo);
101
102 try{
103 // parse the HTML document
104 parser.parse(reader, htmlDocHandler, true);
105 } catch (IOException e){
106 throw new DocumentFormatException(e);
107 }finally{
108 if (htmlDocHandler != null)
109 htmlDocHandler.removeStatusListener(statusListener);
110 }// End try
111 }//unpackMarkup(doc)
112
113 /** Initialise this resource, and return it. */
114 public Resource init() throws ResourceInstantiationException{
115 // Register HTML mime type
116 MimeType mime = new MimeType("text","html");
117 // Register the class handler for this mime type
118 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
119 this);
120 // Register the mime type with mine string
121 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
122 // Register file sufixes for this mime type
123 suffixes2mimeTypeMap.put("html",mime);
124 suffixes2mimeTypeMap.put("htm",mime);
125 // Register magic numbers for this mime type
126 magic2mimeTypeMap.put("<html",mime);
127 // Set the mimeType for this language resource
128 setMimeType(mime);
129 return this;
130 }// init()
131 }// class HtmlDocumentFormat
|