001 /*
002 * NekoHtmlDocumentFormat.java
003 *
004 * Copyright (c) 2006, The University of Sheffield.
005 *
006 * This file is part of GATE (see http://gate.ac.uk/), and is free
007 * software, licenced under the GNU Library General Public License,
008 * Version 2, June 1991 (in the distribution as file licence.html,
009 * and also available at http://gate.ac.uk/gate/licence.html).
010 *
011 * Ian Roberts, 17/Dec/2006
012 *
013 * $Id: NekoHtmlDocumentFormat.java 12919 2010-08-03 10:31:37Z valyt $
014 */
015
016 package gate.corpora;
017
018 import gate.Document;
019 import gate.GateConstants;
020 import gate.Resource;
021 import gate.TextualDocument;
022 import gate.creole.ResourceInstantiationException;
023 import gate.creole.metadata.AutoInstance;
024 import gate.creole.metadata.CreoleParameter;
025 import gate.creole.metadata.CreoleResource;
026 import gate.event.StatusListener;
027 import gate.html.NekoHtmlDocumentHandler;
028 import gate.util.DocumentFormatException;
029 import gate.util.Out;
030
031 import java.io.IOException;
032 import java.io.InputStreamReader;
033 import java.io.Reader;
034 import java.io.StringReader;
035 import java.util.Set;
036 import java.util.regex.Matcher;
037 import java.util.regex.Pattern;
038
039 import org.apache.xerces.xni.XNIException;
040 import org.apache.xerces.xni.parser.XMLInputSource;
041 import org.cyberneko.html.HTMLConfiguration;
042
043 /**
044 * <p>
045 * DocumentFormat that uses Andy Clark's <a
046 * href="http://people.apache.org/~andyc/neko/doc/html/">NekoHTML</a>
047 * parser to parse HTML documents. It tries to render HTML in a similar
048 * way to a web browser, i.e. whitespace is normalized, paragraphs are
049 * separated by a blank line, etc. By default the text content of style
050 * and script tags is ignored completely, though the set of tags treated
051 * in this way is configurable via a CREOLE parameter.
052 * </p>
053 * <p>
054 * This class extends {@link HtmlDocumentFormat} to cause DocumentImpl
055 * to put the necessary whitespace normalization information into the
056 * format's ampCodingInfo.
057 * </p>
058 */
059 @CreoleResource(name = "GATE HTML Document Format", isPrivate = true,
060 autoinstances = {@AutoInstance(hidden = true)})
061 public class NekoHtmlDocumentFormat extends HtmlDocumentFormat {
/**
 * When <code>true</code>, {@link #buildLineOffsets} prints the offset of
 * every line start it finds to standard output.
 */
private static final boolean DEBUG = false;

/**
 * Creates a new format instance. No state is set up here beyond what the
 * superclass constructor does.
 */
public NekoHtmlDocumentFormat() {
  // the superclass no-argument constructor is invoked implicitly
}
069
070 /**
071 * The set of tags whose text content is to be ignored when parsing.
072 */
073 private Set<String> ignorableTags = null;
074
075 @CreoleParameter(comment = "HTML tags whose text content should be ignored",
076 defaultValue = "script;style")
077 public void setIgnorableTags(Set<String> newTags) {
078 this.ignorableTags = newTags;
079 }
080
081 public Set<String> getIgnorableTags() {
082 return ignorableTags;
083 }
084
085 /**
086 * We support repositioning info for HTML files.
087 */
088 public Boolean supportsRepositioning() {
089 return Boolean.TRUE;
090 }
091
092 /**
093 * Old-style unpackMarkup, without repositioning info.
094 */
095 public void unpackMarkup(Document doc) throws DocumentFormatException {
096 unpackMarkup(doc, null, null);
097 }
098
099 /**
100 * Unpack the markup in the document. This converts markup from the
101 * native format into annotations in GATE format. If the document was
102 * created from a String, then is recomandable to set the doc's
103 * sourceUrl to <b>null</b>. So, if the document has a valid URL,
104 * then the parser will try to parse the XML document pointed by the
105 * URL.If the URL is not valid, or is null, then the doc's content
106 * will be parsed. If the doc's content is not a valid XML then the
107 * parser might crash.
108 *
109 * @param doc The gate document you want to parse. If
110 * <code>doc.getSourceUrl()</code> returns <b>null</b>
111 * then the content of doc will be parsed. Using a URL is
112 * recomended because the parser will report errors corectlly
113 * if the document is not well formed.
114 */
115 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
116 RepositioningInfo ampCodingInfo) throws DocumentFormatException {
117 if((doc == null)
118 || (doc.getSourceUrl() == null && doc.getContent() == null)) {
119
120 throw new DocumentFormatException(
121 "GATE document is null or no content found. Nothing to parse!");
122 }// End if
123
124 // Create a status listener
125 StatusListener statusListener = new StatusListener() {
126 public void statusChanged(String text) {
127 // This is implemented in DocumentFormat.java and inherited here
128 fireStatusChanged(text);
129 }
130 };
131
132 boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
133
134 NekoHtmlDocumentHandler handler = null;
135 try {
136 org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
137
138 // convert element and attribute names to lower case
139 parser.setProperty("http://cyberneko.org/html/properties/names/elems",
140 "lower");
141 parser.setProperty("http://cyberneko.org/html/properties/names/attrs",
142 "lower");
143 // make parser augment infoset with location information
144 parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
145
146 // Create a new Xml document handler
147 handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
148 // Register a status listener with it
149 handler.addStatusListener(statusListener);
150 // set repositioning object
151 handler.setRepositioningInfo(repInfo);
152 // set the object with ampersand coding positions
153 handler.setAmpCodingInfo(ampCodingInfo);
154 // construct the list of offsets for each line of the document
155 int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
156 handler.setLineOffsets(lineOffsets);
157
158 // set the handlers
159 parser.setDocumentHandler(handler);
160 parser.setErrorHandler(handler);
161
162 // Parse the XML Document with the appropriate encoding
163 XMLInputSource is;
164
165 if(docHasContentButNoValidURL) {
166 // no URL, so parse from string
167 is =
168 new XMLInputSource(null, null, null, new StringReader(doc
169 .getContent().toString()), null);
170 }
171 else if(doc instanceof TextualDocument) {
172 // textual document - load with user specified encoding
173 String docEncoding = ((TextualDocument)doc).getEncoding();
174 // XML, so no BOM stripping.
175 Reader docReader =
176 new InputStreamReader(doc.getSourceUrl().openStream(),
177 docEncoding);
178 is =
179 new XMLInputSource(null, doc.getSourceUrl().toString(), doc
180 .getSourceUrl().toString(), docReader, docEncoding);
181
182 // since we control the encoding, tell the parser to ignore any
183 // meta http-equiv hints
184 parser
185 .setFeature(
186 "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
187 true);
188 }
189 else {
190 // let the parser decide the encoding
191 is =
192 new XMLInputSource(null, doc.getSourceUrl().toString(), doc
193 .getSourceUrl().toString());
194 }
195
196 /* The following line can forward an
197 * ArrayIndexOutOfBoundsException from
198 * org.cyberneko.html.HTMLConfiguration.parse and crash GATE. */
199 parser.parse(is);
200 // Angel - end
201 ((DocumentImpl)doc).setNextAnnotationId(handler.getCustomObjectsId());
202 }
203
204 /* Handle IOException specially. */
205 catch(IOException e) {
206 throw new DocumentFormatException("I/O exception for "
207 + doc.getSourceUrl().toString(), e);
208 }
209
210 /* Handle XNIException and ArrayIndexOutOfBoundsException:
211 * flag the parsing error and keep going. */
212 catch(Exception e) {
213 doc.getFeatures().put("parsingError", Boolean.TRUE);
214
215 Boolean bThrow =
216 (Boolean)doc.getFeatures().get(
217 GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
218
219 if(bThrow != null && bThrow.booleanValue()) {
220 // the next line is commented to avoid Document creation fail on
221 // error
222 throw new DocumentFormatException(e);
223 }
224 else {
225 Out.println("Warning: Document remains unparsed. \n"
226 + "\n Stack Dump: ");
227 e.printStackTrace(Out.getPrintWriter());
228 } // if
229
230 }
231 finally {
232 if(handler != null) handler.removeStatusListener(statusListener);
233 }// End if else try
234
235 }
236
237 /**
238 * Pattern that matches the beginning of every line in a multi-line
239 * string. The regular expression engine handles the different types
240 * of newline characters (\n, \r\n or \r) automatically.
241 */
242 private static Pattern afterNewlinePattern =
243 Pattern.compile("^", Pattern.MULTILINE);
244
245 /**
246 * Build an array giving the starting character offset of each line in
247 * the document. The HTML parser only reports event positions as line
248 * and column numbers, so we need this information to be able to
249 * correctly infer the repositioning information.
250 *
251 * @param docContent
252 * @return
253 */
254 private int[] buildLineOffsets(String docContent) {
255 Matcher m = afterNewlinePattern.matcher(docContent);
256 // we have to scan the text twice, first to determine how many lines
257 // there are (i.e. how long the array needs to be)...
258 int numMatches = 0;
259 while(m.find()) {
260 if(DEBUG) {
261 System.out.println("found line starting at offset " + m.start());
262 }
263 numMatches++;
264 }
265
266 int[] lineOffsets = new int[numMatches];
267
268 // ... and then again to populate the array with values.
269 m.reset();
270 for(int i = 0; i < lineOffsets.length; i++) {
271 m.find();
272 lineOffsets[i] = m.start();
273 }
274
275 return lineOffsets;
276 }
277
278 /** Initialise this resource, and return it. */
279 public Resource init() throws ResourceInstantiationException {
280 // Register HTML mime type
281 MimeType mime = new MimeType("text", "html");
282 // Register the class handler for this mime type
283 mimeString2ClassHandlerMap.put(mime.getType() + "/" + mime.getSubtype(),
284 this);
285 // Register the mime type with mine string
286 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
287 // Register file sufixes for this mime type
288 suffixes2mimeTypeMap.put("html", mime);
289 suffixes2mimeTypeMap.put("htm", mime);
290 // Register magic numbers for this mime type
291 magic2mimeTypeMap.put("<html", mime);
292 // Set the mimeType for this language resource
293 setMimeType(mime);
294 return this;
295 }// init()
296
}// class NekoHtmlDocumentFormat
|