001 /*
002 * XmlDocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 26/May/2000
013 *
014 * $Id: XmlDocumentFormat.java 12919 2010-08-03 10:31:37Z valyt $
015 */
016
017 package gate.corpora;
018
019 // import com.sun.xml.parser.* ;
020 import gate.Document;
021 import gate.GateConstants;
022 import gate.Resource;
023 import gate.TextualDocument;
024 import gate.creole.ResourceInstantiationException;
025 import gate.creole.metadata.AutoInstance;
026 import gate.creole.metadata.CreoleResource;
027 import gate.event.StatusListener;
028 import gate.util.DocumentFormatException;
029 import gate.util.Out;
030 import gate.xml.XmlDocumentHandler;
031
032 import java.io.IOException;
033 import java.io.InputStream;
034 import java.io.InputStreamReader;
035 import java.io.Reader;
036 import java.io.StringReader;
037
038 import javax.xml.parsers.ParserConfigurationException;
039 import javax.xml.parsers.SAXParser;
040 import javax.xml.parsers.SAXParserFactory;
041 import javax.xml.stream.XMLInputFactory;
042 import javax.xml.stream.XMLStreamException;
043 import javax.xml.stream.XMLStreamReader;
044
045 import org.xml.sax.InputSource;
046 import org.xml.sax.SAXException;
047
048 // import org.w3c.www.mime.*;
049
/**
 * Document format for XML documents. GATE's own standoff XML format is
 * read with a StAX parser; any other XML is read with a SAX parser.
 * <p>
 * Subclasses of DocumentFormat know about particular MIME types and how
 * to unpack the information in any markup or formatting they contain
 * into GATE annotations. Each MIME type has its own subclass of
 * DocumentFormat, e.g. XmlDocumentFormat, RtfDocumentFormat,
 * MpegDocumentFormat. These classes register themselves with a static
 * index residing in DocumentFormat when they are initialised (see
 * {@link #init()}). Static getDocumentFormat methods can then be used to
 * get the appropriate format class for a particular document.
 */
060 @CreoleResource(name = "GATE XML Document Format", isPrivate = true,
061 autoinstances = {@AutoInstance(hidden = true)})
062 public class XmlDocumentFormat extends TextualDocumentFormat {
/** Debug flag. Not referenced by any code visible in this class. */
private static final boolean DEBUG = false;

/**
 * InputFactory for the StAX parser used for GATE format XML. Starts out
 * <code>null</code>; it is created and configured on first use by
 * <code>getInputFactory()</code> and, being static, is then shared by
 * every instance of this class.
 */
private static XMLInputFactory staxFactory;
070
071 /** Default construction */
072 public XmlDocumentFormat() {
073 super();
074 }
075
076 /** We could collect repositioning information during XML parsing */
077 public Boolean supportsRepositioning() {
078 return new Boolean(true);
079 } // supportsRepositioning
080
081 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
082 public void unpackMarkup(Document doc) throws DocumentFormatException {
083 unpackMarkup(doc, (RepositioningInfo)null, (RepositioningInfo)null);
084 } // unpackMarkup
085
086 /**
087 * Unpack the markup in the document. This converts markup from the
088 * native format (e.g. XML) into annotations in GATE format. Uses the
089 * markupElementsMap to determine which elements to convert, and what
090 * annotation type names to use. If the document was created from a
091 * String, then is recomandable to set the doc's sourceUrl to <b>null</b>.
092 * So, if the document has a valid URL, then the parser will try to
093 * parse the XML document pointed by the URL.If the URL is not valid,
094 * or is null, then the doc's content will be parsed. If the doc's
095 * content is not a valid XML then the parser might crash.
096 *
097 * @param doc The gate document you want to parse. If
098 * <code>doc.getSourceUrl()</code> returns <b>null</b>
099 * then the content of doc will be parsed. Using a URL is
100 * recomended because the parser will report errors corectlly
101 * if the XML document is not well formed.
102 */
103 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
104 RepositioningInfo ampCodingInfo) throws DocumentFormatException {
105 if((doc == null)
106 || (doc.getSourceUrl() == null && doc.getContent() == null)) {
107
108 throw new DocumentFormatException(
109 "GATE document is null or no content found. Nothing to parse!");
110 }// End if
111
112 // Create a status listener
113 StatusListener statusListener = new StatusListener() {
114 public void statusChanged(String text) {
115 // This is implemented in DocumentFormat.java and inherited here
116 fireStatusChanged(text);
117 }
118 };
119
120 // determine whether we have a GATE format XML document or another
121 // kind
122 String content = doc.getContent().toString();
123 if(content.length() > 2048) {
124 content = content.substring(0, 2048);
125 }
126 boolean gateFormat = isGateXmlFormat(content);
127
128 if(gateFormat) {
129 unpackGateFormatMarkup(doc, statusListener);
130 }
131 else {
132 unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
133 }
134 }
135
136 /**
137 * Unpacks markup in the GATE-specific standoff XML markup format.
138 *
139 * @param doc the document to process
140 * @param statusListener optional status listener to receive status
141 * messages
142 * @throws DocumentFormatException if a fatal error occurs during
143 * parsing
144 */
145 private void unpackGateFormatMarkup(Document doc,
146 StatusListener statusListener) throws DocumentFormatException {
147 boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
148
149 try {
150 Reader inputReader = null;
151 InputStream inputStream = null;
152 XMLStreamReader xsr = null;
153 if(docHasContentButNoValidURL) {
154 inputReader = new StringReader(doc.getContent().toString());
155 xsr = getInputFactory().createXMLStreamReader(inputReader);
156 }
157 else if(doc instanceof TextualDocument) {
158 String encoding = ((TextualDocument)doc).getEncoding();
159 // Don't strip BOM on XML.
160 inputReader = new InputStreamReader(doc.getSourceUrl().openStream(),
161 encoding);
162 // create stream reader with the URL as system ID, to support
163 // relative URLs to e.g. DTD or external entities
164 xsr = getInputFactory().createXMLStreamReader(
165 doc.getSourceUrl().toExternalForm(), inputReader);
166 }
167 else {
168 // not a TextualDocument, so let parser determine encoding
169 inputStream = doc.getSourceUrl().openStream();
170 xsr = getInputFactory().createXMLStreamReader(
171 doc.getSourceUrl().toExternalForm(), inputStream);
172 }
173
174 // find the opening GateDocument tag
175 xsr.nextTag();
176
177 // parse the document
178 try {
179 DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
180 }
181 finally {
182 xsr.close();
183 if(inputStream != null) {
184 inputStream.close();
185 }
186 if(inputReader != null) {
187 inputReader.close();
188 }
189 }
190 }
191 catch(XMLStreamException e) {
192 doc.getFeatures().put("parsingError", Boolean.TRUE);
193
194 Boolean bThrow = (Boolean)doc.getFeatures().get(
195 GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
196
197 if(bThrow != null && bThrow.booleanValue()) {
198 // the next line is commented to avoid Document creation fail on
199 // error
200 throw new DocumentFormatException(e);
201 }
202 else {
203 Out.println("Warning: Document remains unparsed. \n"
204 + "\n Stack Dump: ");
205 e.printStackTrace(Out.getPrintWriter());
206 } // if
207 }
208 catch(IOException ioe) {
209 throw new DocumentFormatException("I/O exception for "
210 + doc.getSourceUrl().toString(), ioe);
211 }
212 }
213
214 /**
215 * Returns the StAX input factory, creating one if it is currently
216 * null.
217 *
218 * @return <code>staxFactory</code>
219 * @throws XMLStreamException
220 */
221 private static XMLInputFactory getInputFactory() throws XMLStreamException {
222 if(staxFactory == null) {
223 staxFactory = XMLInputFactory.newInstance();
224 staxFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
225 staxFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
226 staxFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,
227 Boolean.TRUE);
228 staxFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
229 Boolean.TRUE);
230 }
231 return staxFactory;
232 }
233
234 /**
235 * Unpack markup from any XML format. The XML elements are translated
236 * to annotations on the Original markups annotation set.
237 *
238 * @param doc the document to process
239 * @throws DocumentFormatException
240 */
241 private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo,
242 RepositioningInfo ampCodingInfo, StatusListener statusListener)
243 throws DocumentFormatException {
244 boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
245
246 XmlDocumentHandler xmlDocHandler = null;
247 try {
248 // use Xerces XML parser with JAXP
249 // System.setProperty("javax.xml.parsers.SAXParserFactory",
250 // "org.apache.xerces.jaxp.SAXParserFactoryImpl");
251 // Get a parser factory.
252 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
253 // Set up the factory to create the appropriate type of parser
254 // non validating one
255 saxParserFactory.setValidating(false);
256 // non namesapace aware one
257 saxParserFactory.setNamespaceAware(true);
258 // create it
259 SAXParser xmlParser = saxParserFactory.newSAXParser();
260
261 // Create a new Xml document handler
262 xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap,
263 this.element2StringMap);
264 // Register a status listener with it
265 xmlDocHandler.addStatusListener(statusListener);
266 // set repositioning object
267 xmlDocHandler.setRepositioningInfo(repInfo);
268 // set the object with ampersand coding positions
269 xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
270
271 org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
272 // Set up the factory to create the appropriate type of parser
273 // non validating one
274 // http://xml.org/sax/features/validation set to false
275 newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
276 // namesapace aware one
277 // http://xml.org/sax/features/namespaces set to true
278 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
279 newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes",
280 true);
281 newxmlParser.setContentHandler(xmlDocHandler);
282 newxmlParser.setErrorHandler(xmlDocHandler);
283 newxmlParser.setDTDHandler(xmlDocHandler);
284 newxmlParser.setEntityResolver(xmlDocHandler);
285 // Parse the XML Document with the appropriate encoding
286 Reader docReader = null;
287 try{
288 InputSource is;
289 if(docHasContentButNoValidURL) {
290 // no URL, so parse from string
291 is = new InputSource(new StringReader(doc.getContent().toString()));
292 }
293 else if(doc instanceof TextualDocument) {
294 // textual document - load with user specified encoding
295 String docEncoding = ((TextualDocument)doc).getEncoding();
296 // don't strip BOM on XML.
297 docReader = new InputStreamReader(doc.getSourceUrl()
298 .openStream(), docEncoding);
299 is = new InputSource(docReader);
300 // must set system ID to allow relative URLs (e.g. to a DTD) to
301 // work
302 is.setSystemId(doc.getSourceUrl().toString());
303 }
304 else {
305 // let the parser decide the encoding
306 is = new InputSource(doc.getSourceUrl().toString());
307 }
308 newxmlParser.parse(is);
309 }finally{
310 //make sure the open streams are closed
311 if(docReader != null) docReader.close();
312 }
313 // Angel - end
314 ((DocumentImpl)doc).setNextAnnotationId(xmlDocHandler
315 .getCustomObjectsId());
316 }
317 catch(ParserConfigurationException e) {
318 throw new DocumentFormatException("XML parser configuration exception ",
319 e);
320 }
321 catch(SAXException e) {
322 doc.getFeatures().put("parsingError", Boolean.TRUE);
323
324 Boolean bThrow = (Boolean)doc.getFeatures().get(
325 GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
326
327 if(bThrow != null && bThrow.booleanValue()) {
328 // the next line is commented to avoid Document creation fail on
329 // error
330 throw new DocumentFormatException(e);
331 }
332 else {
333 Out.println("Warning: Document remains unparsed. \n"
334 + "\n Stack Dump: ");
335 e.printStackTrace(Out.getPrintWriter());
336 } // if
337
338 }
339 catch(IOException e) {
340 throw new DocumentFormatException("I/O exception for "
341 + doc.getSourceUrl().toString(), e);
342 }
343 finally {
344 if(xmlDocHandler != null)
345 xmlDocHandler.removeStatusListener(statusListener);
346 }// End if else try
347 }// unpackMarkup
348
349 /**
350 * Determine whether the given document content string represents a
351 * GATE custom format XML document.
352 */
353 protected static boolean isGateXmlFormat(String content) {
354 return (content.indexOf("<GateDocument") != -1 || content
355 .indexOf(" GateDocument") != -1);
356 }
357
358 /** Initialise this resource, and return it. */
359 public Resource init() throws ResourceInstantiationException {
360 // Register XML mime type
361 MimeType mime = new MimeType("text", "xml");
362 // Register the class handler for this mime type
363 mimeString2ClassHandlerMap.put(mime.getType() + "/" + mime.getSubtype(),
364 this);
365 // Register the mime type with mine string
366 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
367 // sometimes XML file appear as application/xml
368 mimeString2mimeTypeMap.put("application/xml", mime);
369 // Register file sufixes for this mime type
370 suffixes2mimeTypeMap.put("xml", mime);
371 suffixes2mimeTypeMap.put("xhtm", mime);
372 suffixes2mimeTypeMap.put("xhtml", mime);
373 // Register magic numbers for this mime type
374 magic2mimeTypeMap.put("<?xml", mime);
375 // Set the mimeType for this language resource
376 setMimeType(mime);
377 return this;
378 }// init()
379
380 }// class XmlDocumentFormat
|