001 /*
002 * SgmlDocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 4/July/2000
013 *
014 * $Id: SgmlDocumentFormat.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.corpora;
018
019 import java.io.IOException;
020
021 import javax.xml.parsers.*;
022
023 import org.xml.sax.SAXException;
024
025 import gate.Document;
026 import gate.Resource;
027 import gate.creole.ResourceInstantiationException;
028 import gate.creole.metadata.AutoInstance;
029 import gate.creole.metadata.CreoleResource;
030 import gate.event.StatusListener;
031 import gate.sgml.Sgml2Xml;
032 import gate.util.DocumentFormatException;
033 import gate.xml.XmlDocumentHandler;
034
035 /** The format of Documents. Subclasses of DocumentFormat know about
036 * particular MIME types and how to unpack the information in any
037 * markup or formatting they contain into GATE annotations. Each MIME
038 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
039 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
040 * with a static index residing here when they are constructed. Static
041 * getDocumentFormat methods can then be used to get the appropriate
042 * format class for a particular document.
043 */
044 @CreoleResource(name = "GATE SGML Document Format", isPrivate = true,
045 autoinstances = {@AutoInstance(hidden = true)})
046 public class SgmlDocumentFormat extends TextualDocumentFormat
047 {
048 /** Debug flag */
049 private static final boolean DEBUG = false;
050
051 /** Default construction */
052 public SgmlDocumentFormat() { super(); }
053
054 /** Unpack the markup in the document. This converts markup from the
055 * native format (e.g. SGML) into annotations in GATE format.
056 * Uses the markupElementsMap to determine which elements to convert, and
057 * what annotation type names to use.
058 * The doc's content is first converted to a wel formed XML.
059 * If this succeddes then the document is saved into a temp file and parsed
060 * as an XML document.
061 *
062 * @param doc The gate document you want to parse.
063 *
064 */
065 public void unpackMarkup(Document doc) throws DocumentFormatException{
066 if ( (doc == null) ||
067 (doc.getSourceUrl() == null && doc.getContent() == null)){
068
069 throw new DocumentFormatException(
070 "GATE document is null or no content found. Nothing to parse!");
071 }// End if
072 // Create a status listener
073 StatusListener statusListener = new StatusListener(){
074 public void statusChanged(String text){
075 fireStatusChanged(text);
076 }
077 };
078 XmlDocumentHandler xmlDocHandler = null;
079 try {
080 Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
081
082 fireStatusChanged("Performing SGML to XML...");
083
084 // convert the SGML document
085 String xmlUri = sgml2Xml.convert();
086
087 fireStatusChanged("DONE !");
088
089 //Out.println("Conversion done..." + xmlUri);
090 //Out.println(sgml2Xml.convert());
091 // Get a parser factory.
092 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
093 // Set up the factory to create the appropriate type of parser
094
095 // Set up the factory to create the appropriate type of parser
096 // non validating one
097 saxParserFactory.setValidating(false);
098 // non namesapace aware one
099 saxParserFactory.setNamespaceAware(true);
100
101 // Create a SAX parser
102 SAXParser parser = saxParserFactory.newSAXParser();
103
104 // use it
105 if (null != doc){
106 // create a new Xml document handler
107 xmlDocHandler = new XmlDocumentHandler(doc,
108 this.markupElementsMap,
109 this.element2StringMap);
110
111 // register a status listener with it
112 xmlDocHandler.addStatusListener(statusListener);
113
114 parser.parse(xmlUri, xmlDocHandler);
115 ((DocumentImpl) doc).setNextAnnotationId(
116 xmlDocHandler.getCustomObjectsId());
117 }// end if
118 } catch (ParserConfigurationException e){
119 throw
120 new DocumentFormatException("XML parser configuration exception ", e);
121 } catch (SAXException e){
122 throw new DocumentFormatException(e);
123 } catch (IOException e){
124 throw new DocumentFormatException("I/O exception for " +
125 doc.getSourceUrl().toString());
126 }finally{
127 if (xmlDocHandler != null)
128 xmlDocHandler.removeStatusListener(statusListener);
129 }// End try
130
131 }// unpackMarkup
132
133 /** This method converts the document's content from SGML 2 XML.*/
134 private String sgml2Xml(Document doc) {
135 String xmlUri = doc.getSourceUrl().toString ();
136
137 return xmlUri;
138 }// sgml2Xml()
139
140 /** Initialise this resource, and return it. */
141 public Resource init() throws ResourceInstantiationException{
142 // Register SGML mime type
143 MimeType mime = new MimeType("text","sgml");
144 // Register the class handler for this mime type
145 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
146 this);
147 // Register the mime type with mine string
148 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
149 // Register file sufixes for this mime type
150 suffixes2mimeTypeMap.put("sgm",mime);
151 suffixes2mimeTypeMap.put("sgml",mime);
152 setMimeType(mime);
153 return this;
154 }// init
155
156 }//class SgmlDocumentFormat
|