001 /*
002 * TextualDocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 26/May/2000
013 *
014 * $Id: TextualDocumentFormat.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.corpora;
018
019 import java.io.IOException;
020
021 import gate.*;
022 import gate.creole.ResourceInstantiationException;
023 import gate.creole.metadata.AutoInstance;
024 import gate.creole.metadata.CreoleResource;
025 import gate.util.DocumentFormatException;
026
027 //import org.w3c.www.mime.*;
028
029 /** The format of Documents. Subclasses of DocumentFormat know about
030 * particular MIME types and how to unpack the information in any
031 * markup or formatting they contain into GATE annotations. Each MIME
032 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
033 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
034 * with a static index residing here when they are constructed. Static
035 * getDocumentFormat methods can then be used to get the appropriate
036 * format class for a particular document.
037 */
038 @CreoleResource(name = "GATE Textual Document Format", isPrivate = true,
039 autoinstances = {@AutoInstance(hidden = true)})
040 public class TextualDocumentFormat extends DocumentFormat
041 {
042
043 /** Debug flag */
044 private static final boolean DEBUG = false;
045
046 /** Default construction */
047 public TextualDocumentFormat() { super(); }
048
049 /** Initialise this resource, and return it. */
050 public Resource init() throws ResourceInstantiationException{
051 // Register plain text mime type
052 MimeType mime = new MimeType("text","plain");
053 // Register the class handler for this mime type
054 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
055 this);
056 // Register the mime type with mine string
057 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
058 // Register file sufixes for this mime type
059 suffixes2mimeTypeMap.put("txt",mime);
060 suffixes2mimeTypeMap.put("text",mime);
061 // Set the mimeType for this language resource
062 setMimeType(mime);
063 return this;
064 } // init()
065
066 /** Unpack the markup in the document. This converts markup from the
067 * native format (e.g. XML, RTF) into annotations in GATE format.
068 * Uses the markupElementsMap to determine which elements to convert, and
069 * what annotation type names to use.
070 */
071 public void unpackMarkup(Document doc) throws DocumentFormatException{
072 if (doc == null || doc.getContent() == null) return;
073 setNewLineProperty(doc);
074 // Create paragraph annotations in the specified annotation set
075 int endOffset = doc.getContent().toString().length();
076 int startOffset = 0;
077 annotateParagraphs(doc,startOffset,endOffset,
078 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
079 }//unpackMarkup
080
081 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
082 RepositioningInfo ampCodingInfo)
083 throws DocumentFormatException {
084 unpackMarkup(doc);
085 } // unpackMarkup
086
087 /**
088 * This is a test to see if the GATE document has a valid URL or a
089 * valid content.
090 *
091 * @param doc
092 * @throws DocumentFormatException
093 */
094 protected static boolean hasContentButNoValidUrl(Document doc)
095 throws DocumentFormatException {
096 try {
097 if(doc.getSourceUrl() == null && doc.getContent() != null) {
098 // The doc's url is null but there is a content.
099 return true;
100 }
101 else {
102 doc.getSourceUrl().openConnection();
103 }
104 }
105 catch(IOException ex1) {
106 // The URL is not null but is not valid.
107 if(doc.getContent() == null)
108 // The document content is also null. There is nothing we can do.
109 throw new DocumentFormatException("The document doesn't have a"
110 + " valid URL and also no content");
111 return true;
112 }// End try
113
114 return false;
115 }
116
117
118 /**
119 * Check the new line sequence and set document property.
120 * <BR>
121 * Possible values are CRLF, LFCR, CR, LF
122 */
123 protected void setNewLineProperty(Document doc) {
124 String content = doc.getContent().toString();
125 String newLineType = "";
126
127 char ch = ' ';
128 char lastch = ' ';
129 for(int i=0; i < content.length(); ++i) {
130 ch = content.charAt(i);
131 if(lastch == '\r') {
132 if(ch == '\n') {
133 newLineType = "CRLF";
134 break;
135 }
136 else {
137 newLineType = "CR";
138 break;
139 }
140 }
141 if(lastch == '\n') {
142 if(ch == '\r') {
143 newLineType = "LFCR";
144 break;
145 }
146 else {
147 newLineType = "LF";
148 break;
149 }
150 }
151 lastch = ch;
152 } // for
153
154 doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
155 } // setNewLineProperty()
156
157 /** Delete '\r' in combination CRLF or LFCR in document content */
158 private void removeExtraNewLine(Document doc) {
159 String content = doc.getContent().toString();
160 StringBuffer buff = new StringBuffer(content);
161
162 char ch = ' ';
163 char lastch = ' ';
164 for(int i=content.length()-1; i > -1; --i) {
165 ch = content.charAt(i);
166 if(ch == '\n' && lastch == '\r') {
167 buff.deleteCharAt(i+1);
168 }
169 if(ch == '\r' && lastch == '\n') {
170 buff.deleteCharAt(i);
171 ch = lastch;
172 }
173 lastch = ch;
174 } // for
175
176 doc.setContent(new DocumentContentImpl(buff.toString()));
177 } // removeExtraNewLine(Document doc)
178
179 /** This method annotates paragraphs in a GATE document. The investigated text
180 * spans beetween start and end offsets and the paragraph annotations are
181 * created in the annotSetName. If annotSetName is null then they are creted
182 * in the default annotation set.
183 * @param aDoc is the gate document on which the paragraph detection would
184 * be performed.If it is null or its content it's null then the method woul
185 * simply return doing nothing.
186 * @param startOffset is the index form the document content from which the
187 * paragraph detection will start
188 * @param endOffset is the offset where the detection will end.
189 * @param annotSetName is the name of the set in which paragraph annotation
190 * would be created.The annotation type created will be "paragraph"
191 */
192 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
193 String annotSetName)throws DocumentFormatException{
194 // Simply return if the document is null or its content
195 if (aDoc == null || aDoc.getContent() == null) return;
196 // Simply return if the start is > than the end
197 if (startOffset > endOffset) return;
198 // Decide where to put the newly detected annotations
199 AnnotationSet annotSet = null;
200 if (annotSetName == null)
201 annotSet = aDoc.getAnnotations();
202 else
203 annotSet = aDoc.getAnnotations(annotSetName);
204 // Extract the document content
205 String content = aDoc.getContent().toString();
206 // This is the offset marking the start of a para
207 int startOffsetPara = startOffset;
208 // This marks the ned of a para
209 int endOffsetPara = endOffset;
210 // The initial sate of the FSA
211 int state = 1;
212 // This field marks that a BR entity was read
213 // A BR entity can be NL or NL CR, depending on the operating system (UNIX
214 // or DOS)
215 boolean readBR = false;
216 int index = startOffset;
217 while (index < endOffset){
218 // Read the current char
219 char ch = content.charAt(index);
220 // Test if a BR entity was read
221 if (ch =='\n'){
222 readBR = true;
223 // If \n is followed by a \r then advance the index in order to read a
224 // BR entity
225 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
226 index ++;
227 }// End if
228 switch(state){
229 // It is the initial and also a final state
230 // Stay in state 1 while it reads whitespaces
231 case 1:{
232 // If reads a non whitespace char then move to state 2 and record
233 // the beggining of a paragraph
234 if (!Character.isWhitespace(ch)){
235 state = 2;
236 startOffsetPara = index;
237 }// End if
238 }break;
239 // It can be also a final state.
240 case 2:{
241 // Stay in state 2 while reading chars != BR entities
242 if (readBR){
243 // If you find a BR char go to state 3. The possible end of the para
244 // can be index. This will be confirmed by state 3. So, this is why
245 // the end of a para is recorded here.
246 readBR = false;
247 endOffsetPara = index;
248 state = 3;
249 }// End if
250 }break;
251 // It can be also a final state
252 // From state 3 there are only 2 possible ways: (state 2 or state1)
253 // In state 1 it needs to read a BR
254 // For state 2 it nead to read something different then a BR
255 case 3:{
256 if (readBR){
257 // A BR was read. Go to state 1
258 readBR = false;
259 state = 1;
260 // Create an annotation type paragraph
261 try{
262 annotSet.add( new Long(startOffsetPara),
263 new Long(endOffsetPara),
264 "paragraph",
265 Factory.newFeatureMap());
266 } catch (gate.util.InvalidOffsetException ioe){
267 throw new DocumentFormatException("Coudn't create a paragraph"+
268 " annotation",ioe);
269 }// End try
270 }else{
271 // Go to state 2 an keep reading chars
272 state = 2;
273 }// End if
274 }break;
275 }// End switch
276 // Prepare to read the next char.
277 index ++;
278 }// End while
279 endOffsetPara = index;
280 // Investigate where the finite automata has stoped
281 if ( state==2 || state==3 ){
282 // Create an annotation type paragraph
283 try{
284 annotSet.add( new Long(startOffsetPara),
285 // Create the final annotation using the endOffset
286 new Long(endOffsetPara),
287 "paragraph",
288 Factory.newFeatureMap());
289 } catch (gate.util.InvalidOffsetException ioe){
290 throw new DocumentFormatException("Coudn't create a paragraph"+
291 " annotation",ioe);
292 }// End try
293 }// End if
294 }// End annotateParagraphs();
295
296 public DataStore getDataStore(){ return null;}
297
298 } // class TextualDocumentFormat
|