001 /*
002 * DocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Hamish Cunningham, 25/May/2000
013 *
014 * $Id: DocumentFormat.java 13532 2011-03-15 09:15:35Z markagreenwood $
015 */
016
017 package gate;
018
019 import java.io.*;
020 import java.net.URL;
021 import java.util.*;
022
023 import org.apache.commons.io.IOUtils;
024
025 import gate.corpora.MimeType;
026 import gate.corpora.RepositioningInfo;
027 import gate.creole.AbstractLanguageResource;
028 import gate.event.StatusListener;
029 import gate.util.BomStrippingInputStreamReader;
030 import gate.util.DocumentFormatException;
031
032 /** The format of Documents. Subclasses of DocumentFormat know about
033 * particular MIME types and how to unpack the information in any
034 * markup or formatting they contain into GATE annotations. Each MIME
035 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
036 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
037 * with a static index residing here when they are constructed. Static
038 * getDocumentFormat methods can then be used to get the appropriate
039 * format class for a particular document.
040 */
041 public abstract class DocumentFormat
042 extends AbstractLanguageResource implements LanguageResource{
043 /** Debug flag */
044 private static final boolean DEBUG = false;
045
046 /** The MIME type of this format. */
047 private MimeType mimeType = null;
048
049 /** Map of MimeTypeString to ClassHandler class. This is used to find the
050 * language resource that deals with the specific Document format
051 */
052 protected static Map<String, DocumentFormat>
053 mimeString2ClassHandlerMap = new HashMap();
054 /** Map of MimeType to DocumentFormat Class. This is used to find the
055 * DocumentFormat subclass that deals with a particular MIME type.
056 */
057 protected static Map<String, MimeType>
058 mimeString2mimeTypeMap = new HashMap();
059
060 /** Map of Set of file suffixes to MimeType. This is used to figure
061 * out what MIME type a document is from its file name.
062 */
063 protected static Map<String, MimeType>
064 suffixes2mimeTypeMap = new HashMap();
065
066 /** Map of Set of magic numbers to MimeType. This is used to guess the
067 * MIME type of a document, when we don't have any other clues.
068 */
069 protected static Map<String, MimeType>
070 magic2mimeTypeMap = new HashMap();
071
072 /** Map of markup elements to annotation types. If it is null, the
073 * unpackMarkup() method will convert all markup, using the element names
074 * for annotation types. If it is non-null, only those elements specified
075 * here will be converted.
076 */
077 protected Map markupElementsMap = null;
078
079 /** This map is used inside uppackMarkup() method...
080 * When an element from the map is encounted, The corresponding string
081 * element is added to the document content
082 */
083 protected Map element2StringMap = null;
084
085 /** The features of this resource */
086 private FeatureMap features = null;
087
088 /** Default construction */
089 public DocumentFormat() {}
090
091 /** listeners for status report */
092 private transient Vector statusListeners;
093
094 /** Flag for enable/disable collecting of repositioning information */
095 private Boolean shouldCollectRepositioning = new Boolean(false);
096
097 /** If the document format could collect repositioning information
098 * during the unpack phase this method will return <B>true</B>.
099 * <BR>
100 * You should override this method in the child class of the defined
101 * document format if it could collect the repositioning information.
102 */
103 public Boolean supportsRepositioning() {
104 return new Boolean(false);
105 } // supportsRepositioning
106
107 public void setShouldCollectRepositioning(Boolean b) {
108 if(supportsRepositioning().booleanValue() && b.booleanValue()) {
109 shouldCollectRepositioning = b;
110 }
111 else {
112 shouldCollectRepositioning = new Boolean(false);
113 } // if
114 } // setShouldCollectRepositioning
115
116 public Boolean getShouldCollectRepositioning() {
117 return shouldCollectRepositioning;
118 } //
119
120 /** Unpack the markup in the document. This converts markup from the
121 * native format (e.g. XML, RTF) into annotations in GATE format.
122 * Uses the markupElementsMap to determine which elements to convert, and
123 * what annotation type names to use.
124 */
125 abstract public void unpackMarkup(Document doc)
126 throws DocumentFormatException;
127
128 abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
129 RepositioningInfo ampCodingInfo)
130 throws DocumentFormatException;
131 /** Unpack the markup in the document. This method calls unpackMarkup on the
132 * GATE document, but after it saves its content as a feature atached to
133 * the document. This method is usefull if one wants to save the content
134 * of the document being unpacked. After the markups have been unpacked,
135 * the content of the document will be replaced with a new one containing
136 * the text between markups.
137 *
138 * @param doc the document that will be upacked
139 * @param originalContentFeatureType the name of the feature that will hold
140 * the document's content.
141 */
142 public void unpackMarkup( Document doc,
143 String originalContentFeatureType )
144 throws DocumentFormatException{
145 FeatureMap fm = doc.getFeatures();
146 if (fm == null) fm = Factory.newFeatureMap();
147 fm.put(originalContentFeatureType, doc.getContent().toString());
148 doc.setFeatures(fm);
149 unpackMarkup(doc);
150 }// unpackMarkup();
151
152 /**
153 * Returns a MimeType having as input a fileSufix.
154 * If the file sufix is <b>null</b> or not recognised then,
155 * <b>null</b> will be returned.
156 * @param fileSufix The file sufix associated with a recognisabe mime type.
157 * @return The MimeType associated with this file suffix.
158 */
159 static private MimeType getMimeType(String fileSufix){
160 // Get a mimeType string associated with this fileSuffix
161 // Eg: for html returns MimeType("text/html"), for xml returns
162 // MimeType("text/xml")
163 if(fileSufix == null) return null;
164 return suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
165 }//getMimeType
166
167 /**
168 * Returns a MymeType having as input a URL object. If the MimeType wasn't
169 * recognized it returns <b>null</b>.
170 * @param url The URL object from which the MimeType will be extracted
171 * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
172 * unknown.
173 */
174 static private MimeType getMimeType(URL url) {
175 String mimeTypeString = null;
176 String charsetFromWebServer = null;
177 String contentType = null;
178 InputStream is = null;
179 MimeType mimeTypeFromWebServer = null;
180 MimeType mimeTypeFromFileSuffix = null;
181 MimeType mimeTypeFromMagicNumbers = null;
182 String fileSufix = null;
183
184 if (url == null)
185 return null;
186 // Ask the web server for the content type
187 // We expect to get contentType something like this:
188 // "text/html; charset=iso-8859-1"
189 // Charset is optional
190
191 try {
192 try{
193 is = url.openConnection().getInputStream();
194 contentType = url.openConnection().getContentType();
195 } catch (IOException e){
196 // Failed to get the content type with te Web server.
197 // Let's try some other methods like FileSuffix or magic numbers.
198 }
199 // If a content Type was returned by the server, try to get the mime Type
200 // string
201 // If contentType is something like this:"text/html; charset=iso-8859-1"
202 // try to get content Type string (text/html)
203 if (contentType != null){
204 StringTokenizer st = new StringTokenizer(contentType, ";");
205 // We assume that the first token is the mime type string...
206 // If this doesn't happen then BAD LUCK :(( ...
207 if (st.hasMoreTokens())
208 mimeTypeString = st.nextToken().toLowerCase();
209 // The next token it should be the CharSet
210 if (st.hasMoreTokens())
211 charsetFromWebServer = st.nextToken().toLowerCase();
212 if (charsetFromWebServer != null){
213 //We have something like : "charset=iso-8859-1" and let's extract the
214 // encoding.
215 st = new StringTokenizer(charsetFromWebServer, "=");
216 // Don't need this anymore
217 charsetFromWebServer = null;
218 // Discarding the first token which is : "charset"
219 if (st.hasMoreTokens())
220 st.nextToken();
221 // Get the encoding : "ISO-8859-1"
222 if (st.hasMoreTokens())
223 charsetFromWebServer = st.nextToken().toUpperCase();
224 } // End if
225 }// end if
226 // Return the corresponding MimeType with WebServer from the associated MAP
227 mimeTypeFromWebServer = mimeString2mimeTypeMap.get(mimeTypeString);
228 // Let's try a file suffix detection
229 // Get the file sufix from the URL.See method definition for more details
230 fileSufix = getFileSufix(url);
231 // Get the mime type based on the on file sufix
232 mimeTypeFromFileSuffix = getMimeType(fileSufix);
233
234 // Let's perform a magic numbers guess..
235 mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
236 charsetFromWebServer);
237 }
238 finally {
239 IOUtils.closeQuietly(is); //null safe
240 }
241 //All those types enter into a deciding system
242 return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
243 mimeTypeFromFileSuffix,
244 mimeTypeFromMagicNumbers);
245 }//getMimeType
246
247 /**
248 * This method decides what mimeType is in majority
249 * @param aMimeTypeFromWebServer a MimeType
250 * @param aMimeTypeFromFileSuffix a MimeType
251 * @param aMimeTypeFromMagicNumbers a MimeType
252 * @return the MimeType which occurs most. If all are null, then returns
253 * <b>null</b>
254 */
255 protected static MimeType decideBetweenThreeMimeTypes(
256 MimeType aMimeTypeFromWebServer,
257 MimeType aMimeTypeFromFileSuffix,
258 MimeType aMimeTypeFromMagicNumbers){
259
260 // First a voting system
261 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
262 return aMimeTypeFromFileSuffix;
263 if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
264 return aMimeTypeFromFileSuffix;
265 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
266 return aMimeTypeFromWebServer;
267
268 // 1 is the highest priority
269 if (aMimeTypeFromFileSuffix != null)
270 aMimeTypeFromFileSuffix.addParameter("Priority","1");
271 // 2 is the second priority
272 if (aMimeTypeFromWebServer != null)
273 aMimeTypeFromWebServer.addParameter("Priority","2");
274 // 3 is the third priority
275 if (aMimeTypeFromMagicNumbers != null)
276 aMimeTypeFromMagicNumbers.addParameter("Priority","3");
277
278 return decideBetweenTwoMimeTypes(
279 decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
280 aMimeTypeFromFileSuffix),
281 aMimeTypeFromMagicNumbers);
282
283 }// decideBetweenThreeMimeTypes
284
285 /** Decide between two mimeTypes. The decistion is made on "Priority"
286 * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes
287 * doesn't have "Priority" paramether set, it will return one on them.
288 * @param aMimeType a MimeType object with "Prority" parameter set
289 * @param anotherMimeType a MimeType object with "Prority" parameter set
290 * @return One of the two mime types.
291 */
292 protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
293 MimeType anotherMimeType){
294 if (aMimeType == null) return anotherMimeType;
295 if (anotherMimeType == null) return aMimeType;
296
297 int priority1 = 0;
298 int priority2 = 0;
299 // Both of them are not null
300 if (aMimeType.hasParameter("Priority"))
301 try{
302 priority1 =
303 new Integer(aMimeType.getParameterValue("Priority")).intValue();
304 }catch (NumberFormatException e){
305 return anotherMimeType;
306 }
307 if (anotherMimeType.hasParameter("Priority"))
308 try{
309 priority2 =
310 new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
311 }catch (NumberFormatException e){
312 return aMimeType;
313 }
314
315 // The lower the number, the highest the priority
316 if (priority1 <= priority2)
317 return aMimeType;
318 else
319 return anotherMimeType;
320 }// decideBetweenTwoMimeTypes
321
322 /**
323 * Tests if two MimeType objects are equal.
324 * @return true only if boths MimeType objects are different than <b>null</b>
325 * and their Types and Subtypes are equals. The method is case sensitive.
326 */
327 protected static boolean areEqual( MimeType aMimeType,
328 MimeType anotherMimeType){
329 if (aMimeType == null || anotherMimeType == null)
330 return false;
331
332 if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
333 aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
334 ) return true;
335 else
336 return false;
337 }// are Equal
338
339 /**
340 * This method tries to guess the mime Type using some magic numbers.
341 * @param aInputStream a InputStream which has to be transformed into a
342 * InputStreamReader
343 * @param anEncoding the encoding. If is null or unknown then a
344 * InputStreamReader with default encodings will be created.
345 * @return the mime type associated with magic numbers
346 */
347 protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
348 String anEncoding){
349
350 if (aInputStream == null) return null;
351 Reader reader = null;
352 if (anEncoding != null)
353 try{
354 reader = new BomStrippingInputStreamReader(aInputStream, anEncoding);
355 } catch (UnsupportedEncodingException e){
356 reader = null;
357 }
358 if (reader == null)
359 // Create a reader with the default encoding system
360 reader = new BomStrippingInputStreamReader(aInputStream);
361
362 // We have a input stream reader
363 return runMagicNumbers(reader);
364 }//guessTypeUsingMagicNumbers
365
366 /** Performs magic over Gate Document */
367 protected static MimeType runMagicNumbers(Reader aReader) {
368 // No reader, nothing to detect
369 if( aReader == null) return null;
370
371 // Prepare to run the magic stuff
372 String strBuffer = null;
373 int bufferSize = 2048;
374 int charReads = 0;
375 char[] cbuf = new char[bufferSize];
376
377 try {
378 charReads = aReader.read(cbuf,0,bufferSize);
379 } catch (IOException e){
380 return null;
381 }// End try
382
383 if (charReads == -1)
384 // the document is empty
385 return null;
386
387 // Create a string form the buffer and perform some search on it.
388 strBuffer = new String(cbuf,0,charReads);
389
390 // If this fails then surrender
391 return getTypeFromContent(strBuffer);
392 }// runMagicNumbers
393
394 private static MimeType getTypeFromContent(String aContent){
395 MimeType detectedMimeType = null;
396 // Detect whether or not is a GateXmlDocument
397 // ian_roberts - moved to XmlDocumentFormat where it belongs
398 //if ( aContent.indexOf("<GateDocument") != -1 ||
399 // aContent.indexOf(" GateDocument") != -1)
400 // isGateXmlDocument = true;
401 //else
402 // isGateXmlDocument = false;
403
404 // Run the magic numbers test
405 Set<String> magicSet = magic2mimeTypeMap.keySet();
406 Iterator<String> iterator=magicSet.iterator();
407 String magic;
408 // change case to cover more variants
409 aContent = aContent.toLowerCase();
410 while (iterator.hasNext()){
411 magic = iterator.next().toLowerCase();
412 if (aContent.indexOf(magic) != -1)
413 detectedMimeType = magic2mimeTypeMap.get(magic);
414 }// End while
415
416 // If this fails then surrender
417 return detectedMimeType;
418 }// getTypeFromContent
419
420 /**
421 * Return the fileSuffix or null if the url doesn't have a file suffix
422 * If the url is null then the file suffix will be null also
423 */
424 private static String getFileSufix(URL url){
425 String fileName = null;
426 String fileSuffix = null;
427
428 // GIGO test (garbage in garbage out)
429 if (url != null){
430 // get the file name from the URL
431 fileName = url.getFile();
432
433 // tokenize this file name with "." as separator...
434 // the last token will be the file suffix
435 StringTokenizer st = new StringTokenizer(fileName,".");
436
437 // fileSuffix is the last token
438 while (st.hasMoreTokens())
439 fileSuffix = st.nextToken();
440 // here fileSuffix is the last token
441 } // End if
442 return fileSuffix;
443 }//getFileSufix
444
445 /**
446 * Find a DocumentFormat implementation that deals with a particular
447 * MIME type, given that type.
448 * @param aGateDocument this document will receive as a feature
449 * the associated Mime Type. The name of the feature is
450 * MimeType and its value is in the format type/subtype
451 * @param mimeType the mime type that is given as input
452 */
453 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
454 MimeType mimeType){
455 FeatureMap aFeatureMap = null;
456 if(mimeType == null) {
457 String content = aGateDocument.getContent().toString();
458 // reduce size for better performance
459 if(content.length() > 2048) content = content.substring(0, 2048);
460 mimeType = getTypeFromContent( content );
461 }
462
463 if (mimeType != null){
464 // If the Gate Document doesn't have a feature map atached then
465 // We will create and set one.
466 if(aGateDocument.getFeatures() == null){
467 aFeatureMap = Factory.newFeatureMap();
468 aGateDocument.setFeatures(aFeatureMap);
469 }// end if
470 aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
471 mimeType.getSubtype());
472
473 return mimeString2ClassHandlerMap.get(mimeType.getType()
474 + "/" + mimeType.getSubtype());
475 }// end If
476 return null;
477 } // getDocumentFormat(aGateDocument, MimeType)
478
479 /**
480 * Find a DocumentFormat implementation that deals with a particular
481 * MIME type, given the file suffix (e.g. ".txt") that the document came
482 * from.
483 * @param aGateDocument this document will receive as a feature
484 * the associated Mime Type. The name of the feature is
485 * MimeType and its value is in the format type/subtype
486 * @param fileSuffix the file suffix that is given as input
487 */
488 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
489 String fileSuffix) {
490 return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
491 } // getDocumentFormat(String)
492
493 /**
494 * Find a DocumentFormat implementation that deals with a particular
495 * MIME type, given the URL of the Document. If it is an HTTP URL, we
496 * can ask the web server. If it has a recognised file extension, we
497 * can use that. Otherwise we need to use a map of magic numbers
498 * to MIME types to guess the type, and then look up the format using the
499 * type.
500 * @param aGateDocument this document will receive as a feature
501 * the associated Mime Type. The name of the feature is
502 * MimeType and its value is in the format type/subtype
503 * @param url the URL that is given as input
504 */
505 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
506 URL url) {
507 return getDocumentFormat(aGateDocument, getMimeType(url));
508 } // getDocumentFormat(URL)
509
510 /** Get the feature set */
511 public FeatureMap getFeatures() { return features; }
512
513 /** Get the markup elements map */
514 public Map getMarkupElementsMap() { return markupElementsMap; }
515
516 /** Get the element 2 string map */
517 public Map getElement2StringMap() { return element2StringMap; }
518
519 /** Set the markup elements map */
520 public void setMarkupElementsMap(Map markupElementsMap) {
521 this.markupElementsMap = markupElementsMap;
522 }
523
524 /** Set the element 2 string map */
525 public void setElement2StringMap(Map anElement2StringMap) {
526 element2StringMap = anElement2StringMap;
527 }
528
529 /** Set the features map*/
530 public void setFeatures(FeatureMap features){this.features = features;}
531
532 /** Set the mime type*/
533
534 public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
535 /** Gets the mime Type*/
536 public MimeType getMimeType(){return mimeType;}
537
538
539 /**
540 * Utility method to get a {@link MimeType} given the type string.
541 */
542 public static MimeType getMimeTypeForString(String typeString) {
543 return mimeString2mimeTypeMap.get(typeString);
544 }
545
546 /**
547 * Utility method to get the set of all file suffixes that are registered
548 * with this class.
549 */
550 public static Set<String> getSupportedFileSuffixes() {
551 return Collections.unmodifiableSet(suffixes2mimeTypeMap.keySet());
552 }
553
554 //StatusReporter Implementation
555
556
557 public synchronized void removeStatusListener(StatusListener l) {
558 if (statusListeners != null && statusListeners.contains(l)) {
559 Vector v = (Vector) statusListeners.clone();
560 v.removeElement(l);
561 statusListeners = v;
562 }
563 }
564 public synchronized void addStatusListener(StatusListener l) {
565 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
566 if (!v.contains(l)) {
567 v.addElement(l);
568 statusListeners = v;
569 }
570 }
571 protected void fireStatusChanged(String e) {
572 if (statusListeners != null) {
573 Vector listeners = statusListeners;
574 int count = listeners.size();
575 for (int i = 0; i < count; i++) {
576 ((StatusListener) listeners.elementAt(i)).statusChanged(e);
577 }
578 }
579 }
580
581 } // class DocumentFormat
|