001 /*
002 * EmailDocumentFormat.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 3/Aug/2000
013 *
014 * $Id: EmailDocumentFormat.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.corpora;
018
019 import java.io.IOException;
020 import java.util.Iterator;
021
022 import gate.*;
023 import gate.creole.ResourceInstantiationException;
024 import gate.creole.metadata.AutoInstance;
025 import gate.creole.metadata.CreoleResource;
026 import gate.email.EmailDocumentHandler;
027 import gate.event.StatusListener;
028 import gate.util.DocumentFormatException;
029 import gate.util.InvalidOffsetException;
030
031 //import org.w3c.www.mime.*;
032
033 /** The format of Documents. Subclasses of DocumentFormat know about
034 * particular MIME types and how to unpack the information in any
035 * markup or formatting they contain into GATE annotations. Each MIME
036 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
037 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
038 * with a static index residing here when they are constructed. Static
039 * getDocumentFormat methods can then be used to get the appropriate
040 * format class for a particular document.
041 */
042 @CreoleResource(name = "GATE EMAIL Document Format", isPrivate = true,
043 autoinstances = {@AutoInstance(hidden = true)})
044 public class EmailDocumentFormat extends TextualDocumentFormat
045 {
046 /** Debug flag */
047 private static final boolean DEBUG = false;
048
049 /** Default construction */
050 public EmailDocumentFormat() { super();}
051
052 /** Unpack the markup in the document. This converts markup from the
053 * native format (e.g. EMAIL) into annotations in GATE format.
054 * Uses the markupElementsMap to determine which elements to convert, and
055 * what annotation type names to use.
056 * It always tryes to parse te doc's content. It doesn't matter if the
057 * sourceUrl is null or not.
058 *
059 * @param doc The gate document you want to parse.
060 *
061 */
062
063 public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
064 if ( (doc == null) ||
065 (doc.getSourceUrl() == null && doc.getContent() == null)){
066
067 throw new DocumentFormatException(
068 "GATE document is null or no content found. Nothing to parse!");
069 }// End if
070
071 setNewLineProperty(doc);
072
073 // create an EmailDocumentHandler
074 EmailDocumentHandler emailDocHandler = null;
075 emailDocHandler = new gate.email.EmailDocumentHandler(
076 doc,
077 this.markupElementsMap,
078 this.element2StringMap);
079 StatusListener statusListener = new StatusListener(){
080 public void statusChanged(String text) {
081 // this is implemented in DocumentFormat.java and inherited here
082 fireStatusChanged(text);
083 }//statusChanged(String text)
084 };
085 // Register a status listener with it
086 emailDocHandler.addStatusListener(statusListener);
087 try{
088 // Call the method that creates annotations on the gate document
089 emailDocHandler.annotateMessages();
090 // Process the body annotations and search for paragraphs
091 AnnotationSet bodyAnnotations = doc.getAnnotations(
092 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
093 if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){
094 Iterator<Annotation> iter = bodyAnnotations.iterator();
095 while(iter.hasNext()){
096 Annotation a = iter.next();
097 annotateParagraphs(doc,a.getStartNode().getOffset().intValue(),
098 a.getEndNode().getOffset().intValue(),
099 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
100 }// End while
101 }// End if
102 } catch (IOException e){
103 throw new DocumentFormatException("Couldn't create a buffered reader ",e);
104 } catch (InvalidOffsetException e){
105 throw new DocumentFormatException(e);
106 }finally{
107 emailDocHandler.removeStatusListener(statusListener);
108 }// End try
109 }//unpackMarkup(doc)
110
111 /** Initialise this resource, and return it. */
112 public Resource init() throws ResourceInstantiationException{
113 // Register EMAIL mime type
114 MimeType mime = new MimeType("text","email");
115 // Register the class handler for this mime type
116 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
117 this);
118 // Register the mime type with mine string
119 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
120 // Register file sufixes for this mime type
121 suffixes2mimeTypeMap.put("eml",mime);
122 suffixes2mimeTypeMap.put("email",mime);
123 suffixes2mimeTypeMap.put("mail",mime);
124 // Register magic numbers for this mime type
125 magic2mimeTypeMap.put("Subject:",mime);
126 // Set the mimeType for this language resource
127 setMimeType(mime);
128 return this;
129 }// init()
130 }// class EmailDocumentFormat
|