001 package gate.corpora;
002
003 import gate.Document;
004 import gate.DocumentFormat;
005 import gate.Resource;
006 import gate.corpora.DocumentImpl;
007 import gate.corpora.MimeType;
008 import gate.corpora.RepositioningInfo;
009 import gate.creole.ResourceInstantiationException;
010 import gate.creole.metadata.AutoInstance;
011 import gate.creole.metadata.CreoleResource;
012 import gate.event.StatusListener;
013 import gate.util.DocumentFormatException;
014 import gate.xml.XmlDocumentHandler;
015
016 import java.io.File;
017 import java.io.IOException;
018 import java.io.InputStream;
019 import java.net.URISyntaxException;
020 import java.util.Map;
021
022 import org.apache.commons.io.IOUtils;
023 import org.apache.log4j.Logger;
024 import org.apache.tika.config.TikaConfig;
025 import org.apache.tika.exception.TikaException;
026 import org.apache.tika.metadata.Metadata;
027 import org.apache.tika.parser.AutoDetectParser;
028 import org.apache.tika.parser.CompositeParser;
029 import org.apache.tika.parser.ParseContext;
030 import org.apache.tika.parser.Parser;
031 import org.xml.sax.SAXException;
032
033 @CreoleResource(name = "Apache Tika Document Format", isPrivate = true, autoinstances = {@AutoInstance(hidden = true)})
034 public class TikaFormat extends DocumentFormat {
035
036 private static final long serialVersionUID = 1L;
037
038 private static final Logger log = Logger.getLogger(TikaFormat.class);
039
040 @Override
041 public Resource init() throws ResourceInstantiationException {
042 super.init();
043 setMimeType(new MimeType("application","tika"));
044 assignMime(getMimeType());
045 assignMime(new MimeType("application","pdf"), "pdf");
046 assignMime(new MimeType("application","msword"), "doc");
047 assignMime(new MimeType("application","vnd.ms-powerpoint"), "ppt");
048 assignMime(new MimeType("application","vnd.ms-excel"), "xls");
049 assignMime(new MimeType("application","vnd.openxmlformats-officedocument.wordprocessingml.document"), "docx");
050 assignMime(new MimeType("application","vnd.openxmlformats-officedocument.presentationml.presentation"), "pptx");
051 assignMime(new MimeType("application","vnd.openxmlformats-officedocument.spreadsheetml.sheet"), "xlsx");
052 assignMime(new MimeType("application", "vnd.oasis.opendocument.text"), "odt");
053 assignMime(new MimeType("application", "vnd.oasis.opendocument.presentation"), "odp");
054 assignMime(new MimeType("application", "vnd.oasis.opendocument.spreadsheet"), "ods");
055 assignMime(new MimeType("application", "rtf"), "rtf");
056
057 //There are bugs in Tika related to ePub as of 0.7
058 //assignMime(new MimeType("application", "epub+zip"), "epub");
059 return this;
060 }
061
062 private void assignMime(MimeType mime, String... exts) {
063 String mimeString = mime.getType()+ "/" + mime.getSubtype();
064 mimeString2ClassHandlerMap.put(mimeString, this);
065 mimeString2mimeTypeMap.put(mimeString, mime);
066 for (String ext : exts)
067 suffixes2mimeTypeMap.put(ext,mime);
068 }
069
070 @Override
071 public Boolean supportsRepositioning() {
072 return true;
073 }
074
075 @Override
076 public void unpackMarkup(Document doc) throws DocumentFormatException {
077 unpackMarkup(doc, null, null);
078
079 }
080
081 @Override
082 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
083 RepositioningInfo ampCodingInfo) throws DocumentFormatException {
084 if(doc == null || doc.getSourceUrl() == null) {
085
086 throw new DocumentFormatException(
087 "GATE document is null or no content found. Nothing to parse!");
088 }// End if
089
090 // Create a status listener
091 StatusListener statusListener = new StatusListener() {
092 public void statusChanged(String text) {
093 // This is implemented in DocumentFormat.java and inherited here
094 fireStatusChanged(text);
095 }
096 };
097
098 Parser tikaParser = createParser();
099 XmlDocumentHandler ch = new XmlDocumentHandler(doc, this.markupElementsMap,
100 this.element2StringMap);
101 Metadata metadata = extractParserTips(doc);
102
103 ch.addStatusListener(statusListener);
104 ch.setRepositioningInfo(repInfo);
105 // set the object with ampersand coding positions
106 ch.setAmpCodingInfo(ampCodingInfo);
107 InputStream input = null;
108 try {
109 input = doc.getSourceUrl().openStream();
110 tikaParser.parse(input, ch, metadata, new ParseContext());
111 setDocumentFeatures(metadata, doc);
112 } catch (IOException e) {
113 throw new DocumentFormatException(e);
114 } catch (SAXException e) {
115 throw new DocumentFormatException(e);
116 } catch (TikaException e) {
117 throw new DocumentFormatException(e);
118 }
119 finally {
120 IOUtils.closeQuietly(input); // null safe
121 ch.removeStatusListener(statusListener);
122 }
123
124 if (doc instanceof DocumentImpl) {
125 ((DocumentImpl)doc).setNextAnnotationId(ch.getCustomObjectsId());
126 }
127 }
128
129 private Parser createParser() {
130 Parser fallback = new AutoDetectParser();
131 TikaConfig conf = TikaConfig.getDefaultConfig();
132
133 // This composite parser will always honor the input mime type
134 // if the mimetype isn't recognized it will use the auto detect parser
135 CompositeParser p = new CompositeParser();
136 p.setFallback(fallback);
137 p.setParsers(conf.getParsers());
138 return p;
139 }
140
141 private void setDocumentFeatures(Metadata metadata, Document doc) {
142 Map fmap = doc.getFeatures();
143 setTikaFeature(metadata, Metadata.TITLE, fmap);
144 setTikaFeature(metadata, Metadata.AUTHOR, fmap);
145 setTikaFeature(metadata, Metadata.COMMENTS, fmap);
146 setTikaFeature(metadata, Metadata.CREATOR, fmap);
147 if (fmap.get("AUTHORS") == null && fmap.get("AUTHOR") != null)
148 fmap.put("AUTHORS", fmap.get(Metadata.AUTHOR));
149 fmap.put("MimeType", metadata.get(Metadata.CONTENT_TYPE));
150 }
151
152 private void setTikaFeature(Metadata metadata, String key, Map fmap) {
153 String value = metadata.get(key);
154 if (value == null) {
155 return;
156 }
157
158 value = value.trim();
159 if (value.length() == 0) {
160 return;
161 }
162 key = key.toUpperCase();
163 if (fmap.containsKey(key)) {
164 fmap.put("TIKA_" + key, value);
165 }
166 else {
167 fmap.put(key, value);
168 fmap.put("TIKA_" + key, value);
169 }
170 }
171
172 /**
173 * Tries to extract tips for the parser as specified here -
174 * http://tika.apache.org/0.7/parser.html . The tips are not critical
175 * for successful parsing.
176 *
177 * @param doc
178 * @return metadata, not null but may be empty
179 */
180 private Metadata extractParserTips(Document doc) {
181 Metadata metadata = new Metadata();
182 Object inputMime = doc.getFeatures().get("MimeType");
183 if (inputMime instanceof String) {
184 if (!"application/tika".equals(inputMime)) {
185 metadata.add(Metadata.CONTENT_TYPE, (String) doc.getFeatures().get("MimeType"));
186 }
187 }
188 if (doc instanceof DocumentImpl) {
189 if (((DocumentImpl)doc).getMimeType() != null) {
190 metadata.add(Metadata.CONTENT_TYPE, ((DocumentImpl)doc).getMimeType());
191 }
192 }
193 if (doc.getSourceUrl() != null && doc.getSourceUrl().getProtocol().startsWith("file")) {
194 try {
195 File fn =new File(doc.getSourceUrl().toURI());
196 metadata.add(Metadata.RESOURCE_NAME_KEY, fn.getName());
197 } catch (URISyntaxException e) {
198 log.debug("Could not extract filename from uri: " + doc.getSourceUrl(), e);
199 } catch (IllegalArgumentException e) {
200 log.debug("Could not extract filename from uri: " + doc.getSourceUrl(), e);
201 }
202 }
203 return metadata;
204 }
205 }
|