001 package gate.util.web;
002
003 import java.io.IOException;
004 import java.net.MalformedURLException;
005 import java.net.URL;
006 import java.util.HashSet;
007 import java.util.Set;
008
009 import javax.servlet.ServletContext;
010
011 import gate.*;
012 import gate.creole.SerialAnalyserController;
013 import gate.util.GateException;
014
015 /**
016 * This class is designed to demonstrate ANNIE in a web context. It should be
017 * called from either a servlet or a JSP.
018 */
019 public class WebAnnie {
020
021 public static final String GATE_INIT_KEY = "gate.init";
022 public static final String ANNIE_CONTROLLER_KEY = "annie.controller";
023
024 /** The Corpus Pipeline application to contain ANNIE */
025 private SerialAnalyserController annieController;
026
027 private String filePath = "";
028
029 /**
030 * Initialise the ANNIE system. This creates a "corpus pipeline"
031 * application that can be used to run sets of documents through
032 * the extraction system.
033 */
034 private void initAnnie() throws GateException {
035
036 // create a serial analyser controller to run ANNIE with
037 annieController = (SerialAnalyserController)
038 Factory.createResource("gate.creole.SerialAnalyserController",
039 Factory.newFeatureMap(),
040 Factory.newFeatureMap(),
041 "ANNIE_" + Gate.genSym()
042 );
043
044 // Load tokenizer
045 ProcessingResource tokeniser = (ProcessingResource)
046 Factory.createResource("gate.creole.tokeniser.DefaultTokeniser",
047 Factory.newFeatureMap());
048
049 annieController.add(tokeniser);
050
051 // Load sentence splitter
052 ProcessingResource split = (ProcessingResource)
053 Factory.createResource("gate.creole.splitter.SentenceSplitter",
054 Factory.newFeatureMap());
055
056 annieController.add(split);
057
058 // Load POS tagger
059 ProcessingResource postagger = (ProcessingResource)
060 Factory.createResource("gate.creole.POSTagger",
061 Factory.newFeatureMap());
062
063 annieController.add(postagger);
064
065
066 // Load Gazetteer -- this is a two step process
067 FeatureMap gazetteerFeatures = Factory.newFeatureMap();
068 gazetteerFeatures.put("encoding","ISO-8859-1");
069
070 // Step one: Locate the gazetteer file
071 try {
072 URL gazetteerURL =
073 new URL("jar:file:" + filePath +
074 "muse.jar!/muse/resources/gazetteer/lists.def");
075 gazetteerFeatures.put("listsURL", gazetteerURL);
076 } catch(MalformedURLException e) {
077 e.printStackTrace();
078 }
079
080 // Step two: Load the gazetteer from the file
081 ProcessingResource gazetteer = (ProcessingResource)
082 Factory.createResource("gate.creole.gazetteer.DefaultGazetteer",
083 gazetteerFeatures);
084
085 annieController.add(gazetteer);
086
087 // Load Grammar -- similar to gazetteer
088 FeatureMap grammarFeatures = Factory.newFeatureMap();
089
090 try {
091 URL grammarURL =
092 new URL("jar:file:" + filePath +
093 "muse.jar!/muse/resources/grammar/main/main.jape");
094 grammarFeatures.put("grammarURL", grammarURL);
095 } catch(MalformedURLException e) {
096 e.printStackTrace();
097 }
098
099 ProcessingResource grammar = (ProcessingResource)
100 Factory.createResource("gate.creole.ANNIETransducer",
101 grammarFeatures);
102
103 annieController.add(grammar);
104
105 // Load Ortho Matcher
106 ProcessingResource orthoMatcher = (ProcessingResource)
107 Factory.createResource("gate.creole.orthomatcher.OrthoMatcher",
108 Factory.newFeatureMap());
109
110 annieController.add(orthoMatcher);
111
112 } // initAnnie()
113
114 /**
115 * This method should be called from a servlet or JSP.
116 * @param app The current servlet context, eg the JSP implicit variable "application"
117 * @param url The url of the file to be analysed
118 * @param annotations An array of annotations
119 */
120 public String process(ServletContext app, String url, String[] annotations)
121 throws GateException, IOException {
122
123 if (app.getAttribute(GATE_INIT_KEY) == null) {
124 Gate.setLocalWebServer(false);
125 Gate.setNetConnected(false);
126
127 System.setProperty("java.protocol.handler.pkgs",
128 "gate.util.protocols");
129
130 // Do the deed
131 Gate.init();
132
133 app.setAttribute(GATE_INIT_KEY, "true");
134 }
135
136 if (app.getAttribute(ANNIE_CONTROLLER_KEY) == null) {
137 // initialise ANNIE (this may take several minutes)
138
139 filePath = app.getInitParameter("muse.path");
140 this.initAnnie();
141
142 app.setAttribute(ANNIE_CONTROLLER_KEY, annieController);
143 }
144 else {
145 annieController = (SerialAnalyserController)
146 app.getAttribute(ANNIE_CONTROLLER_KEY);
147 }
148
149
150 // create a GATE corpus and add a document from the URL specified
151 Corpus corpus =
152 (Corpus) Factory.createResource("gate.corpora.CorpusImpl");
153 URL u = new URL(url);
154 FeatureMap params = Factory.newFeatureMap();
155 params.put("sourceUrl", u);
156
157 Document doc = (Document)
158 Factory.createResource("gate.corpora.DocumentImpl", params);
159 corpus.add(doc);
160
161
162 // tell the pipeline about the corpus and run it
163 annieController.setCorpus(corpus);
164 annieController.execute();
165
166 // Get XML marked up document
167 AnnotationSet defaultAnnotSet = doc.getAnnotations();
168 Set annotTypesRequired = new HashSet();
169
170 String output = null;
171 if (annotations != null) {
172 for (int i=0;i<annotations.length;i++) {
173 annotTypesRequired.add(annotations[i]);
174 }
175 AnnotationSet selectedAnnotations =
176 defaultAnnotSet.get(annotTypesRequired);
177 output = doc.toXml(selectedAnnotations, true);
178 }
179 else {
180 output = doc.toXml();
181 }
182 //delete the used resources
183 Factory.deleteResource(doc);
184 Factory.deleteResource(corpus);
185 return output;
186 } // process
187
188 } // class WebAnnie
|