WebAnnie.java
001 package gate.util.web;
002 
003 import java.io.IOException;
004 import java.net.MalformedURLException;
005 import java.net.URL;
006 import java.util.HashSet;
007 import java.util.Set;
008 
009 import javax.servlet.ServletContext;
010 
011 import gate.*;
012 import gate.creole.SerialAnalyserController;
013 import gate.util.GateException;
014 
015 /**
016  * This class is designed to demonstrate ANNIE in a web context. It should be
017  * called from either a servlet or a JSP.
018  */
019 public class WebAnnie  {
020     
021     public static final String GATE_INIT_KEY = "gate.init";
022     public static final String ANNIE_CONTROLLER_KEY = "annie.controller";
023 
024     /** The Corpus Pipeline application to contain ANNIE */
025     private SerialAnalyserController annieController;
026     
027     private String filePath = "";
028 
029     /**
030      * Initialise the ANNIE system. This creates a "corpus pipeline"
031      * application that can be used to run sets of documents through
032      * the extraction system.
033      */
034     private void initAnnie() throws GateException {
035         
036         // create a serial analyser controller to run ANNIE with
037         annieController = (SerialAnalyserController)
038             Factory.createResource("gate.creole.SerialAnalyserController",
039                                    Factory.newFeatureMap(),
040                                    Factory.newFeatureMap(),
041                                    "ANNIE_" + Gate.genSym()
042                                    );
043         
044         // Load tokenizer
045         ProcessingResource tokeniser = (ProcessingResource)
046             Factory.createResource("gate.creole.tokeniser.DefaultTokeniser",
047                                    Factory.newFeatureMap());
048         
049         annieController.add(tokeniser);
050         
051         // Load sentence splitter
052         ProcessingResource split = (ProcessingResource)
053             Factory.createResource("gate.creole.splitter.SentenceSplitter",
054                                    Factory.newFeatureMap());
055         
056         annieController.add(split);
057         
058         // Load POS tagger
059         ProcessingResource postagger = (ProcessingResource)
060             Factory.createResource("gate.creole.POSTagger",
061                                    Factory.newFeatureMap());
062         
063         annieController.add(postagger);
064 
065 
066         // Load Gazetteer -- this is a two step process
067         FeatureMap gazetteerFeatures = Factory.newFeatureMap();
068         gazetteerFeatures.put("encoding","ISO-8859-1");
069 
070         // Step one: Locate the gazetteer file
071         try {
072             URL gazetteerURL =
073                 new URL("jar:file:" + filePath +
074                         "muse.jar!/muse/resources/gazetteer/lists.def");
075             gazetteerFeatures.put("listsURL", gazetteerURL);
076         catch(MalformedURLException e) {
077             e.printStackTrace();
078         }
079         
080         // Step two: Load the gazetteer from the file
081         ProcessingResource gazetteer = (ProcessingResource)
082             Factory.createResource("gate.creole.gazetteer.DefaultGazetteer",
083                                    gazetteerFeatures);
084         
085         annieController.add(gazetteer);        
086 
087         // Load Grammar -- similar to gazetteer
088         FeatureMap grammarFeatures = Factory.newFeatureMap();
089         
090         try {
091             URL grammarURL =
092                 new URL("jar:file:" + filePath +
093                         "muse.jar!/muse/resources/grammar/main/main.jape");
094             grammarFeatures.put("grammarURL", grammarURL);
095         catch(MalformedURLException e) {
096             e.printStackTrace();
097         }
098         
099         ProcessingResource grammar = (ProcessingResource)
100             Factory.createResource("gate.creole.ANNIETransducer",
101                                    grammarFeatures);
102         
103         annieController.add(grammar);
104 
105         // Load Ortho Matcher
106         ProcessingResource orthoMatcher = (ProcessingResource)
107             Factory.createResource("gate.creole.orthomatcher.OrthoMatcher",
108                                    Factory.newFeatureMap());
109         
110         annieController.add(orthoMatcher);
111 
112     // initAnnie()
113     
114     /**
115      * This method should be called from a servlet or JSP.
116      @param app The current servlet context, eg the JSP implicit variable "application"
117      @param url The url of the file to be analysed
118      @param annotations An array of annotations
119      */
120     public String process(ServletContext app, String url, String[] annotations)
121         throws GateException, IOException {
122 
123         if (app.getAttribute(GATE_INIT_KEY== null) {
124             Gate.setLocalWebServer(false);
125             Gate.setNetConnected(false);
126 
127             System.setProperty("java.protocol.handler.pkgs",
128                                "gate.util.protocols");
129             
130             // Do the deed
131             Gate.init();
132 
133             app.setAttribute(GATE_INIT_KEY, "true");
134         }
135 
136         if (app.getAttribute(ANNIE_CONTROLLER_KEY== null) {
137             // initialise ANNIE (this may take several minutes)
138 
139             filePath = app.getInitParameter("muse.path");
140             this.initAnnie();
141 
142             app.setAttribute(ANNIE_CONTROLLER_KEY, annieController);
143         }
144         else {
145             annieController = (SerialAnalyserController
146                 app.getAttribute(ANNIE_CONTROLLER_KEY);
147         }
148 
149         
150         // create a GATE corpus and add a document from the URL specified
151         Corpus corpus =
152             (CorpusFactory.createResource("gate.corpora.CorpusImpl");
153         URL u = new URL(url);
154         FeatureMap params = Factory.newFeatureMap();
155         params.put("sourceUrl", u);
156 
157         Document doc = (Document)
158             Factory.createResource("gate.corpora.DocumentImpl", params);
159         corpus.add(doc);
160             
161         
162         // tell the pipeline about the corpus and run it
163         annieController.setCorpus(corpus);
164         annieController.execute();
165         
166         // Get XML marked up document
167         AnnotationSet defaultAnnotSet = doc.getAnnotations();
168         Set annotTypesRequired = new HashSet();
169 
170         String output = null;
171         if (annotations != null) {
172             for (int i=0;i<annotations.length;i++) {
173                 annotTypesRequired.add(annotations[i]);
174             }
175             AnnotationSet selectedAnnotations =
176                 defaultAnnotSet.get(annotTypesRequired);
177             output = doc.toXml(selectedAnnotations, true);
178         }
179         else {
180             output = doc.toXml();
181         }
182         //delete the used resources
183         Factory.deleteResource(doc);
184         Factory.deleteResource(corpus);
185         return output;
186     // process
187     
188 // class WebAnnie