SentenceSplitter.java
001 /*
002  *  Copyright (c) 1995-2011, The University of Sheffield. See the file
003  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004  *
005  *  This file is part of GATE (see http://gate.ac.uk/), and is free
006  *  software, licenced under the GNU Library General Public License,
007  *  Version 2, June 1991 (in the distribution as file licence.html,
008  *  and also available at http://gate.ac.uk/gate/licence.html).
009  *
010  *  Valentin Tablan, 01 Feb 2000
011  *
012  *  $Id: SentenceSplitter.java 13406 2011-02-05 18:53:16Z ian_roberts $
013  */
014 
015 package gate.creole.splitter;
016 
017 import gate.AnnotationSet;
018 import gate.Factory;
019 import gate.FeatureMap;
020 import gate.Gate;
021 import gate.Resource;
022 import gate.creole.AbstractLanguageAnalyser;
023 import gate.creole.ExecutionException;
024 import gate.creole.ExecutionInterruptedException;
025 import gate.creole.ResourceInstantiationException;
026 import gate.creole.Transducer;
027 import gate.creole.gazetteer.DefaultGazetteer;
028 import gate.event.ProgressListener;
029 import gate.event.StatusListener;
030 import gate.util.Benchmark;
031 import gate.util.Benchmarkable;
032 import gate.util.GateRuntimeException;
033 import gate.util.InvalidOffsetException;
034 
035 /**
036  * A sentence splitter. This is module contains a tokeniser, a
037  * gazetteer and a Jape grammar. This class is used so we can have a different
038  * entry in the creole.xml file describing the default resources and to add
039  * some minor processing after running the components in order to extract the
040  * results in a usable form.
041  */
042 public class SentenceSplitter extends AbstractLanguageAnalyser implements Benchmarkable{
043 
044   public static final String
045     SPLIT_DOCUMENT_PARAMETER_NAME = "document";
046 
047   public static final String
048     SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
049 
050   public static final String
051     SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
052 
053   public static final String
054     SPLIT_ENCODING_PARAMETER_NAME = "encoding";
055 
056   public static final String
057     SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
058 
059   public static final String
060     SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
061   
062   
063   private String benchmarkId;
064 
065   public Resource init()throws ResourceInstantiationException{
066     //create all the componets
067     FeatureMap params;
068     FeatureMap features;
069 
070     params = Factory.newFeatureMap();
071     if(gazetteerListsURL != null)
072       params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
073               gazetteerListsURL);
074     params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
075 
076     if (gazetteer == null) {
077       //gazetteer
078       fireStatusChanged("Creating the gazetteer");
079       features = Factory.newFeatureMap();
080       Gate.setHiddenAttribute(features, true);
081 
082       gazetteer = (DefaultGazetteer)Factory.createResource(
083               "gate.creole.gazetteer.DefaultGazetteer",
084               params, features);
085       gazetteer.setName("Gazetteer " + System.currentTimeMillis());
086     }
087     else {
088       gazetteer.setParameterValues(params);
089       gazetteer.reInit();
090     }
091     
092     fireProgressChanged(10);
093 
094     params = Factory.newFeatureMap();
095     if(transducerURL != null)
096       params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
097     params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
098 
099     if (transducer == null) {
100       //transducer
101       fireStatusChanged("Creating the JAPE transducer");
102       features = Factory.newFeatureMap();
103       Gate.setHiddenAttribute(features, true);
104 
105       transducer = (Transducer)Factory.createResource(
106               "gate.creole.Transducer",
107               params, features);
108       transducer.setName("Transducer " + System.currentTimeMillis());
109     }
110     else {
111       transducer.setParameterValues(params);
112       transducer.reInit();
113     }
114     
115     fireProgressChanged(100);
116     fireProcessFinished();
117 
118     return this;
119   }
120   
121   public void cleanup() {
122     Factory.deleteResource(gazetteer);
123     Factory.deleteResource(transducer);
124   }
125 
126   public void execute() throws ExecutionException{
127     interrupted = false;
128     //set the runtime parameters
129     FeatureMap params;
130     if(inputASName != null && inputASName.equals("")) inputASName = null;
131     if(outputASName != null && outputASName.equals("")) outputASName = null;
132     try{
133       fireProgressChanged(0);
134       params = Factory.newFeatureMap();
135       params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
136       params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
137       gazetteer.setParameterValues(params);
138 
139       params = Factory.newFeatureMap();
140       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
141       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
142       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
143       transducer.setParameterValues(params);
144     }catch(Exception e){
145       throw new ExecutionException(e);
146     }
147     ProgressListener pListener = null;
148     StatusListener sListener = null;
149     fireProgressChanged(5);
150 
151     //run the gazetteer
152     if(isInterrupted()) throw new ExecutionInterruptedException(
153         "The execution of the \"" + getName() +
154         "\" sentence splitter has been abruptly interrupted!");
155     pListener = new IntervalProgressListener(510);
156     sListener = new StatusListener(){
157       public void statusChanged(String text){
158         fireStatusChanged(text);
159       }
160     };
161     gazetteer.addProgressListener(pListener);
162     gazetteer.addStatusListener(sListener);
163     gazetteer.execute();
164     gazetteer.removeProgressListener(pListener);
165     gazetteer.removeStatusListener(sListener);
166 
167     //run the transducer
168     if(isInterrupted()) throw new ExecutionInterruptedException(
169         "The execution of the \"" + getName() +
170         "\" sentence splitter has been abruptly interrupted!");
171     pListener = new IntervalProgressListener(1190);
172     transducer.addProgressListener(pListener);
173     transducer.addStatusListener(sListener);
174     Benchmark.executeWithBenchmarking(transducer,
175             Benchmark.createBenchmarkId("SentenceSplitterTransducer",
176                     getBenchmarkId()), this, null);
177     transducer.removeProgressListener(pListener);
178     transducer.removeStatusListener(sListener);
179 
180     //get pointers to the annotation sets
181     AnnotationSet inputAS = (inputASName == null?
182                             document.getAnnotations() :
183                             document.getAnnotations(inputASName);
184 
185     AnnotationSet outputAS = (outputASName == null?
186                              document.getAnnotations() :
187                              document.getAnnotations(outputASName);
188 
189     //copy the results to the output set if they are different
190     if(inputAS != outputAS){
191       outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
192     }
193 
194     //create one big sentence if none were found
195     AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
196     if(sentences == null || sentences.isEmpty()){
197       //create an annotation covering the entire content
198       try{
199         outputAS.add(new Long(0), document.getContent().size()
200                 SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
201       }catch(InvalidOffsetException ioe){
202         throw new GateRuntimeException(ioe);
203       }
204     }else{
205       //add a sentence covering all the tokens after the last sentence
206       Long endSentences = sentences.lastNode().getOffset();
207       AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
208                                                   inputAS.lastNode().getOffset());
209       if(remainingTokens != null && !remainingTokens.isEmpty()){
210         try{
211           outputAS.add(remainingTokens.firstNode().getOffset(),
212                        remainingTokens.lastNode().getOffset(),
213                        SENTENCE_ANNOTATION_TYPE,
214                        Factory.newFeatureMap());
215         }catch(InvalidOffsetException ioe){
216           throw new ExecutionException(ioe);
217         }
218       }
219     }
220     fireProcessFinished();
221   }//execute()
222 
223   /**
224    * Notifies all the PRs in this controller that they should stop their
225    * execution as soon as possible.
226    */
227   public synchronized void interrupt(){
228     interrupted = true;
229     gazetteer.interrupt();
230     transducer.interrupt();
231   }
232 
233   public void setTransducerURL(java.net.URL newTransducerURL) {
234     transducerURL = newTransducerURL;
235   }
236   public java.net.URL getTransducerURL() {
237     return transducerURL;
238   }
239   DefaultGazetteer gazetteer;
240   Transducer transducer;
241   private java.net.URL transducerURL;
242   private String encoding;
243   private java.net.URL gazetteerListsURL;
244 
245 
246   public void setEncoding(String newEncoding) {
247     encoding = newEncoding;
248   }
249   public String getEncoding() {
250     return encoding;
251   }
252   public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
253     gazetteerListsURL = newGazetteerListsURL;
254   }
255   public java.net.URL getGazetteerListsURL() {
256     return gazetteerListsURL;
257   }
258   public void setInputASName(String newInputASName) {
259     inputASName = newInputASName;
260   }
261 
262   public String getInputASName() {
263     return inputASName;
264   }
265   public void setOutputASName(String newOutputASName) {
266     outputASName = newOutputASName;
267   }
268   public String getOutputASName() {
269     return outputASName;
270   }
271   
272   /* (non-Javadoc)
273    * @see gate.util.Benchmarkable#getBenchmarkId()
274    */
275   public String getBenchmarkId() {
276     if(benchmarkId == null) {
277       return getName();
278     }
279     else {
280       return benchmarkId;
281     }
282   }
283 
284   /* (non-Javadoc)
285    * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
286    */
287   public void setBenchmarkId(String benchmarkId) {
288     this.benchmarkId = benchmarkId;
289   }
290 
291 
292 
293   private static final boolean DEBUG = false;
294   private String inputASName;
295   private String outputASName;
296 }//public class SentenceSplitter extends Nerc