001 /*
002 * Copyright (c) 1995-2011, The University of Sheffield. See the file
003 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004 *
005 * This file is part of GATE (see http://gate.ac.uk/), and is free
006 * software, licenced under the GNU Library General Public License,
007 * Version 2, June 1991 (in the distribution as file licence.html,
008 * and also available at http://gate.ac.uk/gate/licence.html).
009 *
010 * Valentin Tablan, 01 Feb 2000
011 *
012 * $Id: SentenceSplitter.java 13406 2011-02-05 18:53:16Z ian_roberts $
013 */
014
015 package gate.creole.splitter;
016
017 import gate.AnnotationSet;
018 import gate.Factory;
019 import gate.FeatureMap;
020 import gate.Gate;
021 import gate.Resource;
022 import gate.creole.AbstractLanguageAnalyser;
023 import gate.creole.ExecutionException;
024 import gate.creole.ExecutionInterruptedException;
025 import gate.creole.ResourceInstantiationException;
026 import gate.creole.Transducer;
027 import gate.creole.gazetteer.DefaultGazetteer;
028 import gate.event.ProgressListener;
029 import gate.event.StatusListener;
030 import gate.util.Benchmark;
031 import gate.util.Benchmarkable;
032 import gate.util.GateRuntimeException;
033 import gate.util.InvalidOffsetException;
034
035 /**
036 * A sentence splitter. This is module contains a tokeniser, a
037 * gazetteer and a Jape grammar. This class is used so we can have a different
038 * entry in the creole.xml file describing the default resources and to add
039 * some minor processing after running the components in order to extract the
040 * results in a usable form.
041 */
042 public class SentenceSplitter extends AbstractLanguageAnalyser implements Benchmarkable{
043
044 public static final String
045 SPLIT_DOCUMENT_PARAMETER_NAME = "document";
046
047 public static final String
048 SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
049
050 public static final String
051 SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
052
053 public static final String
054 SPLIT_ENCODING_PARAMETER_NAME = "encoding";
055
056 public static final String
057 SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
058
059 public static final String
060 SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
061
062
063 private String benchmarkId;
064
065 public Resource init()throws ResourceInstantiationException{
066 //create all the componets
067 FeatureMap params;
068 FeatureMap features;
069
070 params = Factory.newFeatureMap();
071 if(gazetteerListsURL != null)
072 params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
073 gazetteerListsURL);
074 params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
075
076 if (gazetteer == null) {
077 //gazetteer
078 fireStatusChanged("Creating the gazetteer");
079 features = Factory.newFeatureMap();
080 Gate.setHiddenAttribute(features, true);
081
082 gazetteer = (DefaultGazetteer)Factory.createResource(
083 "gate.creole.gazetteer.DefaultGazetteer",
084 params, features);
085 gazetteer.setName("Gazetteer " + System.currentTimeMillis());
086 }
087 else {
088 gazetteer.setParameterValues(params);
089 gazetteer.reInit();
090 }
091
092 fireProgressChanged(10);
093
094 params = Factory.newFeatureMap();
095 if(transducerURL != null)
096 params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
097 params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
098
099 if (transducer == null) {
100 //transducer
101 fireStatusChanged("Creating the JAPE transducer");
102 features = Factory.newFeatureMap();
103 Gate.setHiddenAttribute(features, true);
104
105 transducer = (Transducer)Factory.createResource(
106 "gate.creole.Transducer",
107 params, features);
108 transducer.setName("Transducer " + System.currentTimeMillis());
109 }
110 else {
111 transducer.setParameterValues(params);
112 transducer.reInit();
113 }
114
115 fireProgressChanged(100);
116 fireProcessFinished();
117
118 return this;
119 }
120
121 public void cleanup() {
122 Factory.deleteResource(gazetteer);
123 Factory.deleteResource(transducer);
124 }
125
126 public void execute() throws ExecutionException{
127 interrupted = false;
128 //set the runtime parameters
129 FeatureMap params;
130 if(inputASName != null && inputASName.equals("")) inputASName = null;
131 if(outputASName != null && outputASName.equals("")) outputASName = null;
132 try{
133 fireProgressChanged(0);
134 params = Factory.newFeatureMap();
135 params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
136 params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
137 gazetteer.setParameterValues(params);
138
139 params = Factory.newFeatureMap();
140 params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
141 params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
142 params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
143 transducer.setParameterValues(params);
144 }catch(Exception e){
145 throw new ExecutionException(e);
146 }
147 ProgressListener pListener = null;
148 StatusListener sListener = null;
149 fireProgressChanged(5);
150
151 //run the gazetteer
152 if(isInterrupted()) throw new ExecutionInterruptedException(
153 "The execution of the \"" + getName() +
154 "\" sentence splitter has been abruptly interrupted!");
155 pListener = new IntervalProgressListener(5, 10);
156 sListener = new StatusListener(){
157 public void statusChanged(String text){
158 fireStatusChanged(text);
159 }
160 };
161 gazetteer.addProgressListener(pListener);
162 gazetteer.addStatusListener(sListener);
163 gazetteer.execute();
164 gazetteer.removeProgressListener(pListener);
165 gazetteer.removeStatusListener(sListener);
166
167 //run the transducer
168 if(isInterrupted()) throw new ExecutionInterruptedException(
169 "The execution of the \"" + getName() +
170 "\" sentence splitter has been abruptly interrupted!");
171 pListener = new IntervalProgressListener(11, 90);
172 transducer.addProgressListener(pListener);
173 transducer.addStatusListener(sListener);
174 Benchmark.executeWithBenchmarking(transducer,
175 Benchmark.createBenchmarkId("SentenceSplitterTransducer",
176 getBenchmarkId()), this, null);
177 transducer.removeProgressListener(pListener);
178 transducer.removeStatusListener(sListener);
179
180 //get pointers to the annotation sets
181 AnnotationSet inputAS = (inputASName == null) ?
182 document.getAnnotations() :
183 document.getAnnotations(inputASName);
184
185 AnnotationSet outputAS = (outputASName == null) ?
186 document.getAnnotations() :
187 document.getAnnotations(outputASName);
188
189 //copy the results to the output set if they are different
190 if(inputAS != outputAS){
191 outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
192 }
193
194 //create one big sentence if none were found
195 AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
196 if(sentences == null || sentences.isEmpty()){
197 //create an annotation covering the entire content
198 try{
199 outputAS.add(new Long(0), document.getContent().size(),
200 SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
201 }catch(InvalidOffsetException ioe){
202 throw new GateRuntimeException(ioe);
203 }
204 }else{
205 //add a sentence covering all the tokens after the last sentence
206 Long endSentences = sentences.lastNode().getOffset();
207 AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
208 inputAS.lastNode().getOffset());
209 if(remainingTokens != null && !remainingTokens.isEmpty()){
210 try{
211 outputAS.add(remainingTokens.firstNode().getOffset(),
212 remainingTokens.lastNode().getOffset(),
213 SENTENCE_ANNOTATION_TYPE,
214 Factory.newFeatureMap());
215 }catch(InvalidOffsetException ioe){
216 throw new ExecutionException(ioe);
217 }
218 }
219 }
220 fireProcessFinished();
221 }//execute()
222
223 /**
224 * Notifies all the PRs in this controller that they should stop their
225 * execution as soon as possible.
226 */
227 public synchronized void interrupt(){
228 interrupted = true;
229 gazetteer.interrupt();
230 transducer.interrupt();
231 }
232
233 public void setTransducerURL(java.net.URL newTransducerURL) {
234 transducerURL = newTransducerURL;
235 }
236 public java.net.URL getTransducerURL() {
237 return transducerURL;
238 }
239 DefaultGazetteer gazetteer;
240 Transducer transducer;
241 private java.net.URL transducerURL;
242 private String encoding;
243 private java.net.URL gazetteerListsURL;
244
245
246 public void setEncoding(String newEncoding) {
247 encoding = newEncoding;
248 }
249 public String getEncoding() {
250 return encoding;
251 }
252 public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
253 gazetteerListsURL = newGazetteerListsURL;
254 }
255 public java.net.URL getGazetteerListsURL() {
256 return gazetteerListsURL;
257 }
258 public void setInputASName(String newInputASName) {
259 inputASName = newInputASName;
260 }
261
262 public String getInputASName() {
263 return inputASName;
264 }
265 public void setOutputASName(String newOutputASName) {
266 outputASName = newOutputASName;
267 }
268 public String getOutputASName() {
269 return outputASName;
270 }
271
272 /* (non-Javadoc)
273 * @see gate.util.Benchmarkable#getBenchmarkId()
274 */
275 public String getBenchmarkId() {
276 if(benchmarkId == null) {
277 return getName();
278 }
279 else {
280 return benchmarkId;
281 }
282 }
283
284 /* (non-Javadoc)
285 * @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
286 */
287 public void setBenchmarkId(String benchmarkId) {
288 this.benchmarkId = benchmarkId;
289 }
290
291
292
293 private static final boolean DEBUG = false;
294 private String inputASName;
295 private String outputASName;
296 }//public class SentenceSplitter extends Nerc
|