001 /*
002 * Copyright (c) 1995-2010, The University of Sheffield. See the file
003 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004 *
005 * This file is part of GATE (see http://gate.ac.uk/), and is free
006 * software, licenced under the GNU Library General Public License,
007 * Version 2, June 1991 (in the distribution as file licence.html,
008 * and also available at http://gate.ac.uk/gate/licence.html).
009 *
010 * Valentin Tablan, 04 Sep 2007
011 *
012 * $Id: RegexSentenceSplitter.java 12919 2010-08-03 10:31:37Z valyt $
013 */
014 package gate.creole.splitter;
015
016 import java.io.*;
017 import java.net.MalformedURLException;
018 import java.net.URL;
019 import java.util.*;
020 import java.util.regex.*;
021
022 import gate.*;
023 import gate.creole.*;
024 import gate.util.*;
025
026 /**
027 * A fast sentence splitter replacement based on regular expressions.
028 */
029 public class RegexSentenceSplitter extends AbstractLanguageAnalyser {
030
031 /**
032 * Parameter name
033 */
034 public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";
035
036 /**
037 * Parameter name
038 */
039 public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
040
041 /**
042 * Parameter name
043 */
044 public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
045
046 /**
047 * Parameter name
048 */
049 public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";
050
051 /**
052 * Parameter name
053 */
054 public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";
055
056
057 /**
058 * Parameter name
059 */
060 public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";
061
062 /**
063 * serialisation ID
064 */
065 private static final long serialVersionUID = 1L;
066
067 /**
068 * The document to be processed
069 */
070 protected Document document;
071
072 /**
073 * Output annotation set name.
074 */
075 protected String outputASName;
076
077 /**
078 * Encoding used when reading config files
079 */
080 protected String encoding;
081
082 /**
083 * URL pointing to a file with regex patterns for internal sentence splits.
084 */
085 protected URL internalSplitListURL;
086
087 /**
088 * URL pointing to a file with regex patterns for external sentence splits.
089 */
090 protected URL externalSplitListURL;
091
092 /**
093 * URL pointing to a file with regex patterns for non sentence splits.
094 */
095 protected URL nonSplitListURL;
096
097
098 protected Pattern internalSplitsPattern;
099
100 protected Pattern externalSplitsPattern;
101
102 protected Pattern nonSplitsPattern;
103
104 protected Pattern compilePattern(URL paternsListUrl, String encoding)
105 throws UnsupportedEncodingException, IOException{
106 BufferedReader reader = new BomStrippingInputStreamReader(paternsListUrl.openStream(), encoding);
107 StringBuffer patternString = new StringBuffer();
108
109 String line = reader.readLine();
110 while(line != null){
111 line = line.trim();
112
113 if(line.length() == 0 || line.startsWith("//")){
114 //ignore empty lines and comments
115 }else{
116 if(patternString.length() > 0) patternString.append("|");
117 patternString.append("(?:" + line + ")");
118 }
119 //move to next line
120 line = reader.readLine();
121 }
122 return Pattern.compile(patternString.toString());
123 }
124
125
126 // protected enum StartEnd {START, END};
127
128 /**
129 * A comparator for MatchResult objects. This is used to find the next match
130 * result in a text. A null value is used to signify that no more matches are
131 * available, hence nulls are the largest value, according to this comparator.
132 * @author Valentin Tablan (valyt)
133 */
134 private class MatchResultComparator implements Comparator<MatchResult>{
135
136 /* (non-Javadoc)
137 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
138 */
139 public int compare(MatchResult o1, MatchResult o2) {
140 if(o1 == null && o2 == null) return 0;
141 if(o1 == null) return 1;
142 if(o2 == null) return -1;
143 //at this point both match results are not null
144 return o1.start() - o2.start();
145 }
146 }
147
148 @Override
149 public void execute() throws ExecutionException {
150 interrupted = false;
151 int lastProgress = 0;
152 fireProgressChanged(lastProgress);
153 //get pointers to the annotation sets
154 AnnotationSet outputAS = (outputASName == null ||
155 outputASName.trim().length() == 0) ?
156 document.getAnnotations() :
157 document.getAnnotations(outputASName);
158
159 String docText = document.getContent().toString();
160
161 /* If the document's content is empty or contains only whitespace,
162 * we drop out right here, since there's nothing to sentence-split. */
163 if (docText.trim().length() < 1) {
164 return;
165 }
166
167 Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText);
168 Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText);
169
170 Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText);
171 //store all non split locations in a list of pairs
172 List<int[]> nonSplits = new LinkedList<int[]>();
173 while(nonSplitMatcher.find()){
174 nonSplits.add(new int[]{nonSplitMatcher.start(), nonSplitMatcher.end()});
175 }
176 //this lists holds the next matches at each step
177 List<MatchResult> nextSplitMatches = new ArrayList<MatchResult>();
178 //initialise matching process
179 MatchResult internalMatchResult = null;
180 if(internalSplitMatcher.find()){
181 internalMatchResult = internalSplitMatcher.toMatchResult();
182 nextSplitMatches.add(internalMatchResult);
183 }
184 MatchResult externalMatchResult = null;
185 if(externalSplitMatcher.find()){
186 externalMatchResult = externalSplitMatcher.toMatchResult();
187 nextSplitMatches.add(externalMatchResult);
188 }
189 MatchResultComparator comparator = new MatchResultComparator();
190 int lastSentenceEnd = 0;
191
192 while(!nextSplitMatches.isEmpty()){
193 //see which one matches first
194 Collections.sort(nextSplitMatches, comparator);
195 MatchResult nextMatch = nextSplitMatches.remove(0);
196 if(nextMatch == internalMatchResult){
197 //we have a new internal split; see if it's vetoed or not
198 if(!veto(nextMatch, nonSplits)){
199 //split is not vetoed
200 try {
201 //add the split annotation
202 FeatureMap features = Factory.newFeatureMap();
203 features.put("kind", "internal");
204 outputAS.add(new Long(nextMatch.start()), new Long(nextMatch.end()),
205 "Split", features);
206 //generate the sentence annotation
207 int endOffset = nextMatch.end();
208 //find the first non whitespace character starting from where the
209 //last sentence ended
210 while(lastSentenceEnd < endOffset &&
211 Character.isWhitespace(
212 Character.codePointAt(docText, lastSentenceEnd))){
213 lastSentenceEnd++;
214 }
215 //if there is any useful text between the two offsets, generate
216 //a new sentence
217 if(lastSentenceEnd < nextMatch.start()){
218 outputAS.add(new Long(lastSentenceEnd), new Long(endOffset),
219 ANNIEConstants.SENTENCE_ANNOTATION_TYPE,
220 Factory.newFeatureMap());
221 }
222 //store the new sentence end
223 lastSentenceEnd = endOffset;
224 } catch(InvalidOffsetException e) {
225 // this should never happen
226 throw new ExecutionException(e);
227 }
228 }
229 //prepare for next step
230 if(internalSplitMatcher.find()){
231 internalMatchResult = internalSplitMatcher.toMatchResult();
232 nextSplitMatches.add(internalMatchResult);
233 }else{
234 internalMatchResult = null;
235 }
236 }else if(nextMatch == externalMatchResult){
237 //we have a new external split; see if it's vetoed or not
238 if(!veto(nextMatch, nonSplits)){
239 //split is not vetoed
240 try {
241 //generate the split
242 FeatureMap features = Factory.newFeatureMap();
243 features.put("kind", "external");
244 outputAS.add(new Long(nextMatch.start()), new Long(nextMatch.end()),
245 "Split", features);
246 //generate the sentence annotation
247 //find the last non whitespace character, going backward from
248 //where the external skip starts
249 int endOffset = nextMatch.start();
250 while(endOffset > lastSentenceEnd &&
251 Character.isSpaceChar(
252 Character.codePointAt(docText, endOffset -1))){
253 endOffset--;
254 }
255 //find the first non whitespace character starting from where the
256 //last sentence ended
257 while(lastSentenceEnd < endOffset &&
258 Character.isSpaceChar(
259 Character.codePointAt(docText, lastSentenceEnd))){
260 lastSentenceEnd++;
261 }
262 //if there is any useful text between the two offsets, generate
263 //a new sentence
264 if(lastSentenceEnd < endOffset){
265 outputAS.add(new Long(lastSentenceEnd), new Long(endOffset),
266 ANNIEConstants.SENTENCE_ANNOTATION_TYPE,
267 Factory.newFeatureMap());
268 }
269 //store the new sentence end
270 lastSentenceEnd = nextMatch.end();
271 } catch(InvalidOffsetException e) {
272 // this should never happen
273 throw new ExecutionException(e);
274 }
275 }
276 //prepare for next step
277 if(externalSplitMatcher.find()){
278 externalMatchResult = externalSplitMatcher.toMatchResult();
279 nextSplitMatches.add(externalMatchResult);
280 }else{
281 externalMatchResult = null;
282 }
283 }else{
284 //malfunction
285 throw new ExecutionException("Invalid state - cannot identify match!");
286 }
287 //report progress
288 int newProgress = 100 * lastSentenceEnd / docText.length();
289 if(newProgress - lastProgress > 20){
290 lastProgress = newProgress;
291 fireProgressChanged(lastProgress);
292 }
293 }//while(!nextMatches.isEmpty()){
294 fireProcessFinished();
295 }
296
297
298 /**
299 * Checks whether a possible match is being vetoed by a non split match. A
300 * possible match is vetoed if it any nay overlap with a veto region.
301 *
302 * @param split the match result representing the split to be tested
303 * @param vetoRegions regions where matches are not allowed. For efficiency
304 * reasons, this method assumes these regions to be non overlapping and sorted
305 * in ascending order.
306 * All veto regions that end before the proposed match are also discarded
307 * (again for efficiency reasons). This requires the proposed matches to be
308 * sent to this method in ascending order, so as to avoid malfunctions.
309 * @return <tt>true</tt> iff the proposed split should be ignored
310 */
311 private boolean veto(MatchResult split, List<int[]> vetoRegions){
312 //if no more non splits available, accept everything
313 for(Iterator<int[]> vetoRegIter = vetoRegions.iterator();
314 vetoRegIter.hasNext();){
315 int[] aVetoRegion = vetoRegIter.next();
316 if(aVetoRegion[1] -1 < split.start()){
317 //current veto region ends before the proposed split starts
318 //--> discard the veto region
319 vetoRegIter.remove();
320 }else if(split.end() -1 < aVetoRegion[0]){
321 //veto region starts after the split ends
322 //-> we can return false
323 return false;
324 }else{
325 //we have overlap
326 return true;
327 }
328 }
329 //if we got this far, all veto regions are before the split
330 return false;
331 }
332
333 @Override
334 public Resource init() throws ResourceInstantiationException {
335 super.init();
336 try {
337 //sanity checks
338 if(internalSplitListURL == null)
339 throw new ResourceInstantiationException("No list of internal splits provided!");
340 if(externalSplitListURL == null)
341 throw new ResourceInstantiationException("No list of external splits provided!");
342 if(nonSplitListURL == null)
343 throw new ResourceInstantiationException("No list of non splits provided!");
344 if(encoding == null)
345 throw new ResourceInstantiationException("No encoding provided!");
346
347 //load the known abbreviations list
348 internalSplitsPattern = compilePattern(internalSplitListURL, encoding);
349 externalSplitsPattern = compilePattern(externalSplitListURL, encoding);
350 nonSplitsPattern = compilePattern(nonSplitListURL, encoding);
351 } catch(UnsupportedEncodingException e) {
352 throw new ResourceInstantiationException(e);
353 } catch(IOException e) {
354 throw new ResourceInstantiationException(e);
355 }
356
357 return this;
358 }
359
360 /**
361 * @return the document
362 */
363 public Document getDocument() {
364 return document;
365 }
366
367 /**
368 * @param document the document to set
369 */
370 public void setDocument(Document document) {
371 this.document = document;
372 }
373
374 /**
375 * @return the outputASName
376 */
377 public String getOutputASName() {
378 return outputASName;
379 }
380
381 /**
382 * @param outputASName the outputASName to set
383 */
384 public void setOutputASName(String outputASName) {
385 this.outputASName = outputASName;
386 }
387
388 /**
389 * @return the encoding
390 */
391 public String getEncoding() {
392 return encoding;
393 }
394
395 /**
396 * @param encoding the encoding to set
397 */
398 public void setEncoding(String encoding) {
399 this.encoding = encoding;
400 }
401
402 /**
403 * @return the internalSplitListURL
404 */
405 public URL getInternalSplitListURL() {
406 return internalSplitListURL;
407 }
408
409 /**
410 * @param internalSplitListURL the internalSplitListURL to set
411 */
412 public void setInternalSplitListURL(URL internalSplitListURL) {
413 this.internalSplitListURL = internalSplitListURL;
414 }
415
416 /**
417 * @return the externalSplitListURL
418 */
419 public URL getExternalSplitListURL() {
420 return externalSplitListURL;
421 }
422
423 /**
424 * @param externalSplitListURL the externalSplitListURL to set
425 */
426 public void setExternalSplitListURL(URL externalSplitListURL) {
427 this.externalSplitListURL = externalSplitListURL;
428 }
429
430 /**
431 * @return the nonSplitListURL
432 */
433 public URL getNonSplitListURL() {
434 return nonSplitListURL;
435 }
436
437 /**
438 * @param nonSplitListURL the nonSplitListURL to set
439 */
440 public void setNonSplitListURL(URL nonSplitListURL) {
441 this.nonSplitListURL = nonSplitListURL;
442 }
443
444 /**
445 * @return the internalSplitsPattern
446 */
447 public Pattern getInternalSplitsPattern() {
448 return internalSplitsPattern;
449 }
450
451 /**
452 * @param internalSplitsPattern the internalSplitsPattern to set
453 */
454 public void setInternalSplitsPattern(Pattern internalSplitsPattern) {
455 this.internalSplitsPattern = internalSplitsPattern;
456 }
457 }
|