001 /*
002 * Copyright (c) 1995-2010, The University of Sheffield. See the file
003 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004 *
005 * This file is part of GATE (see http://gate.ac.uk/), and is free
006 * software, licenced under the GNU Library General Public License,
007 * Version 2, June 1991 (in the distribution as file licence.html,
008 * and also available at http://gate.ac.uk/gate/licence.html).
009 *
010 * Valentin Tablan, 01 Feb 2000
011 *
012 * $Id: POSTagger.java 12483 2010-04-14 11:19:12Z johann_p $
013 */
014
015 package gate.creole;
016
017 import java.text.NumberFormat;
018 import java.util.*;
019
020 import gate.*;
021 import gate.creole.metadata.*;
022 import gate.util.GateRuntimeException;
023 import gate.util.OffsetComparator;
024 import org.apache.log4j.Logger;
025 import org.apache.log4j.Level;
026 /**
027 * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
028 */
029 @CreoleResource(name = "ANNIE POS Tagger",
030 helpURL = "http://gate.ac.uk/userguide/sec:annie:tagger",
031 comment = "Mark Hepple's Brill-style POS tagger")
032 public class POSTagger extends AbstractLanguageAnalyser {
033
034 public static final String
035 TAG_DOCUMENT_PARAMETER_NAME = "document";
036
037 public static final String
038 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
039
040 public static final String
041 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
042
043 public static final String
044 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
045
046 public static final String
047 TAG_ENCODING_PARAMETER_NAME = "encoding";
048
049
050 public static final String
051 BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType";
052
053 public static final String
054 OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = "outputAnnotationType";
055
056 public static final String
057 BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType";
058
059 public static final String
060 TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
061
062 @RunTime
063 @Optional
064 @CreoleParameter(
065 comment = "Throw and exception when there are none of the required input annotations",
066 defaultValue = "true")
067 public void setFailOnMissingInputAnnotations(Boolean fail) {
068 failOnMissingInputAnnotations = fail;
069 }
070 public Boolean getFailOnMissingInputAnnotations() {
071 return failOnMissingInputAnnotations;
072 }
073 protected Boolean failOnMissingInputAnnotations = true;
074
075 public POSTagger() {
076 }
077
078 protected Logger logger = Logger.getLogger(this.getClass().getName());
079
080 public Resource init()throws ResourceInstantiationException{
081 if(lexiconURL == null){
082 throw new ResourceInstantiationException(
083 "NoURL provided for the lexicon!");
084 }
085 if(rulesURL == null){
086 throw new ResourceInstantiationException(
087 "No URL provided for the rules!");
088 }
089 try{
090 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL, encoding);
091 }catch(Exception e){
092 throw new ResourceInstantiationException(e);
093 }
094 return this;
095 }
096
097
098 public void execute() throws ExecutionException{
099 //check the parameters
100 if(document == null) throw new ExecutionException(
101 "No document to process!");
102 if(inputASName != null && inputASName.equals("")) inputASName = null;
103 AnnotationSet inputAS = (inputASName == null) ?
104 document.getAnnotations() :
105 document.getAnnotations(inputASName);
106
107
108 if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length()==0) {
109 throw new ExecutionException("No base Token Annotation Type provided!");
110 }
111
112 if(outputASName != null && outputASName.equals("")) outputASName = null;
113 AnnotationSet outputAS = (outputASName == null) ?
114 document.getAnnotations() :
115 document.getAnnotations(outputASName);
116
117 if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length()==0) {
118 throw new ExecutionException("No base Sentence Annotation Type provided!");
119 }
120
121 if(outputAnnotationType == null || outputAnnotationType.trim().length()==0) {
122 throw new ExecutionException("No AnnotationType provided to store the new feature!");
123 }
124
125 AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
126 AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
127 if(sentencesAS != null && sentencesAS.size() > 0
128 && tokensAS != null && tokensAS.size() > 0){
129 long startTime = System.currentTimeMillis();
130 fireStatusChanged("POS tagging " + document.getName());
131 fireProgressChanged(0);
132 //prepare the input for HepTag
133 List sentenceForTagger = new ArrayList();
134 List sentencesForTagger = new ArrayList(1);
135 sentencesForTagger.add(sentenceForTagger);
136
137 //define a comparator for annotations by start offset
138 Comparator offsetComparator = new OffsetComparator();
139
140 //read all the tokens and all the sentences
141 List sentencesList = new ArrayList(sentencesAS);
142 Collections.sort(sentencesList, offsetComparator);
143 List tokensList = new ArrayList(tokensAS);
144 Collections.sort(tokensList, offsetComparator);
145
146 Iterator sentencesIter = sentencesList.iterator();
147 ListIterator tokensIter = tokensList.listIterator();
148
149 List tokensInCurrentSentence = new ArrayList();
150 Annotation currentToken = (Annotation)tokensIter.next();
151 int sentIndex = 0;
152 int sentCnt = sentencesAS.size();
153 while(sentencesIter.hasNext()){
154 Annotation currentSentence = (Annotation)sentencesIter.next();
155 tokensInCurrentSentence.clear();
156 sentenceForTagger.clear();
157 while(currentToken != null
158 &&
159 currentToken.getEndNode().getOffset().compareTo(
160 currentSentence.getEndNode().getOffset()) <= 0){
161 tokensInCurrentSentence.add(currentToken);
162 sentenceForTagger.add(currentToken.getFeatures().
163 get(TOKEN_STRING_FEATURE_NAME));
164 currentToken = (Annotation)(tokensIter.hasNext() ?
165 tokensIter.next() : null);
166 }
167 //run the POS tagger
168 List taggerList = tagger.runTagger(sentencesForTagger);
169 if(taggerList != null && taggerList.size() > 0){
170 List taggerResults = (List) taggerList.get(0);
171 //add the results
172 //make sure no malfunction occurred
173 if(taggerResults.size() != tokensInCurrentSentence.size())
174 throw new ExecutionException(
175 "POS Tagger malfunction: the output size (" +
176 taggerResults.size() +
177 ") is different from the input size (" +
178 tokensInCurrentSentence.size() + ")!");
179 Iterator resIter = taggerResults.iterator();
180 Iterator tokIter = tokensInCurrentSentence.iterator();
181 while(resIter.hasNext()){
182 Annotation annot = (Annotation) tokIter.next();
183 addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
184 }
185 }
186 fireProgressChanged(sentIndex++ * 100 / sentCnt);
187 }//while(sentencesIter.hasNext())
188
189 if(currentToken != null){
190 //we have remaining tokens after the last sentence
191 tokensInCurrentSentence.clear();
192 sentenceForTagger.clear();
193 while(currentToken != null){
194 tokensInCurrentSentence.add(currentToken);
195 sentenceForTagger.add(currentToken.getFeatures().
196 get(TOKEN_STRING_FEATURE_NAME));
197 currentToken = (Annotation)(tokensIter.hasNext() ?
198 tokensIter.next() : null);
199 }
200 //run the POS tagger
201 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
202 //add the results
203 //make sure no malfunction occurred
204 if(taggerResults.size() != tokensInCurrentSentence.size())
205 throw new ExecutionException(
206 "POS Tagger malfunction: the output size (" +
207 taggerResults.size() +
208 ") is different from the input size (" +
209 tokensInCurrentSentence.size() + ")!");
210 Iterator resIter = taggerResults.iterator();
211 Iterator tokIter = tokensInCurrentSentence.iterator();
212 while(resIter.hasNext()){
213 Annotation annot = (Annotation) tokIter.next();
214 addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
215 }
216 }//if(currentToken != null)
217 fireProcessFinished();
218 fireStatusChanged(
219 document.getName() + " tagged in " +
220 NumberFormat.getInstance().format(
221 (double)(System.currentTimeMillis() - startTime) / 1000) +
222 " seconds!");
223 }else{
224 if(failOnMissingInputAnnotations) {
225 throw new ExecutionException("No sentences or tokens to process in document "+document.getName()+"\n" +
226 "Please run a sentence splitter "+
227 "and tokeniser first!");
228 } else {
229 Utils.logOnce(logger,Level.INFO,"POS tagger: no sentence or token annotations in input document - see debug log for details.");
230 logger.debug("No input annotations in document "+document.getName());
231 }
232 }
233
234 //OLD version
235 /*
236 AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
237 if(as != null && as.size() > 0){
238 List sentences = new ArrayList(as);
239 Collections.sort(sentences, offsetComparator);
240 Iterator sentIter = sentences.iterator();
241 int sentIndex = 0;
242 int sentCnt = sentences.size();
243 long startTime= System.currentTimeMillis();
244 while(sentIter.hasNext()){
245 start = System.currentTimeMillis();
246 Annotation sentenceAnn = (Annotation)sentIter.next();
247 AnnotationSet rangeSet = inputAS.get(
248 sentenceAnn.getStartNode().getOffset(),
249 sentenceAnn.getEndNode().getOffset());
250 if(rangeSet == null) continue;
251 AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
252 if(tokensSet == null) continue;
253 List tokens = new ArrayList(tokensSet);
254 Collections.sort(tokens, offsetComparator);
255
256 // List tokens = (List)sentenceAnn.getFeatures().get("tokens");
257 List sentence = new ArrayList(tokens.size());
258 Iterator tokIter = tokens.iterator();
259 while(tokIter.hasNext()){
260 Annotation token = (Annotation)tokIter.next();
261 String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
262 sentence.add(text);
263 }//while(tokIter.hasNext())
264
265 //run the POSTagger over this sentence
266 List sentences4tagger = new ArrayList(1);
267 sentences4tagger.add(sentence);
268 prepTime += System.currentTimeMillis() - start;
269 start = System.currentTimeMillis();
270 List taggerResults = tagger.runTagger(sentences4tagger);
271 posTime += System.currentTimeMillis() - start;
272 start = System.currentTimeMillis();
273 //add the results to the output annotation set
274 //we only get one sentence
275 List sentenceFromTagger = (List)taggerResults.get(0);
276 if(sentenceFromTagger.size() != sentence.size()){
277 String taggerResult = "";
278 for(int i = 0; i< sentenceFromTagger.size(); i++){
279 taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
280 }
281 throw new GateRuntimeException(
282 "POS Tagger malfunction: the output size (" +
283 sentenceFromTagger.size() +
284 ") is different from the input size (" +
285 sentence.size() + ")!" +
286 "\n Input: " + sentence + "\nOutput: " + taggerResult);
287 }
288 for(int i = 0; i< sentence.size(); i++){
289 String category = ((String[])sentenceFromTagger.get(i))[1];
290 Annotation token = (Annotation)tokens.get(i);
291 token.getFeatures().
292 put(TOKEN_CATEGORY_FEATURE_NAME, category);
293 }//for(i = 0; i<= sentence.size(); i++)
294 postTime += System.currentTimeMillis() - start;
295 fireProgressChanged(sentIndex++ * 100 / sentCnt);
296 }//while(sentIter.hasNext())
297 Out.prln("POS preparation time:" + prepTime);
298 Out.prln("POS execution time:" + posTime);
299 Out.prln("POS after execution time:" + postTime);
300 fireProcessFinished();
301 long endTime = System.currentTimeMillis();
302 fireStatusChanged(document.getName() + " tagged in " +
303 NumberFormat.getInstance().format(
304 (double)(endTime - startTime) / 1000) + " seconds!");
305 }else{
306 throw new GateRuntimeException("No sentences to process!\n" +
307 "Please run a sentence splitter first!");
308 }//if(as != null && as.size() > 0)
309 */
310 }
311
312
313 protected void addFeatures(Annotation annot, String featureName, String featureValue) throws GateRuntimeException {
314 String tempIASN = inputASName == null ? "" : inputASName;
315 String tempOASN = outputASName == null ? "" : outputASName;
316 if(outputAnnotationType.equals(baseTokenAnnotationType) && tempIASN.equals(tempOASN)) {
317 annot.getFeatures().put(featureName, featureValue);
318 return;
319 } else {
320 int start = annot.getStartNode().getOffset().intValue();
321 int end = annot.getEndNode().getOffset().intValue();
322
323 // get the annotations of type outputAnnotationType
324 AnnotationSet outputAS = (outputASName == null) ?
325 document.getAnnotations() :
326 document.getAnnotations(outputASName);
327 AnnotationSet annotations = outputAS.get(outputAnnotationType);
328 if(annotations == null || annotations.size() == 0) {
329 // add new annotation
330 FeatureMap features = Factory.newFeatureMap();
331 features.put(featureName, featureValue);
332 try {
333 outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
334 } catch(Exception e) {
335 throw new GateRuntimeException("Invalid Offsets");
336 }
337 } else {
338 // search for the annotation if there is one with the same start and end offsets
339 ArrayList tempList = new ArrayList(annotations.get());
340 boolean found = false;
341 for(int i=0;i<tempList.size();i++) {
342 Annotation annotation = (Annotation) tempList.get(i);
343 if(annotation.getStartNode().getOffset().intValue() == start && annotation.getEndNode().getOffset().intValue() == end) {
344 // this is the one
345 annotation.getFeatures().put(featureName, featureValue);
346 found = true;
347 break;
348 }
349 }
350
351 if(!found) {
352 // add new annotation
353 FeatureMap features = Factory.newFeatureMap();
354 features.put(featureName, featureValue);
355 try {
356 outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
357 } catch(Exception e) {
358 throw new GateRuntimeException("Invalid Offsets");
359 }
360 }
361 }
362 }
363 }
364
365 public void setLexiconURL(java.net.URL newLexiconURL) {
366 lexiconURL = newLexiconURL;
367 }
368 public java.net.URL getLexiconURL() {
369 return lexiconURL;
370 }
371 public void setRulesURL(java.net.URL newRulesURL) {
372 rulesURL = newRulesURL;
373 }
374 public void setEncoding(String encoding) {
375 this.encoding = encoding;
376 }
377
378 public java.net.URL getRulesURL() {
379 return rulesURL;
380 }
381 public void setInputASName(String newInputASName) {
382 inputASName = newInputASName;
383 }
384 public String getInputASName() {
385 return inputASName;
386 }
387 public String getEncoding() {
388 return this.encoding;
389 }
390
391 public String getBaseTokenAnnotationType() {
392 return this.baseTokenAnnotationType;
393 }
394
395 public String getBaseSentenceAnnotationType() {
396 return this.baseSentenceAnnotationType;
397 }
398
399 public String getOutputAnnotationType() {
400 return this.outputAnnotationType;
401 }
402
403 public void setBaseTokenAnnotationType(String baseTokenAnnotationType) {
404 this.baseTokenAnnotationType = baseTokenAnnotationType;
405 }
406
407 public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) {
408 this.baseSentenceAnnotationType = baseSentenceAnnotationtype;
409 }
410
411 public void setOutputAnnotationType(String outputAnnotationType) {
412 this.outputAnnotationType = outputAnnotationType;
413 }
414
415 public String getOutputASName() {
416 return this.outputASName;
417 }
418
419 public void setOutputASName(String outputASName) {
420 this.outputASName = outputASName;
421 }
422
423 protected hepple.postag.POSTagger tagger;
424 private java.net.URL lexiconURL;
425 private java.net.URL rulesURL;
426 private String inputASName;
427 private String encoding;
428 private String baseTokenAnnotationType;
429 private String baseSentenceAnnotationType;
430 private String outputAnnotationType;
431 private String outputASName;
432 }
|