POSTagger.java
001 /*
002  *  Copyright (c) 1995-2010, The University of Sheffield. See the file
003  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004  *
005  *  This file is part of GATE (see http://gate.ac.uk/), and is free
006  *  software, licenced under the GNU Library General Public License,
007  *  Version 2, June 1991 (in the distribution as file licence.html,
008  *  and also available at http://gate.ac.uk/gate/licence.html).
009  *
010  *  Valentin Tablan, 01 Feb 2000
011  *
012  *  $Id: POSTagger.java 12483 2010-04-14 11:19:12Z johann_p $
013  */
014 
015 package gate.creole;
016 
017 import java.text.NumberFormat;
018 import java.util.*;
019 
020 import gate.*;
021 import gate.creole.metadata.*;
022 import gate.util.GateRuntimeException;
023 import gate.util.OffsetComparator;
024 import org.apache.log4j.Logger;
025 import org.apache.log4j.Level;
026 /**
027  * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
028  */
029 @CreoleResource(name = "ANNIE POS Tagger",
030         helpURL = "http://gate.ac.uk/userguide/sec:annie:tagger",
031         comment = "Mark Hepple's Brill-style POS tagger")
032 public class POSTagger extends AbstractLanguageAnalyser {
033 
034   public static final String
035     TAG_DOCUMENT_PARAMETER_NAME = "document";
036 
037   public static final String
038     TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
039 
040   public static final String
041     TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
042 
043   public static final String
044     TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
045 
046   public static final String
047       TAG_ENCODING_PARAMETER_NAME = "encoding";
048 
049   
050   public static final String
051     BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType";
052 
053   public static final String
054   OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = "outputAnnotationType";
055   
056   public static final String
057   BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType";
058 
059   public static final String
060     TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
061 
062   @RunTime
063   @Optional
064   @CreoleParameter(
065     comment = "Throw and exception when there are none of the required input annotations",
066     defaultValue = "true")  
067   public void setFailOnMissingInputAnnotations(Boolean fail) {
068     failOnMissingInputAnnotations = fail;
069   }
070   public Boolean getFailOnMissingInputAnnotations() {
071     return failOnMissingInputAnnotations;
072   }
073   protected Boolean failOnMissingInputAnnotations = true;
074   
075   public POSTagger() {
076   }
077 
078   protected Logger logger = Logger.getLogger(this.getClass().getName());
079   
080   public Resource init()throws ResourceInstantiationException{
081     if(lexiconURL == null){
082       throw new ResourceInstantiationException(
083         "NoURL provided for the lexicon!");
084     }
085     if(rulesURL == null){
086       throw new ResourceInstantiationException(
087         "No URL provided for the rules!");
088     }
089     try{
090       tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL, encoding);
091     }catch(Exception e){
092       throw new ResourceInstantiationException(e);
093     }
094     return this;
095   }
096 
097 
098   public void execute() throws ExecutionException{
099     //check the parameters
100     if(document == nullthrow new ExecutionException(
101       "No document to process!");
102     if(inputASName != null && inputASName.equals("")) inputASName = null;
103     AnnotationSet inputAS = (inputASName == null?
104                             document.getAnnotations() :
105                             document.getAnnotations(inputASName);
106 
107                            
108     if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length()==0) {
109         throw new ExecutionException("No base Token Annotation Type provided!");
110     }
111 
112     if(outputASName != null && outputASName.equals("")) outputASName = null;
113     AnnotationSet outputAS = (outputASName == null?
114                             document.getAnnotations() :
115                             document.getAnnotations(outputASName);
116     
117     if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length()==0) {
118         throw new ExecutionException("No base Sentence Annotation Type provided!");
119     }
120     
121     if(outputAnnotationType == null || outputAnnotationType.trim().length()==0) {
122         throw new ExecutionException("No AnnotationType provided to store the new feature!");
123     }
124     
125     AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
126     AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
127     if(sentencesAS != null && sentencesAS.size() 0
128        && tokensAS != null && tokensAS.size() 0){
129       long startTime = System.currentTimeMillis();
130       fireStatusChanged("POS tagging " + document.getName());
131       fireProgressChanged(0);
132       //prepare the input for HepTag
133       List sentenceForTagger = new ArrayList();
134       List sentencesForTagger = new ArrayList(1);
135       sentencesForTagger.add(sentenceForTagger);
136 
137       //define a comparator for annotations by start offset
138       Comparator offsetComparator = new OffsetComparator();
139 
140       //read all the tokens and all the sentences
141       List sentencesList = new ArrayList(sentencesAS);
142       Collections.sort(sentencesList, offsetComparator);
143       List tokensList = new ArrayList(tokensAS);
144       Collections.sort(tokensList, offsetComparator);
145 
146       Iterator sentencesIter = sentencesList.iterator();
147       ListIterator tokensIter = tokensList.listIterator();
148 
149       List tokensInCurrentSentence = new ArrayList();
150       Annotation currentToken = (Annotation)tokensIter.next();
151       int sentIndex = 0;
152       int sentCnt = sentencesAS.size();
153       while(sentencesIter.hasNext()){
154         Annotation currentSentence = (Annotation)sentencesIter.next();
155         tokensInCurrentSentence.clear();
156         sentenceForTagger.clear();
157         while(currentToken != null
158               &&
159               currentToken.getEndNode().getOffset().compareTo(
160               currentSentence.getEndNode().getOffset()) <= 0){
161           tokensInCurrentSentence.add(currentToken);
162           sentenceForTagger.add(currentToken.getFeatures().
163                                 get(TOKEN_STRING_FEATURE_NAME));
164           currentToken = (Annotation)(tokensIter.hasNext() ?
165                                      tokensIter.next() null);
166         }
167         //run the POS tagger
168         List taggerList = tagger.runTagger(sentencesForTagger);
169         if(taggerList != null && taggerList.size() 0){
170           List taggerResults = (ListtaggerList.get(0);
171           //add the results
172           //make sure no malfunction occurred
173           if(taggerResults.size() != tokensInCurrentSentence.size())
174             throw new ExecutionException(
175                 "POS Tagger malfunction: the output size (" +
176                 taggerResults.size() +
177                 ") is different from the input size (" +
178                 tokensInCurrentSentence.size() ")!");
179           Iterator resIter = taggerResults.iterator();
180           Iterator tokIter = tokensInCurrentSentence.iterator();
181           while(resIter.hasNext()){
182               Annotation annot = (AnnotationtokIter.next();
183               addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
184           }
185         }
186         fireProgressChanged(sentIndex++ * 100 / sentCnt);
187       }//while(sentencesIter.hasNext())
188 
189       if(currentToken != null){
190         //we have remaining tokens after the last sentence
191         tokensInCurrentSentence.clear();
192         sentenceForTagger.clear();
193         while(currentToken != null){
194           tokensInCurrentSentence.add(currentToken);
195           sentenceForTagger.add(currentToken.getFeatures().
196                                 get(TOKEN_STRING_FEATURE_NAME));
197           currentToken = (Annotation)(tokensIter.hasNext() ?
198                                       tokensIter.next() null);
199         }
200         //run the POS tagger
201         List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
202         //add the results
203         //make sure no malfunction occurred
204         if(taggerResults.size() != tokensInCurrentSentence.size())
205           throw new ExecutionException(
206               "POS Tagger malfunction: the output size (" +
207               taggerResults.size() +
208               ") is different from the input size (" +
209               tokensInCurrentSentence.size() ")!");
210         Iterator resIter = taggerResults.iterator();
211         Iterator tokIter = tokensInCurrentSentence.iterator();
212         while(resIter.hasNext()){
213             Annotation annot = (AnnotationtokIter.next();
214             addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
215         }
216       }//if(currentToken != null)
217       fireProcessFinished();
218       fireStatusChanged(
219         document.getName() " tagged in " +
220         NumberFormat.getInstance().format(
221         (double)(System.currentTimeMillis() - startTime1000+
222         " seconds!");
223     }else{
224       if(failOnMissingInputAnnotations) {
225         throw new ExecutionException("No sentences or tokens to process in document "+document.getName()+"\n" +
226                                      "Please run a sentence splitter "+
227                                      "and tokeniser first!");
228       else {
229         Utils.logOnce(logger,Level.INFO,"POS tagger: no sentence or token annotations in input document - see debug log for details.");
230         logger.debug("No input annotations in document "+document.getName());
231       }
232     }
233 
234 //OLD version
235 /*
236     AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
237     if(as != null && as.size() > 0){
238       List sentences = new ArrayList(as);
239       Collections.sort(sentences, offsetComparator);
240       Iterator sentIter = sentences.iterator();
241       int sentIndex = 0;
242       int sentCnt = sentences.size();
243       long startTime= System.currentTimeMillis();
244       while(sentIter.hasNext()){
245 start = System.currentTimeMillis();
246         Annotation sentenceAnn = (Annotation)sentIter.next();
247         AnnotationSet rangeSet = inputAS.get(
248                                   sentenceAnn.getStartNode().getOffset(),
249                                   sentenceAnn.getEndNode().getOffset());
250         if(rangeSet == null) continue;
251         AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
252         if(tokensSet == null) continue;
253         List tokens = new ArrayList(tokensSet);
254         Collections.sort(tokens, offsetComparator);
255 
256 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
257         List sentence = new ArrayList(tokens.size());
258         Iterator tokIter = tokens.iterator();
259         while(tokIter.hasNext()){
260           Annotation token = (Annotation)tokIter.next();
261           String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
262           sentence.add(text);
263         }//while(tokIter.hasNext())
264 
265         //run the POSTagger over this sentence
266         List sentences4tagger = new ArrayList(1);
267         sentences4tagger.add(sentence);
268 prepTime += System.currentTimeMillis() - start;
269 start = System.currentTimeMillis();
270         List taggerResults = tagger.runTagger(sentences4tagger);
271 posTime += System.currentTimeMillis() - start;
272 start = System.currentTimeMillis();
273         //add the results to the output annotation set
274         //we only get one sentence
275         List sentenceFromTagger = (List)taggerResults.get(0);
276         if(sentenceFromTagger.size() != sentence.size()){
277           String taggerResult = "";
278           for(int i = 0; i< sentenceFromTagger.size(); i++){
279             taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
280           }
281           throw new GateRuntimeException(
282             "POS Tagger malfunction: the output size (" +
283             sentenceFromTagger.size() +
284             ") is different from the input size (" +
285             sentence.size() + ")!" +
286             "\n Input: " + sentence + "\nOutput: " + taggerResult);
287         }
288         for(int i = 0; i< sentence.size(); i++){
289           String category = ((String[])sentenceFromTagger.get(i))[1];
290           Annotation token = (Annotation)tokens.get(i);
291           token.getFeatures().
292             put(TOKEN_CATEGORY_FEATURE_NAME, category);
293         }//for(i = 0; i<= sentence.size(); i++)
294 postTime += System.currentTimeMillis() - start;
295         fireProgressChanged(sentIndex++ * 100 / sentCnt);
296       }//while(sentIter.hasNext())
297 Out.prln("POS preparation time:" + prepTime);
298 Out.prln("POS execution time:" + posTime);
299 Out.prln("POS after execution time:" + postTime);
300         fireProcessFinished();
301         long endTime = System.currentTimeMillis();
302         fireStatusChanged(document.getName() + " tagged in " +
303                         NumberFormat.getInstance().format(
304                         (double)(endTime - startTime) / 1000) + " seconds!");
305     }else{
306       throw new GateRuntimeException("No sentences to process!\n" +
307                                      "Please run a sentence splitter first!");
308     }//if(as != null && as.size() > 0)
309 */
310   }
311 
312 
313   protected void addFeatures(Annotation annot, String featureName, String featureValuethrows GateRuntimeException {
314       String tempIASN = inputASName == null "" : inputASName;
315       String tempOASN = outputASName == null "" : outputASName;
316       if(outputAnnotationType.equals(baseTokenAnnotationType&& tempIASN.equals(tempOASN)) {
317           annot.getFeatures().put(featureName, featureValue);
318           return;
319       else {
320           int start = annot.getStartNode().getOffset().intValue();
321           int end = annot.getEndNode().getOffset().intValue();
322           
323           // get the annotations of type outputAnnotationType
324           AnnotationSet outputAS = (outputASName == null?
325                   document.getAnnotations() :
326                   document.getAnnotations(outputASName);
327           AnnotationSet annotations = outputAS.get(outputAnnotationType);
328           if(annotations == null || annotations.size() == 0) {
329               // add new annotation
330               FeatureMap features = Factory.newFeatureMap();
331               features.put(featureName, featureValue);
332               try {
333                   outputAS.add(new Long(start)new Long(end), outputAnnotationType, features);
334               catch(Exception e) {
335                   throw new GateRuntimeException("Invalid Offsets");
336               }
337           else {
338               // search for the annotation if there is one with the same start and end offsets
339               ArrayList tempList = new ArrayList(annotations.get());
340               boolean found = false;
341               for(int i=0;i<tempList.size();i++) {
342                   Annotation annotation = (AnnotationtempList.get(i);
343                   if(annotation.getStartNode().getOffset().intValue() == start && annotation.getEndNode().getOffset().intValue() == end) {
344                       // this is the one
345                       annotation.getFeatures().put(featureName, featureValue);
346                       found = true;
347                       break;
348                   }
349               }
350               
351               if(!found) {
352                   // add new annotation
353                   FeatureMap features = Factory.newFeatureMap();
354                   features.put(featureName, featureValue);
355                   try {
356                       outputAS.add(new Long(start)new Long(end), outputAnnotationType, features);
357                   catch(Exception e) {
358                       throw new GateRuntimeException("Invalid Offsets");
359                   }
360               }
361           }
362       }
363   }
364   
365   public void setLexiconURL(java.net.URL newLexiconURL) {
366     lexiconURL = newLexiconURL;
367   }
368   public java.net.URL getLexiconURL() {
369     return lexiconURL;
370   }
371   public void setRulesURL(java.net.URL newRulesURL) {
372     rulesURL = newRulesURL;
373   }
374   public void setEncoding(String encoding) {
375     this.encoding = encoding;
376   }
377 
378   public java.net.URL getRulesURL() {
379     return rulesURL;
380   }
381   public void setInputASName(String newInputASName) {
382     inputASName = newInputASName;
383   }
384   public String getInputASName() {
385     return inputASName;
386   }
387   public String getEncoding() {
388     return this.encoding;
389   }
390 
391   public String getBaseTokenAnnotationType() {
392       return this.baseTokenAnnotationType;
393   }
394   
395   public String getBaseSentenceAnnotationType() {
396       return this.baseSentenceAnnotationType;
397   }
398   
399   public String getOutputAnnotationType() {
400       return this.outputAnnotationType;
401   }
402   
403   public void setBaseTokenAnnotationType(String baseTokenAnnotationType) {
404       this.baseTokenAnnotationType = baseTokenAnnotationType;
405   }
406   
407   public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) {
408       this.baseSentenceAnnotationType = baseSentenceAnnotationtype;
409   }
410   
411   public void setOutputAnnotationType(String outputAnnotationType) {
412       this.outputAnnotationType = outputAnnotationType;
413   }
414   
415   public String getOutputASName() {
416       return this.outputASName;
417   }
418   
419   public void setOutputASName(String outputASName) {
420       this.outputASName = outputASName;
421   }
422   
423   protected hepple.postag.POSTagger tagger;
424   private java.net.URL lexiconURL;
425   private java.net.URL rulesURL;
426   private String inputASName;
427   private String encoding;
428   private String baseTokenAnnotationType;
429   private String baseSentenceAnnotationType;
430   private String outputAnnotationType;
431   private String outputASName;
432 }