001 /*
002 * POSTagger.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * HepTag was originally written by Mark Hepple, this version contains
013 * modifications by Valentin Tablan and Niraj Aswani.
014 *
015 * $Id: POSTagger.java 12919 2010-08-03 10:31:37Z valyt $
016 */
017
018 /*
019 * INSTRUCTIONS for STAND-ALONE USE
020 *
021 * SYNOPSIS
022 * java hepple.postag.POSTagger [options] file1 [file2 ...]
023 * OPTIONS:
024 * -h, --help : displays this message
025 * -l, --lexicon <lexicon file> : uses specified lexicon
026 * -r, --rules <rules file> : uses specified rules
027 *
028 * NOTE: requires gnu.getopt package
029 */
030
031 /**
032 * Title: HepTag
033 * Description: Mark Hepple's POS tagger
034 * Copyright: Copyright (c) 2001
035 * Company: University of Sheffield
036 * @author Mark Hepple
037 * @version 1.0
038 */
039 package hepple.postag;
040
041
042 import java.io.*;
043 import java.net.URL;
044 import java.util.*;
045
046 import gate.util.BomStrippingInputStreamReader;
047 import gnu.getopt.*;
048
049 import hepple.postag.rules.*;
050
051 /**
052 * A Java POS Tagger
053 *
054 * Author: Mark Hepple (hepple@dcs.shef.ac.uk)
055 *
056 * Input: An ascii text file in "Brill input format", i.e. one
057 * sentence per line, tokens separated by spaces.
058 *
059 * Output: Same text with each token tagged, i.e. "token" -> "token/tag".
060 * Output is just streamed to std-output, so commonly will direct
061 * into some target file.
062 *
063 * Revision: 13/9/00. Version 1.0.
064 *
065 * Comments:
066 *
067 * Implements a version of the decision list based tagging method
068 * described in:
069 *
070 * M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
071 * Training and Execution of Rule-based Part-of-Speech Taggers.
072 * Proceedings of the 38th Annual Meeting of the Association for
073 * Computational Linguistics (ACL-2000). Hong Kong, October 2000.
074 *
075 * Modified by Niraj Aswani/Ian Roberts to allow explicit specification of the
076 * character encoding to use when reading rules and lexicon files.
077 *
078 * $Id: POSTagger.java 12919 2010-08-03 10:31:37Z valyt $
079 *
080 */
081
082 public class POSTagger {
083
084 // static final int MAXTAGS = 200;
085
086 protected Map rules;
087 // public Rule[] rules = new Rule[MAXTAGS];
088 // public Rule[] lastRules = new Rule[MAXTAGS];
089
090
091 Lexicon lexicon;
092
093 private String encoding;
094
095 static final String staart = "STAART";
096
097 private String[] staartLex = { staart };
098 private String[] deflex_NNP = { "NNP"};
099 private String[] deflex_JJ = { "JJ"};
100 private String[] deflex_CD = { "CD"};
101 private String[] deflex_NNS = { "NNS"};
102 private String[] deflex_RB = { "RB"};
103 private String[] deflex_VBG = { "VBG"};
104 private String[] deflex_NN = { "NN"};
105
106 public String[] wordBuff = { staart,staart,staart,staart,
107 staart,staart,staart };
108
109 public String[] tagBuff = { staart,staart,staart,staart,
110 staart,staart,staart };
111 public String[][] lexBuff = { staartLex,staartLex,staartLex,
112 staartLex,staartLex,staartLex,
113 staartLex };
114
115 /**
116 * Construct a POS tagger using the platform's native encoding to read the
117 * lexicon and rules files.
118 */
119 public POSTagger(URL lexiconURL, URL rulesURL) throws InvalidRuleException,
120 IOException {
121 this(lexiconURL, rulesURL, null);
122 }
123
124 /**
125 * Construct a POS tagger using the specified encoding to read the lexicon
126 * and rules files.
127 */
128 public POSTagger(URL lexiconURL, URL rulesURL, String encoding) throws InvalidRuleException,
129 IOException{
130 this.encoding = encoding;
131 this.lexicon = new Lexicon(lexiconURL, encoding);
132 rules = new HashMap();
133 readRules(rulesURL);
134 }
135
136 /**
137 * Creates a new rule of the required type according to the provided ID.
138 * @param ruleId the ID for the rule to be created
139 */
140 public Rule createNewRule(String ruleId) throws InvalidRuleException{
141 try{
142 String className = "hepple.postag.rules.Rule_" + ruleId;
143 Class ruleClass = Class.forName(className);
144 return (Rule)ruleClass.newInstance();
145 }catch(Exception e){
146 throw new InvalidRuleException("Could not create rule " + ruleId + "!\n" +
147 e.toString());
148 }
149 }
150
151 /**
152 * Runs the tagger over a set of sentences.
153 * @param sentences a {@link java.util.List} of {@link java.util.List}s
154 * of words to be tagged. Each list is a sentence represented as a list of
155 * words.
156 * @return a {@link java.util.List} of {@link java.util.List}s of
157 * {@link java.lang.String}[]. A list of tagged sentences, each sentence
158 * being itself a list having pairs of strings as elements with
159 * the word on the first position and the tag on the second.
160 */
161 public List runTagger(List sentences){
162 List output = new ArrayList();
163 List taggedSentence = new ArrayList();
164 Iterator sentencesIter = sentences.iterator();
165 while(sentencesIter.hasNext()){
166 List sentence = (List)sentencesIter.next();
167 Iterator wordsIter = sentence.iterator();
168 while(wordsIter.hasNext()){
169 String newWord = (String)wordsIter.next();
170 oneStep(newWord, taggedSentence);
171 }//while(wordsIter.hasNext())
172 //finished adding all the words from a sentence, add six more
173 //staarts to flush all words out of the tagging buffer
174 for(int i = 0; i < 6; i++){
175 oneStep(staart, taggedSentence);
176 }
177 //we have a new finished sentence
178 output.add(taggedSentence);
179 taggedSentence = new ArrayList();
180 }//while(sentencesIter.hasNext())
181 return output;
182 }
183
184
185 /**
186 * This method sets the encoding that POS tagger uses to read rules and the
187 * lexicons.
188 *
189 * @deprecated The rules and lexicon are read at construction time, so
190 * setting the encoding later will have no effect.
191 */
192 public void setEncoding(String encoding) {
193 throw new IllegalStateException("Cannot change encoding once POS tagger "
194 + "has been constructed. Use the three "
195 + "argument constructor to specify "
196 + "encoding.");
197 }
198
199 /**
200 * Adds a new word to the window of 7 words (on the last position) and tags
201 * the word currently in the middle (i.e. on position 3). This function
202 * also reads the word on the first position and adds its tag to the
203 * taggedSentence structure as this word would be lost at the next advance.
204 * If this word completes a sentence then it returns true otherwise it
205 * returns false.
206 * @param word the new word
207 * @param taggedSentence a List of pairs of strings representing the results
208 * of tagging the current sentence so far.
209 * @return returns true if a full sentence is now tagged, otherwise false.
210 */
211 protected boolean oneStep(String word, List taggedSentence){
212 //add the new word at the end of the text window
213 for (int i=1 ; i<7 ; i++) {
214 wordBuff[i-1] = wordBuff[i];
215 tagBuff[i-1] = tagBuff[i];
216 lexBuff[i-1] = lexBuff[i];
217 }
218 wordBuff[6] = word;
219 lexBuff[6] = classifyWord(word);
220 tagBuff[6] = lexBuff[6][0];
221
222 //apply the rules to the word in the middle of the text window
223 //Try to fire a rule for the current lexical entry. It may be the case that
224 //no rule applies.
225 List rulesToApply = (List)rules.get(lexBuff[3][0]);
226 if(rulesToApply != null && rulesToApply.size() > 0){
227 Iterator rulesIter = rulesToApply.iterator();
228 //find the first rule that applies, fire it and stop.
229 while(rulesIter.hasNext() && !((Rule)rulesIter.next()).apply(this)){}
230 }
231
232 //save the tagged word from the first position
233 String taggedWord = wordBuff[0];
234 if(taggedWord != staart){
235 taggedSentence.add(new String[]{taggedWord, tagBuff[0]});
236 if(wordBuff[1] == staart){
237 //wordTag[0] was the end of a sentence
238 return true;
239 }//if(wordBuff[1] == staart)
240 }//if(taggedWord != staart)
241 return false;
242
243 }//protected List oneStep(String word, List taggedSentence)
244
245 /**
246 * Reads the rules from the rules input file
247 */
248 public void readRules(URL rulesURL) throws IOException, InvalidRuleException{
249 BufferedReader rulesReader;
250 if(encoding == null) {
251 rulesReader = new BomStrippingInputStreamReader(rulesURL.
252 openStream());
253 } else {
254 rulesReader = new BomStrippingInputStreamReader(rulesURL.
255 openStream(), this.encoding);
256 }
257
258 String line;
259 Rule newRule;
260
261 line = rulesReader.readLine();
262 while(line != null){
263 List ruleParts = new ArrayList();
264 StringTokenizer tokens = new StringTokenizer(line);
265 while (tokens.hasMoreTokens()) ruleParts.add(tokens.nextToken());
266 if (ruleParts.size() < 3) throw new InvalidRuleException(line);
267
268 newRule = createNewRule((String)ruleParts.get(2));
269 newRule.initialise(ruleParts);
270 List existingRules = (List)rules.get(newRule.from);
271 if(existingRules == null){
272 existingRules = new ArrayList();
273 rules.put(newRule.from, existingRules);
274 }
275 existingRules.add(newRule);
276
277 line = rulesReader.readLine();
278 }//while(line != null)
279 }//public void readRules()
280
281 public void showRules(){
282 System.out.println(rules);
283 }
284
285 /**
286 * Attempts to classify an unknown word.
287 * @param wd the word to be classified
288 */
289 protected String[] classifyWord(String wd){
290 String[] result;
291
292 if (wd == staart) return staartLex;
293
294 List categories = (List)lexicon.get(wd);
295 if(categories != null){
296 result = new String[categories.size()];
297 for(int i = 0; i < result.length; i++){
298 result[i] = (String)categories.get(i);
299 }
300 return result;
301 }
302
303 //no lexical entry for the word. Try to guess
304 if ('A' <= wd.charAt(0) && wd.charAt(0) <= 'Z') return deflex_NNP;
305
306 for (int i=1 ; i < wd.length()-1 ; i++)
307 if (wd.charAt(i) == '-') return deflex_JJ;
308
309 for (int i=0 ; i < wd.length() ; i++)
310 if ('0' <= wd.charAt(i) && wd.charAt(i) <= '9') return deflex_CD;
311
312 if (wd.endsWith("ed") ||
313 wd.endsWith("us") ||
314 wd.endsWith("ic") ||
315 wd.endsWith("ble") ||
316 wd.endsWith("ive") ||
317 wd.endsWith("ary") ||
318 wd.endsWith("ful") ||
319 wd.endsWith("ical") ||
320 wd.endsWith("less")) return deflex_JJ;
321
322 if (wd.endsWith("s")) return deflex_NNS;
323
324 if (wd.endsWith("ly")) return deflex_RB;
325
326 if (wd.endsWith("ing")) return deflex_VBG;
327
328 return deflex_NN;
329 }//private String[] classifyWord(String wd)
330
331
332 /**
333 * Main method. Runs the tagger using the arguments to find the resources
334 * to be used for initialisation and the input file.
335 */
336 public static void main(String[] args){
337 if(args.length == 0) help();
338 try{
339 LongOpt[] options = new LongOpt[]{
340 new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
341 new LongOpt("lexicon", LongOpt.NO_ARGUMENT, null, 'l'),
342 new LongOpt("rules", LongOpt.NO_ARGUMENT, null, 'r')
343 };
344 Getopt getopt = new Getopt("HepTag", args, "hl:r:", options);
345 String lexiconUrlString = null;
346 String rulesUrlString = null;
347 int opt;
348 while( (opt = getopt.getopt()) != -1 ){
349 switch(opt) {
350 // -h
351 case 'h':{
352 help();
353 System.exit(0);
354 break;
355 }
356 // -l new lexicon
357 case 'l':{
358 lexiconUrlString = getopt.getOptarg();
359 break;
360 }
361 // -l new lexicon
362 case 'r':{
363 rulesUrlString = getopt.getOptarg();
364 break;
365 }
366 default:{
367 System.err.println("Invalid option " +
368 args[getopt.getOptind() -1] + "!");
369 System.exit(1);
370 }
371 }//switch(opt)
372 }//while( (opt = g.getopt()) != -1 )
373 String[] fileNames = new String[args.length - getopt.getOptind()];
374 for(int i = getopt.getOptind(); i < args.length; i++){
375 fileNames[i - getopt.getOptind()] = args[i];
376 }
377
378 URL lexiconURL = (lexiconUrlString == null) ?
379 POSTagger.class.
380 getResource("/hepple/resources/sample_lexicon") :
381 new File(lexiconUrlString).toURI().toURL();
382
383 URL rulesURL = (rulesUrlString == null) ?
384 POSTagger.class.
385 getResource("/hepple/resources/sample_ruleset.big") :
386 new File(rulesUrlString).toURI().toURL();
387
388 POSTagger tagger = new POSTagger(lexiconURL, rulesURL);
389
390 for(int i = 0; i < fileNames.length; i++){
391 String file = fileNames[i];
392 BufferedReader reader = new BufferedReader(new FileReader(file));
393 String line = reader.readLine();
394
395 while(line != null){
396 StringTokenizer tokens = new StringTokenizer(line);
397 List sentence = new ArrayList();
398 while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
399 List sentences = new ArrayList();
400 sentences.add(sentence);
401 List result = tagger.runTagger(sentences);
402
403 Iterator iter = result.iterator();
404 while(iter.hasNext()){
405 List sentenceFromTagger = (List)iter.next();
406 Iterator sentIter = sentenceFromTagger.iterator();
407 while(sentIter.hasNext()){
408 String[] tag = (String[])sentIter.next();
409 System.out.print(tag[0] + "/" + tag[1]);
410 if(sentIter.hasNext()) System.out.print(" ");
411 else System.out.println();
412 }//while(sentIter.hasNext())
413 }//while(iter.hasNext())
414 line = reader.readLine();
415 }//while(line != null)
416 //
417 //
418 //
419 // List result = tagger.runTagger(readInput(file));
420 // Iterator iter = result.iterator();
421 // while(iter.hasNext()){
422 // List sentence = (List)iter.next();
423 // Iterator sentIter = sentence.iterator();
424 // while(sentIter.hasNext()){
425 // String[] tag = (String[])sentIter.next();
426 // System.out.print(tag[0] + "/" + tag[1]);
427 // if(sentIter.hasNext()) System.out.print(" ");
428 // else System.out.println();
429 // }//while(sentIter.hasNext())
430 // }//while(iter.hasNext())
431 }//for(int i = 0; i < fileNames.length; i++)
432 }catch(Exception e){
433 e.printStackTrace();
434 }
435 }//public static void main(String[] args)
436
437 /**
438 * Prints the help message
439 */
440 private static void help(){
441 System.out.println(
442 "NAME\n" +
443 "HepTag - a Part-of-Speech tagger\n" +
444 "see http://www.dcs.shef.ac.uk/~hepple/papers/acl00/abstract.html \n\n" +
445 "SYNOPSIS\n\tjava hepple.postag.POSTagger [options] file1 [file2 ...]\n\n" +
446 "OPTIONS:\n" +
447 "-h, --help \n\tdisplays this message\n" +
448 "-l, --lexicon <lexicon file>\n\tuses specified lexicon\n" +
449 "-r, --rules <rules file>\n\tuses specified rules");
450 }
451
452 /**
453 * Reads one input file and creates the structure needed by the tagger
454 * for input.
455 */
456 private static List readInput(String file) throws IOException{
457 BufferedReader reader = new BufferedReader(new FileReader(file));
458 String line = reader.readLine();
459 List result = new ArrayList();
460 while(line != null){
461 StringTokenizer tokens = new StringTokenizer(line);
462 List sentence = new ArrayList();
463 while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
464 result.add(sentence);
465 line = reader.readLine();
466 }//while(line != null)
467 return result;
468 }//private static List readInput(File file) throws IOException
469
470 }//public class POSTagger
|