01 /*
02 * Lexicon.java
03 *
04 * Copyright (c) 1995-2010, The University of Sheffield. See the file
05 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
06 *
07 * This file is part of GATE (see http://gate.ac.uk/), and is free
08 * software, licenced under the GNU Library General Public License,
09 * Version 2, June 1991 (in the distribution as file licence.html,
10 * and also available at http://gate.ac.uk/gate/licence.html).
11 *
12 * HepTag was originally written by Mark Hepple, this version contains
13 * modifications by Valentin Tablan and Niraj Aswani.
14 *
15 * $Id: Lexicon.java 12919 2010-08-03 10:31:37Z valyt $
16 */
17 package hepple.postag;
18
19 /**
20 * Title: HepTag
21 * Description: Mark Hepple's POS tagger
22 * Copyright: Copyright (c) 2001
23 * Company: University of Sheffield
24 * @author Mark Hepple
25 * @version 1.0
26 */
27
28 import gate.util.BomStrippingInputStreamReader;
29
30 import java.util.*;
31 import java.io.*;
32 import java.net.URL;
33
34 /**
35 * A {@link java.util.HashMap} that maps from lexical entry
36 * ({@link java.lang.String}) to possible POS categories
37 * ({@link java.util.List}
38 */
39 class Lexicon extends HashMap {
40
41 private String encoding;
42
43
44 /**
45 * @deprecated The lexicon file is read at construction time, so setting the
46 * encoding later will have no effect. Use the two argument constructor to
47 * set the encoding.
48 */
49 public void setEncoding(String encoding) {
50 throw new IllegalStateException("Cannot change encoding once POS tagger "
51 + "has been constructed. Use the three "
52 + "argument constructor to specify "
53 + "encoding.");
54 }
55
56 /**
57 * Constructor.
58 * @param lexiconURL an URL for the file contianing the lexicon.
59 */
60 public Lexicon(URL lexiconURL) throws IOException{
61 this(lexiconURL, null);
62 }
63
64 /**
65 * Constructor.
66 * @param lexiconURL an URL for the file contianing the lexicon.
67 * @param encoding the character encoding to use for reading the lexicon.
68 */
69 public Lexicon(URL lexiconURL, String encoding) throws IOException{
70 this.encoding = encoding;
71 String line;
72 BufferedReader lexiconReader;
73 if(encoding == null) {
74 lexiconReader = new BomStrippingInputStreamReader(lexiconURL.openStream());
75 } else {
76 lexiconReader = new BomStrippingInputStreamReader(lexiconURL.openStream(),encoding);
77 }
78
79 line = lexiconReader.readLine();
80 String entry;
81 List categories;
82 while(line != null){
83 StringTokenizer tokens = new StringTokenizer(line);
84 entry = tokens.nextToken();
85 categories = new ArrayList();
86 while(tokens.hasMoreTokens()) categories.add(tokens.nextToken());
87 put(entry, categories);
88
89 line = lexiconReader.readLine();
90 }//while(line != null)
91 }//public Lexicon(URL lexiconURL) throws IOException
92
93 }//class Lexicon
|