001 package gate.creole.morph;
002
003
004 /*
005 * Morph.java
006 *
007 * Copyright (c) 1998-2005, The University of Sheffield.
008 *
009 * This file is part of GATE (see http://gate.ac.uk/), and is free
010 * software, licenced under the GNU Library General Public License,
011 * Version 2, June1991.
012 *
013 * A copy of this licence is included in the distribution in the file
014 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
015 *
016 * Niraj Aswani, 13/10/2003
017 *
018 * $Id: Morph.java 12483 2010-04-14 11:19:12Z johann_p $
019 */
020
021
022 import java.net.URL;
023 import java.util.Iterator;
024
025 import org.apache.log4j.Logger;
026 import org.apache.log4j.Level;
027
028 import gate.*;
029 import gate.creole.*;
030 import gate.creole.metadata.*;
031 import gate.util.GateRuntimeException;
032
033 /**
034 * Description: This class is a wrapper for {@link gate.creole.morph.Interpret},
035 * the Morphological Analyzer.
036 */
037 @CreoleResource(name = "GATE Morphological analyser",
038 helpURL = "http://gate.ac.uk/userguide/sec:parsers:morpher",
039 comment = "Morphological Analyzer for the English Language")
040 public class Morph
041 extends AbstractLanguageAnalyser
042 implements ProcessingResource {
043
044
045 /** Document to be processed by the morpher, must be provided at Runtime. */
046 protected gate.Document document;
047
048 /** File which contains rules to be processed */
049 protected URL rulesFile;
050
051 /** Instance of BaseWord class - English Morpher */
052 protected Interpret interpret;
053
054 /** Feature Name that should be displayed for the root word */
055 protected String rootFeatureName;
056
057 /** Feature Name that should be displayed for the affix */
058 protected String affixFeatureName;
059
060 /** The name of the annotation set used for input */
061 protected String annotationSetName;
062
063 /** Boolean value that tells if parser should behave in caseSensitive mode */
064 protected Boolean caseSensitive;
065
066 protected Boolean considerPOSTag;
067
068 @RunTime
069 @Optional
070 @CreoleParameter(
071 comment = "Throw and exception when there are none of the required input annotations",
072 defaultValue = "true")
073 public void setFailOnMissingInputAnnotations(Boolean fail) {
074 failOnMissingInputAnnotations = fail;
075 }
076 public Boolean getFailOnMissingInputAnnotations() {
077 return failOnMissingInputAnnotations;
078 }
079 protected Boolean failOnMissingInputAnnotations = false;
080
081 protected Logger logger = Logger.getLogger(this.getClass().getName());
082
083 /** Default Constructor */
084 public Morph() {
085 }
086
087 /**
088 * This method creates the instance of the BaseWord - English Morpher and
089 * returns the instance of current class with different attributes and
090 * the instance of BaseWord class wrapped into it.
091 * @return Resource
092 * @throws ResourceInstantiationException
093 */
094 public Resource init() throws ResourceInstantiationException {
095 interpret = new Interpret();
096 if (rulesFile == null) {
097 // no rule file is there, simply run the interpret to interpret it and
098 throw new ResourceInstantiationException("\n\n No Rule File Provided");
099 }
100
101 fireStatusChanged("Reading Rule File...");
102 // compile the rules
103 interpret.init(rulesFile);
104 fireStatusChanged("Morpher created!");
105 fireProcessFinished();
106 return this;
107 }
108
109 /**
110 * Method is executed after the init() method has finished its execution.
111 * <BR>Method does the following operations:
112 * <OL type="1">
113 * <LI> creates the annotationSet
114 * <LI> fetches word tokens from the document, one at a time
115 * <LI> runs the morpher on each individual word token
116 * <LI> finds the root and the affix for that word
117 * <LI> adds them as features to the current token
118 * @throws ExecutionException
119 */
120 public void execute() throws ExecutionException {
121 // lets start the progress and initialize the progress counter
122 fireProgressChanged(0);
123
124 // If no document provided to process throw an exception
125 if (document == null) {
126 fireProcessFinished();
127 throw new GateRuntimeException("No document to process!");
128 }
129
130 // get the annotationSet name provided by the user, or otherwise use the
131 // default method
132 AnnotationSet inputAs = (annotationSetName == null ||
133 annotationSetName.length() == 0) ?
134 document.getAnnotations() :
135 document.getAnnotations(annotationSetName);
136
137 // Morpher requires English tokenizer to be run before running the Morpher
138 // Fetch tokens from the document
139 AnnotationSet tokens = inputAs.get(TOKEN_ANNOTATION_TYPE);
140 if (tokens == null || tokens.isEmpty()) {
141 fireProcessFinished();
142 if(failOnMissingInputAnnotations) {
143 throw new ExecutionException("Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher");
144 } else {
145 Utils.logOnce(logger,Level.INFO,"Morphological analyser: either a document does not have any contents or run the POS Tagger first - see debug log for details.");
146 logger.debug("No input annotations in document "+document.getName());
147 return;
148 }
149 //javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher"); ;
150 //return;
151 }
152
153 // create iterator to get access to each and every individual token
154 Iterator<Annotation> tokensIter = tokens.iterator();
155
156 // variables used to keep track on progress
157 int tokenSize = tokens.size();
158 int tokensProcessed = 0;
159 int lastReport = 0;
160
161 //lets process each token one at a time
162 while (tokensIter != null && tokensIter.hasNext()) {
163 Annotation currentToken = tokensIter.next();
164 String tokenValue = (String) (currentToken.getFeatures().
165 get(TOKEN_STRING_FEATURE_NAME));
166 if(considerPOSTag != null && considerPOSTag.booleanValue() && !currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
167 fireProcessFinished();
168 if(failOnMissingInputAnnotations) {
169 throw new ExecutionException("please run the POS Tagger first and then Morpher");
170 } else {
171 Utils.logOnce(logger,Level.INFO,"Morphological analyser: no input annotations, run the POS Tagger first - see debug log for details.");
172 logger.debug("No input annotations in document "+document.getName());
173 return;
174 }
175 //javax.swing.JOptionPane.showMessageDialog(MainFrame.getInstance(), "please run the POS Tagger first and then Morpher"); ;
176 //return;
177 }
178
179 String posCategory = (String) (currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME));
180 if(posCategory == null) {
181 posCategory = "*";
182 }
183
184 if(considerPOSTag == null || !considerPOSTag.booleanValue()) {
185 posCategory = "*";
186 }
187
188 // run the Morpher
189 if(!caseSensitive.booleanValue()) {
190 tokenValue = tokenValue.toLowerCase();
191 }
192
193 String baseWord = interpret.runMorpher(tokenValue, posCategory);
194 String affixWord = interpret.getAffix();
195
196 // no need to add affix feature if it is null
197 if (affixWord != null) {
198 currentToken.getFeatures().put(affixFeatureName, affixWord);
199 }
200 // add the root word as a feature
201 currentToken.getFeatures().put(rootFeatureName, baseWord);
202
203 // measure the progress and update every after 100 tokens
204 tokensProcessed++;
205 if(tokensProcessed - lastReport > 100){
206 lastReport = tokensProcessed;
207 fireProgressChanged(tokensProcessed * 100 /tokenSize);
208 }
209 }
210 // process finished, acknowledge user about this.
211 fireProcessFinished();
212 }
213
214 // getter and setter method
215 /**
216 * Sets the document to be processed
217 * @param document - document to be processed
218 */
219 public void setDocument(gate.Document document) {
220 this.document = document;
221 }
222
223
224 /**
225 * This method should only be called after init()
226 * @param word
227 * @return the rootWord
228 */
229 public String findBaseWord(String word, String cat) {
230 return interpret.runMorpher(word, cat);
231 }
232
233 /**
234 * This method should only be called after init()
235 * @param word
236 * @return the afix of the rootWord
237 */
238 public String findAffix(String word, String cat) {
239 interpret.runMorpher(word, cat);
240 return interpret.getAffix();
241 }
242
243
244 /**
245 * Returns the document under process
246 */
247 public gate.Document getDocument() {
248 return this.document;
249 }
250
251 /**
252 * Sets the rule file to be processed
253 * @param rulesFileURL - rule File name to be processed
254 */
255 public void setRulesFile(URL rulesFile) {
256 this.rulesFile = rulesFile;
257 }
258
259 /**
260 * Returns the document under process
261 */
262 public URL getRulesFile() {
263 return this.rulesFile;
264 }
265
266 /**
267 * Returns the feature name that has been currently set to display the root
268 * word
269 */
270 public String getRootFeatureName() {
271 return rootFeatureName;
272 }
273
274 /**
275 * Sets the feature name that should be displayed for the root word
276 * @param rootFeatureName
277 */
278 public void setRootFeatureName(String rootFeatureName) {
279 this.rootFeatureName = rootFeatureName;
280 }
281
282 /**
283 * Returns the feature name that has been currently set to display the affix
284 * word
285 */
286 public String getAffixFeatureName() {
287 return affixFeatureName;
288 }
289
290 /**
291 * Sets the feature name that should be displayed for the affix
292 * @param affixFeatureName
293 */
294 public void setAffixFeatureName(String affixFeatureName) {
295 this.affixFeatureName = affixFeatureName;
296 }
297
298 /**
299 * Returns the name of the AnnotationSet that has been provided to create
300 * the AnnotationSet
301 */
302 public String getAnnotationSetName() {
303 return annotationSetName;
304 }
305
306 /**
307 * Sets the AnnonationSet name, that is used to create the AnnotationSet
308 * @param annotationSetName
309 */
310 public void setAnnotationSetName(String annotationSetName) {
311 this.annotationSetName = annotationSetName;
312 }
313
314 /**
315 * A method which returns if the parser is in caseSenstive mode
316 * @return a {@link Boolean} value.
317 */
318 public Boolean getCaseSensitive() {
319 return this.caseSensitive;
320 }
321
322 /**
323 * Sets the caseSensitive value, that is used to tell parser if it should
324 * convert document to lowercase before parsing
325 */
326 public void setCaseSensitive(java.lang.Boolean value) {
327 this.caseSensitive = value;
328 }
329
330 public Boolean getConsiderPOSTag() {
331 return this.considerPOSTag;
332 }
333
334 public void setConsiderPOSTag(Boolean value) {
335 this.considerPOSTag = value;
336 }
337 }
|