001 /*
002 * FlexibleGazetteer.java
003 *
004 * Copyright (c) 2004, The University of Sheffield.
005 *
006 * This file is part of GATE (see http://gate.ac.uk/), and is free
007 * software, licenced under the GNU Library General Public License,
008 * Version 2, June1991.
009 *
010 * A copy of this licence is included in the distribution in the file
011 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
012 *
013 * Niraj Aswani 02/2002
014 *
015 */
016
017 package gate.creole.gazetteer;
018
019 import java.util.*;
020 import gate.util.*;
021 import gate.*;
022 import gate.corpora.DocumentImpl;
023 import gate.creole.*;
024
025 /**
026 * <p>
027 * Title: Flexible Gazetteer
028 * </p>
029 * <p>
030 * The Flexible Gazetteer provides users with the flexibility to choose
031 * </p>
032 * <p>
033 * their own customized input and an external Gazetteer. For example,
034 * </p>
035 * <p>
036 * the user might want to replace words in the text with their base
037 * </p>
038 * <p>
039 * forms (which is an output of the Morphological Analyser) or to
040 * segment
041 * </p>
042 * <p>
043 * a Chinese text (using the Chinese Tokeniser) before running the
044 * </p>
045 * <p>
046 * Gazetteer on the Chinese text.
047 * </p>
048 *
049 * <p>
050 * The Flexible Gazetteer performs lookup over a document based on the
051 * </p>
052 * <p>
053 * values of an arbitrary feature of an arbitrary annotation type, by
054 * </p>
055 * <p>
056 * using an externally provided gazetteer. It is important to use an
057 * </p>
058 * <p>
059 * external gazetteer as this allows the use of any type of gazetteer
060 * </p>
061 * <p>
062 * (e.g. an Ontological gazetteer).
063 * </p>
064 *
065 * @author niraj aswani
066 * @version 1.0
067 */
068
069 public class FlexibleGazetteer extends AbstractLanguageAnalyser implements
070 ProcessingResource {
071
072 /**
073 * Constructor
074 */
075 public FlexibleGazetteer() {
076 changedNodes = new ArrayList();
077 }
078
079 /**
080 * Does the actual loading and parsing of the lists. This method must
081 * be called before the gazetteer can be used
082 */
083 public Resource init() throws ResourceInstantiationException {
084 // check for parameters
085 if(gazetteerInst == null)
086 throw new ResourceInstantiationException("No Gazetteer Provided!");
087
088 return this;
089 }
090
091 /**
092 * This method runs the gazetteer. It assumes that all the needed
093 * parameters are set. If they are not, an exception will be fired.
094 */
095 public void execute() throws ExecutionException {
096 changedNodes = new ArrayList();
097 fireProgressChanged(0);
098 fireStatusChanged("Checking Document...");
099 if(document == null) {
100 throw new ExecutionException("No document to process!");
101 }
102
103 fireStatusChanged("Creating temporary Document...");
104 StringBuffer newdocString = new StringBuffer(document.getContent()
105 .toString());
106 Document tempDoc = null;
107 boolean chineseSplit = false;
108
109 if(inputFeatureNames == null || inputFeatureNames.size() == 0) {
110 inputFeatureNames = new ArrayList();
111 }
112
113 Iterator tokenIter = getSortedAnnotationIterator(document, inputAnnotationSetName);
114 long totalDeductedSpaces = 0;
115 fireStatusChanged("Replacing contents with the feature value...");
116
117 outer: while(tokenIter != null && tokenIter.hasNext()) {
118 Annotation currentToken = (Annotation)tokenIter.next();
119
120 // check if it is a chinesesplit
121 // if it is, replace no space character with a single space
122 if(currentToken.getType().equals(
123 ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE)
124 && ((String)(currentToken.getFeatures()
125 .get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME)))
126 .equals("ChineseSplit")) {
127
128 // for chinese split startnode and end node are same
129 long startOffset = currentToken.getStartNode().getOffset().longValue();
130
131 // because we are adding a space in place of chinesesplit
132 // the endoffset will become newStartOffset + 1
133 long newStartOffset = startOffset - totalDeductedSpaces;
134 long newEndOffset = newStartOffset + 1;
135 NodePosition newNode = new NodePosition(startOffset, startOffset,
136 newStartOffset, newEndOffset, totalDeductedSpaces);
137 chineseSplit = true;
138
139 // here is the addition of space in the document
140 totalDeductedSpaces--;
141 changedNodes.add(newNode);
142 newdocString = newdocString.insert((int)newStartOffset, ' ');
143 continue outer;
144 } // chineseSplit if
145
146 // search in the provided inputFeaturesNames
147 // if the current annotation has a feature value that user
148 // wants to paste on and replace the original string
149 inner: for(int i = 0; i < inputFeatureNames.size(); i++) {
150 String[] keyVal = ((String)(inputFeatureNames.get(i))).split("[.]");
151
152 if(keyVal.length == 2) {
153 // val is the feature name
154 // key is the annotationName
155 if(currentToken.getType().equals(keyVal[0])) {
156 FeatureMap features = currentToken.getFeatures();
157 String newTokenValue = (String)(features.get(keyVal[1]));
158
159 // what if provided feature doesnot exist
160 if(newTokenValue == null) {
161 continue;
162
163 }
164 else {
165 // feature value found so we need to replace it
166 // find the start and end offsets for this token
167 long startOffset = currentToken.getStartNode().getOffset()
168 .longValue();
169 long endOffset = currentToken.getEndNode().getOffset()
170 .longValue();
171
172 // replacement code start
173 long actualLength = endOffset - startOffset;
174 // let us find the difference between the lengths of the
175 // actual string and the newTokenValue
176 long lengthDifference = actualLength - newTokenValue.length();
177
178 // replacement code end
179
180 // so lets find out the new startOffset and endOffset
181 long newStartOffset = startOffset - totalDeductedSpaces;
182 long newEndOffset = newStartOffset + newTokenValue.length();
183 totalDeductedSpaces += lengthDifference;
184
185 // and make the entry for this
186 NodePosition newNode = new NodePosition(startOffset, endOffset,
187 newStartOffset, newEndOffset, totalDeductedSpaces);
188 changedNodes.add(newNode);
189
190 // and finally replace the actual string in the document
191 // with the new document
192 newdocString = newdocString.replace((int)newStartOffset,
193 (int)newStartOffset + (int)actualLength, // replacement code
194 newTokenValue);
195 break inner;
196 }
197 }
198 }
199 }
200 }
201
202 fireStatusChanged("New Document to be processed with Gazetteer...");
203 try {
204 FeatureMap params = Factory.newFeatureMap();
205 params.put("stringContent", newdocString.toString());
206 if(document instanceof DocumentImpl) {
207 params.put("encoding", ((DocumentImpl)document).getEncoding());
208 params.put("markupAware", ((DocumentImpl)document).getMarkupAware());
209 }
210
211 FeatureMap features = Factory.newFeatureMap();
212 // Gate.setHiddenAttribute(features, true);
213 tempDoc = (Document)Factory.createResource("gate.corpora.DocumentImpl",
214 params, features);
215 }
216 catch(ResourceInstantiationException rie) {
217 throw new ExecutionException("Temporary document cannot be created");
218 }
219
220 // lets create the gazetteer based on the provided gazetteer name
221 FeatureMap params = Factory.newFeatureMap();
222 gazetteerInst.setDocument(tempDoc);
223 gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
224
225 fireStatusChanged("Executing Gazetteer...");
226 try {
227 gazetteerInst.execute();
228 }
229 finally {
230 gazetteerInst.setDocument(null);
231 }
232
233 // now the tempDoc has been looked up, we need to shift the tokens
234 // from
235 // this temp document to the original document
236 fireStatusChanged("Transfering new tags to the original one...");
237 Iterator lookupIter = getSortedAnnotationIterator(tempDoc, outputAnnotationSetName);
238 AnnotationSet original = (outputAnnotationSetName == null) ? document
239 .getAnnotations() : document
240 .getAnnotations(outputAnnotationSetName);
241
242 int positionOfI = 0;
243 while(lookupIter != null && lookupIter.hasNext()) {
244 Annotation currentLookup = (Annotation)(lookupIter.next());
245 long startOffset = currentLookup.getStartNode().getOffset().longValue();
246 long endOffset = currentLookup.getEndNode().getOffset().longValue();
247
248 // if there was any change node before the startOffset
249
250 NodePosition toUse = null;
251 int i = positionOfI;
252 for(; i < changedNodes.size(); i++) {
253 NodePosition np = (NodePosition)changedNodes.get(i);
254
255 // continue until we find a node whose new end node has a value
256 // greater than or equal to the current lookup
257 if(np.getNewStartNode() < startOffset) {
258 positionOfI = i;
259 toUse = np;
260 continue;
261 } else {
262 break;
263 }
264 }
265
266 long spacesToAddToSO = toUse != null ? toUse.getDeductedSpaces() : 0;
267
268 toUse = null;
269 for(; i < changedNodes.size(); i++) {
270 NodePosition np = (NodePosition)changedNodes.get(i);
271
272 // continue until we find a node whose new end node has a value
273 // less tgreater than or equal to the current lookup
274 if(np.getNewStartNode() <= endOffset) {
275 toUse = np;
276 continue;
277 } else {
278 break;
279 }
280 }
281
282 long spacesToAddToEO = toUse != null ? toUse.getDeductedSpaces() : spacesToAddToSO;
283
284 try {
285 original.add(new Long(startOffset + spacesToAddToSO), new Long(
286 endOffset + spacesToAddToEO), currentLookup.getType(),
287 currentLookup.getFeatures());
288 }
289 catch(InvalidOffsetException ioe) {
290 throw new ExecutionException(ioe);
291 }
292
293 }
294
295 // now remove the newDoc
296 Factory.deleteResource(tempDoc);
297 fireProcessFinished();
298 }
299
300 /**
301 * Sets the document to work on
302 *
303 * @param doc
304 */
305 public void setDocument(gate.Document doc) {
306 this.document = doc;
307 }
308
309 /**
310 * Returns the document set up by user to work on
311 *
312 * @return a {@link Document}
313 */
314 public gate.Document getDocument() {
315 return this.document;
316 }
317
318 /**
319 * sets the outputAnnotationSetName
320 *
321 * @param annName
322 */
323 public void setOutputAnnotationSetName(String annName) {
324 this.outputAnnotationSetName = annName;
325 }
326
327 /**
328 * Returns the outputAnnotationSetName
329 *
330 * @return a {@link String} value.
331 */
332 public String getOutputAnnotationSetName() {
333 return this.outputAnnotationSetName;
334 }
335
336 /**
337 * sets the inputAnnotationSetName
338 *
339 * @param annName
340 */
341 public void setInputAnnotationSetName(String annName) {
342 this.inputAnnotationSetName = annName;
343 }
344
345 /**
346 * Returns the inputAnnotationSetName
347 *
348 * @return a {@link String} value.
349 */
350 public String getInputAnnotationSetName() {
351 return this.inputAnnotationSetName;
352 }
353
354 /**
355 * Feature names for example: Token.string, Token.root etc... Values
356 * of these features should be used to replace the actual string of
357 * these features. This method allows a user to set the name of such
358 * features
359 *
360 * @param inputs
361 */
362 public void setInputFeatureNames(java.util.List inputs) {
363 this.inputFeatureNames = inputs;
364 }
365
366 /**
367 * Returns the feature names that are provided by the user to use
368 * their values to replace their actual strings in the document
369 *
370 * @return a {@link List} value.
371 */
372 public java.util.List getInputFeatureNames() {
373 return this.inputFeatureNames;
374 }
375
376 public Gazetteer getGazetteerInst() {
377 return this.gazetteerInst;
378 }
379
380 public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
381 this.gazetteerInst = gazetteerInst;
382 }
383
384 /**
385 * This method takes the document and the annotationSetName and then
386 * creates a interator for the annotations available in the document
387 * under the provided annotationSetName
388 *
389 * @param doc
390 * @param annotationSetName
391 * @return an {@link Iterator}
392 */
393 public Iterator getSortedAnnotationIterator(gate.Document doc, String annotationSetName) {
394 AnnotationSet inputAs = (annotationSetName == null)
395 ? doc.getAnnotations()
396 : doc.getAnnotations(annotationSetName);
397 AnnotationSet tempSet = inputAs.get();
398 if(tempSet == null) return null;
399
400 List tokens = new ArrayList(inputAs.get());
401
402 if(tokens == null) return null;
403
404 Comparator offsetComparator = new OffsetComparator();
405 Collections.sort(tokens, offsetComparator);
406 Iterator tokenIter = tokens.listIterator();
407 return tokenIter;
408 }
409
410 // Gazetteer Runtime parameters
411 private gate.Document document;
412
413 private java.lang.String outputAnnotationSetName;
414
415 private java.lang.String inputAnnotationSetName;
416
417 // Flexible Gazetteer parameter
418 private Gazetteer gazetteerInst;
419
420 private java.util.List inputFeatureNames;
421
422 // parameters required within the program
423 private ArrayList changedNodes;
424 }
|