/*
 * DefaultGazetteer.java
 *
 * Copyright (c) 1998-2005, The University of Sheffield.
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 2, June 1991.
 *
 * A copy of this licence is included in the distribution in the file
 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 * Valentin Tablan, 03/07/2000
 * borislav popov 24/03/2002
 *
 * $Id: DefaultGazetteer.java 13374 2011-01-30 16:29:46Z johann_p $
 */
018 package gate.creole.gazetteer;
019
020 import java.util.*;
021
022 import gate.*;
023 import gate.creole.*;
024 import gate.util.*;
025
/** This component is responsible for performing list lookup. The
 * implementation is based on finite state machines.
 * The phrases to be recognised should be listed in a set of files, one for
 * each type of occurrence.
 * The gazetteer is built with the information from a file that contains the
 * set of lists (which are files as well) and the associated type for each
 * list.
 * The file defining the set of lists should have the following syntax:
 * each list definition should be written on its own line and should contain:
 * <ol>
 * <li>the file name (required) </li>
 * <li>the major type (required) </li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional) </li>
 * </ol>
 * The elements of each definition are separated by ":".
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code> <br>
 * Each list file named in the lists definition file is just a list containing
 * one entry per line.
 * When this gazetteer is run over some input text (a GATE document) it
 * generates annotations of type Lookup having the attributes specified in
 * the definition file.
 */
049 public class DefaultGazetteer extends AbstractGazetteer
050 implements CustomDuplication {
051
052 /** Debug flag
053 */
054 private static final boolean DEBUG = false;
055
056 public static final String
057 DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
058
059 public static final String
060 DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
061
062 public static final String
063 DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
064
065 public static final String
066 DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
067
068 public static final String
069 DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
070
071 public static final String
072 DEF_GAZ_LONGEST_MATCH_ONLY_PARAMETER_NAME = "longestMatchOnly";
073
074 public static final String
075 DEF_GAZ_FEATURE_SEPARATOR_PARAMETER_NAME = "gazetteerFeatureSeparator";
076
077 /** The separator used for gazetteer entry features */
078 protected String gazetteerFeatureSeparator;
079
080 /** a map of nodes vs gaz lists */
081 protected Map listsByNode;
082
083 /**
084 * Build a gazetteer using the default lists from the gate resources
085 */
086 public DefaultGazetteer(){
087 }
088
089 /** Does the actual loading and parsing of the lists. This method must be
090 * called before the gazetteer can be used
091 */
092 public Resource init()throws ResourceInstantiationException{
093 fsmStates = new HashSet();
094 initialState = new FSMState(this);
095 if(listsURL == null){
096 throw new ResourceInstantiationException (
097 "No URL provided for gazetteer creation!");
098 }
099 definition = new LinearDefinition();
100 definition.setSeparator(Strings.unescape(gazetteerFeatureSeparator));
101 definition.setURL(listsURL);
102 definition.load();
103 int linesCnt = definition.size();
104 listsByNode = definition.loadLists();
105 Iterator inodes = definition.iterator();
106
107 int nodeIdx = 0;
108 LinearNode node;
109 while (inodes.hasNext()) {
110 node = (LinearNode) inodes.next();
111 fireStatusChanged("Reading " + node.toString());
112 fireProgressChanged(++nodeIdx * 100 / linesCnt);
113 readList(node,true);
114 } // while iline
115 fireProcessFinished();
116 return this;
117 }
118
119
120 /** Reads one lists (one file) of phrases
121 *
122 * @param node the node
123 * @param add if <b>true</b> will add the phrases found in the list to the ones
124 * recognised by this gazetteer, if <b>false</b> the phrases found in the
125 * list will be removed from the list of phrases recognised by this
126 * gazetteer.
127 */
128 protected void readList(LinearNode node, boolean add)
129 throws ResourceInstantiationException{
130 String listName, majorType, minorType, languages;
131 if ( null == node ) {
132 throw new ResourceInstantiationException(" LinearNode node is null ");
133 }
134
135 listName = node.getList();
136 majorType = node.getMajorType();
137 minorType = node.getMinorType();
138 languages = node.getLanguage();
139 GazetteerList gazList = (GazetteerList)listsByNode.get(node);
140 if (null == gazList) {
141 throw new ResourceInstantiationException("gazetteer list not found by node");
142 }
143
144 Iterator iline = gazList.iterator();
145
146 // create default lookup for entries with no arbitrary features
147 Lookup defaultLookup = new Lookup(listName,majorType, minorType, languages);
148 defaultLookup.list = node.getList();
149 if ( null != mappingDefinition){
150 MappingNode mnode = mappingDefinition.getNodeByList(defaultLookup.list);
151 if (null!=mnode){
152 defaultLookup.oClass = mnode.getClassID();
153 defaultLookup.ontology = mnode.getOntologyID();
154 }
155 }//if mapping def
156
157 Lookup lookup;
158 String entry; // the actual gazetteer entry text
159 while(iline.hasNext()){
160 GazetteerNode gazNode = (GazetteerNode)iline.next();
161 entry = gazNode.getEntry();
162
163 Map features = gazNode.getFeatureMap();
164 if (features == null) {
165 lookup = defaultLookup;
166 } else {
167 // create a new Lookup object with features
168 lookup = new Lookup(listName, majorType, minorType, languages);
169 lookup.list = node.getList();
170 if(null != mappingDefinition) {
171 MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
172 if(null != mnode) {
173 lookup.oClass = mnode.getClassID();
174 lookup.ontology = mnode.getOntologyID();
175 }
176 }// if mapping def
177 lookup.features = features;
178 }
179
180 if(add)addLookup(entry, lookup);
181 else removeLookup(entry, lookup);
182 }
183 } // void readList(String listDesc)
184
185 /** Adds one phrase to the list of phrases recognised by this gazetteer
186 *
187 * @param text the phrase to be added
188 * @param lookup the description of the annotation to be added when this
189 * phrase is recognised
190 */
191 public void addLookup(String text, Lookup lookup) {
192 char currentChar;
193 FSMState currentState = initialState;
194 FSMState nextState;
195 Lookup oldLookup;
196 boolean isSpace;
197
198 for(int i = 0; i< text.length(); i++) {
199 currentChar = text.charAt(i);
200 isSpace = Character.isWhitespace(currentChar);
201 if(isSpace) currentChar = ' ';
202 else currentChar = (caseSensitive.booleanValue()) ?
203 currentChar :
204 Character.toUpperCase(currentChar) ;
205 nextState = currentState.next(currentChar);
206 if(nextState == null){
207 nextState = new FSMState(this);
208 currentState.put(currentChar, nextState);
209 if(isSpace) nextState.put(' ',nextState);
210 }
211 currentState = nextState;
212 } //for(int i = 0; i< text.length(); i++)
213
214 currentState.addLookup(lookup);
215 //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
216
217 } // addLookup
218
219 /** Removes one phrase to the list of phrases recognised by this gazetteer
220 *
221 * @param text the phrase to be removed
222 * @param lookup the description of the annotation associated to this phrase
223 */
224 public void removeLookup(String text, Lookup lookup) {
225 char currentChar;
226 FSMState currentState = initialState;
227 FSMState nextState;
228 Lookup oldLookup;
229
230 for(int i = 0; i< text.length(); i++) {
231 currentChar = text.charAt(i);
232 if(Character.isWhitespace(currentChar)) currentChar = ' ';
233 nextState = currentState.next(currentChar);
234 if(nextState == null) return;//nothing to remove
235 currentState = nextState;
236 } //for(int i = 0; i< text.length(); i++)
237 currentState.removeLookup(lookup);
238 } // removeLookup
239
240 /** Returns a string representation of the deterministic FSM graph using
241 * GML.
242 */
243 public String getFSMgml() {
244 String res = "graph[ \ndirected 1\n";
245 StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
246 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
247 Iterator fsmStatesIter = fsmStates.iterator();
248 while (fsmStatesIter.hasNext()){
249 FSMState currentState = (FSMState)fsmStatesIter.next();
250 int stateIndex = currentState.getIndex();
251 nodes.append("node[ id ");
252 nodes.append(stateIndex);
253 nodes.append(" label \"");
254 nodes.append(stateIndex);
255
256 if(currentState.isFinal()){
257 nodes.append(",F\\n");
258 nodes.append(currentState.getLookupSet());
259 }
260 nodes.append("\" ]\n");
261 edges.append(currentState.getEdgesGML());
262 }
263 res += nodes.toString() + edges.toString() + "]\n";
264 return res;
265 } // getFSMgml
266
267
268 /**
269 * Tests whether a character is internal to a word (i.e. if it's a letter or
270 * a combining mark (spacing or not)).
271 * @param ch the character to be tested
272 * @return a boolean value
273 */
274 public static boolean isWordInternal(char ch){
275 return Character.isLetter(ch) ||
276 Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
277 Character.getType(ch) == Character.NON_SPACING_MARK;
278 }
279
280 /**
281 * This method runs the gazetteer. It assumes that all the needed parameters
282 * are set. If they are not, an exception will be fired.
283 */
284 public void execute() throws ExecutionException{
285 interrupted = false;
286 AnnotationSet annotationSet;
287 //check the input
288 if(document == null) {
289 throw new ExecutionException(
290 "No document to process!"
291 );
292 }
293
294 if(annotationSetName == null ||
295 annotationSetName.equals("")) annotationSet = document.getAnnotations();
296 else annotationSet = document.getAnnotations(annotationSetName);
297
298 fireStatusChanged("Performing look-up in " + document.getName() + "...");
299 String content = document.getContent().toString();
300 int length = content.length();
301 char currentChar;
302 FSMState currentState = initialState;
303 FSMState nextState;
304 FSMState lastMatchingState = null;
305 int matchedRegionEnd = 0;
306 int matchedRegionStart = 0;
307 int charIdx = 0;
308 int oldCharIdx = 0;
309 FeatureMap fm;
310 Lookup currentLookup;
311
312 while(charIdx < length) {
313 currentChar = content.charAt(charIdx);
314 if(Character.isWhitespace(currentChar)) currentChar = ' ';
315 else currentChar = caseSensitive.booleanValue() ?
316 currentChar :
317 Character.toUpperCase(currentChar);
318 nextState = currentState.next(currentChar);
319 if(nextState == null) {
320 //the matching stopped
321 //if we had a successful match then act on it;
322 if(lastMatchingState != null){
323 createLookups(lastMatchingState, matchedRegionStart, matchedRegionEnd,
324 annotationSet);
325 lastMatchingState = null;
326 }
327 //reset the FSM
328 charIdx = matchedRegionStart + 1;
329 matchedRegionStart = charIdx;
330 currentState = initialState;
331 } else{//go on with the matching
332 currentState = nextState;
333 //if we have a successful state then store it
334 if(currentState.isFinal() &&
335 (
336 (!wholeWordsOnly.booleanValue())
337 ||
338 ((matchedRegionStart == 0 ||
339 !isWordInternal(content.charAt(matchedRegionStart - 1)))
340 &&
341 (charIdx + 1 >= content.length() ||
342 !isWordInternal(content.charAt(charIdx + 1)))
343 )
344 )
345 ){
346 //we have a new match
347 //if we had an existing match and we need to annotate prefixes, then
348 //apply it
349 if(!longestMatchOnly && lastMatchingState != null){
350 createLookups(lastMatchingState, matchedRegionStart,
351 matchedRegionEnd, annotationSet);
352 }
353 matchedRegionEnd = charIdx;
354 lastMatchingState = currentState;
355 }
356 charIdx ++;
357 if(charIdx == content.length()){
358 //we can't go on, use the last matching state and restart matching
359 //from the next char
360 if(lastMatchingState != null){
361 //let's add the new annotation(s)
362 createLookups(lastMatchingState, matchedRegionStart,
363 matchedRegionEnd, annotationSet);
364 lastMatchingState = null;
365 }
366 //reset the FSM
367 charIdx = matchedRegionStart + 1;
368 matchedRegionStart = charIdx;
369 currentState = initialState;
370 }
371 }
372 //fire the progress event
373 if(charIdx - oldCharIdx > 256) {
374 fireProgressChanged((100 * charIdx )/ length );
375 oldCharIdx = charIdx;
376 if(isInterrupted()) throw new ExecutionInterruptedException(
377 "The execution of the " + getName() +
378 " gazetteer has been abruptly interrupted!");
379 }
380 } // while(charIdx < length)
381 //we've finished. If we had a stored match, then apply it.
382 if(lastMatchingState != null) {
383 createLookups(lastMatchingState, matchedRegionStart,
384 matchedRegionEnd, annotationSet);
385 }
386 fireProcessFinished();
387 fireStatusChanged("Look-up complete!");
388 } // execute
389
390
391 /**
392 * Creates the Lookup annotations according to a gazetteer match.
393 * @param matchingState the final FSMState that was reached while matching.
394 * @param matchedRegionStart the start of the matched text region.
395 * @param matchedRegionEnd the end of the matched text region.
396 * @param annotationSet the annotation set where the new annotations should
397 * be added.
398 */
399 protected void createLookups(FSMState matchingState, long matchedRegionStart,
400 long matchedRegionEnd, AnnotationSet annotationSet){
401 Iterator lookupIter = matchingState.getLookupSet().iterator();
402 while(lookupIter.hasNext()) {
403 Lookup currentLookup = (Lookup)lookupIter.next();
404 FeatureMap fm = Factory.newFeatureMap();
405 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
406 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
407 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
408 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
409 }
410
411 if(null != currentLookup.minorType)
412 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
413 if(null != currentLookup.languages)
414 fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
415 if(null != currentLookup.features) {
416 fm.putAll(currentLookup.features);
417 }
418 try{
419 annotationSet.add(new Long(matchedRegionStart),
420 new Long(matchedRegionEnd + 1),
421 LOOKUP_ANNOTATION_TYPE,
422 fm);
423 } catch(InvalidOffsetException ioe) {
424 throw new GateRuntimeException(ioe.toString());
425 }
426 }//while(lookupIter.hasNext())
427 }
428
/** The initial state of the FSM that backs this gazetteer. Every traversal
 * of the FSM done in this class (adding, removing, looking up and matching)
 * starts from this state.
 */
protected FSMState initialState;

/** A set containing all the states of the FSM backing the gazetteer. It is
 * iterated when the FSM is dumped as GML.
 */
protected Set fsmStates;
436
437 /**lookup <br>
438 * @param singleItem a single string to be looked up by the gazetteer
439 * @return set of the Lookups associated with the parameter*/
440 public Set lookup(String singleItem) {
441 char currentChar;
442 Set set = new HashSet();
443 FSMState currentState = initialState;
444 FSMState nextState;
445
446 for(int i = 0; i< singleItem.length(); i++) {
447 currentChar = singleItem.charAt(i);
448 if(Character.isWhitespace(currentChar)) currentChar = ' ';
449 nextState = currentState.next(currentChar);
450 if(nextState == null) {
451 return set;
452 }
453 currentState = nextState;
454 } //for(int i = 0; i< text.length(); i++)
455 set = currentState.getLookupSet();
456 return set;
457 }
458
459 public boolean remove(String singleItem) {
460 char currentChar;
461 FSMState currentState = initialState;
462 FSMState nextState;
463 Lookup oldLookup;
464
465 for(int i = 0; i< singleItem.length(); i++) {
466 currentChar = singleItem.charAt(i);
467 if(Character.isWhitespace(currentChar)) currentChar = ' ';
468 nextState = currentState.next(currentChar);
469 if(nextState == null) {
470 return false;
471 }//nothing to remove
472 currentState = nextState;
473 } //for(int i = 0; i< text.length(); i++)
474 currentState.lookupSet = new HashSet();
475 return true;
476 }
477
478 public boolean add(String singleItem, Lookup lookup) {
479 addLookup(singleItem,lookup);
480 return true;
481 }
482
483 /**
484 * Use a {@link SharedDefaultGazetteer} to duplicate this gazetteer
485 * by sharing the internal FSM rather than re-loading the lists.
486 */
487 public Resource duplicate(Factory.DuplicationContext ctx)
488 throws ResourceInstantiationException {
489 return Factory.createResource(SharedDefaultGazetteer.class.getName(),
490 Utils.featureMap(
491 SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME,
492 this),
493 Factory.duplicate(this.getFeatures(), ctx),
494 this.getName());
495 }
496
497
498 public static interface Iter
499 {
500 public boolean hasNext();
501 public char next();
502 } // iter class
503
504 /**
505 * class implementing the map using binary search by char as key
506 * to retrieve the corresponding object.
507 */
508 public static class CharMap
509 {
510 char[] itemsKeys = null;
511 Object[] itemsObjs = null;
512
513 /**
514 * resize the containers by one, leaving empty element at position 'index'
515 */
516 void resize(int index)
517 {
518 int newsz = itemsKeys.length + 1;
519 char[] tempKeys = new char[newsz];
520 Object[] tempObjs = new Object[newsz];
521 System.arraycopy(itemsKeys, 0, tempKeys, 0, index);
522 System.arraycopy(itemsObjs, 0, tempObjs, 0, index);
523 System.arraycopy(itemsKeys, index, tempKeys, index + 1, newsz - index - 1);
524 System.arraycopy(itemsObjs, index, tempObjs, index + 1, newsz - index - 1);
525
526 itemsKeys = tempKeys;
527 itemsObjs = tempObjs;
528 } // resize
529
530 /**
531 * get the object from the map using the char key
532 */
533 Object get(char key)
534 {
535 if (itemsKeys == null) return null;
536 int index = Arrays.binarySearch(itemsKeys, key);
537 if (index<0)
538 return null;
539 return itemsObjs[index];
540 }
541 /**
542 * put the object into the char map using the char as the key
543 */
544 Object put(char key, Object value)
545 {
546 if (itemsKeys == null)
547 {
548 itemsKeys = new char[1];
549 itemsKeys[0] = key;
550 itemsObjs = new Object[1];
551 itemsObjs[0] = value;
552 return value;
553 }// if first time
554 int index = Arrays.binarySearch(itemsKeys, key);
555 if (index<0)
556 {
557 index = ~index;
558 resize(index);
559 itemsKeys[index] = key;
560 itemsObjs[index] = value;
561 }
562 return itemsObjs[index];
563 } // put
564
565 }// class CharMap
566
567 /**
568 * @return the gazetteerFeatureSeparator
569 */
570 public String getGazetteerFeatureSeparator() {
571 return gazetteerFeatureSeparator;
572 }
573
574 /**
575 * @param gazetteerFeatureSeparator the gazetteerFeatureSeparator to set
576 */
577 public void setGazetteerFeatureSeparator(String gazetteerFeatureSeparator) {
578 this.gazetteerFeatureSeparator = gazetteerFeatureSeparator;
579 }
580
581 } // DefaultGazetteer
|