001 /*
 * SimpleTokeniser.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Valentin Tablan, 2000
013 *
014 * $Id: SimpleTokeniser.java 12919 2010-08-03 10:31:37Z valyt $
015 */
016
017 package gate.creole.tokeniser;
018
019 import java.io.*;
020 import java.lang.reflect.Field;
021 import java.lang.reflect.Modifier;
022 import java.util.*;
023
024 import gate.*;
025 import gate.creole.*;
026 import gate.util.*;
027
028 //import EDU.auburn.VGJ.graph.ParseError;
029
030 /** Implementation of a Unicode rule based tokeniser.
 * The tokeniser gets its rules from an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built based
035 * on the set of rules.
036 * A rule has two sides, the left hand side (LHS)and the right hand side (RHS)
037 * that are separated by the ">" character. The LHS represents a
038 * regular expression that will be matched against the input while the RHS
039 * describes a Gate2 annotation in terms of annotation type and attribute-value
040 * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
044 * <ul>
045 * <li>UNASSIGNED
046 * <li>UPPERCASE_LETTER
047 * <li>LOWERCASE_LETTER
048 * <li>TITLECASE_LETTER
049 * <li>MODIFIER_LETTER
050 * <li>OTHER_LETTER
051 * <li>NON_SPACING_MARK
052 * <li>ENCLOSING_MARK
053 * <li>COMBINING_SPACING_MARK
054 * <li>DECIMAL_DIGIT_NUMBER
055 * <li>LETTER_NUMBER
056 * <li>OTHER_NUMBER
057 * <li>SPACE_SEPARATOR
058 * <li>LINE_SEPARATOR
059 * <li>PARAGRAPH_SEPARATOR
060 * <li>CONTROL
061 * <li>FORMAT
062 * <li>PRIVATE_USE
063 * <li>SURROGATE
064 * <li>DASH_PUNCTUATION
065 * <li>START_PUNCTUATION
066 * <li>END_PUNCTUATION
067 * <li>CONNECTOR_PUNCTUATION
068 * <li>OTHER_PUNCTUATION
069 * <li>MATH_SYMBOL
070 * <li>CURRENCY_SYMBOL
071 * <li>MODIFIER_SYMBOL
072 * <li>OTHER_SYMBOL
073 * </ul>
074 * The accepted operators for the LHS are "+", "*" and "|" having the usual
075 * interpretations of "1 to n occurences", "0 to n occurences" and
076 * "boolean OR".
077 * For instance this is a valid LHS:
078 * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
079 * <br>meaning an uppercase letter followed by one or more lowercase letters.
080 *
081 * The RHS describes an annotation that is to be created and inserted in the
082 * annotation set provided in case of a match. The new annotation will span the
083 * text that has been recognised. The RHS consists in the annotation type
084 * followed by pairs of attributes and associated values.
085 * E.g. for the LHS above a possible RHS can be:<br>
086 * Token;kind=upperInitial;<br>
087 * representing an annotation of type "Token" having one attribute
088 * named "kind" with the value "upperInitial"<br>
 * The entire rule will be:<br>
090 * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
091 * <br>
092 * The tokeniser ignores all the empty lines or the ones that start with # or
093 * //.
094 *
095 */
public class SimpleTokeniser extends AbstractLanguageAnalyser{

  /** The name of the parameter holding the document to be tokenised. */
  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  /** The name of the parameter holding the output annotation set name. */
  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  /** The name of the parameter holding the URL of the rules file. */
  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  /** The name of the parameter holding the character encoding used to read
   * the rules file. */
  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /** Debug flag */
  private static final boolean DEBUG = false;

  /**
   * Creates a tokeniser. The actual set-up (reading the rules, building the
   * FSM) happens in {@link #init()}.
   */
  public SimpleTokeniser(){
  }
118
119 /**
120 * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building
121 * the finite state machine at the core of the tokeniser.
122 *
123 * @exception ResourceInstantiationException
124 */
125 public Resource init() throws ResourceInstantiationException{
126 Reader rulesReader;
127 try{
128 if(rulesURL != null){
129 rulesReader = new BomStrippingInputStreamReader(rulesURL.openStream(), encoding);
130 }else{
131 //no init data, Scream!
132 throw new ResourceInstantiationException(
133 "No URL provided for the rules!");
134 }
135 initialState = new FSMState(this);
136 BufferedReader bRulesReader = new BufferedReader(rulesReader);
137 String line = bRulesReader.readLine();
138 ///String toParse = "";
139 StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
140
141 while (line != null){
142 if(line.endsWith("\\")){
143 ///toParse += line.substring(0,line.length()-1);
144 toParse.append(line.substring(0,line.length()-1));
145 }else{
146 /*toParse += line;
147 parseRule(toParse);
148 toParse = "";
149 */
150 toParse.append(line);
151 parseRule(toParse.toString());
152 toParse.delete(0,toParse.length());
153 }
154 line = bRulesReader.readLine();
155 }
156 eliminateVoidTransitions();
157 }catch(java.io.IOException ioe){
158 throw new ResourceInstantiationException(ioe);
159 }catch(TokeniserException te){
160 throw new ResourceInstantiationException(te);
161 }
162 return this;
163 }
164
  /**
   * Prepares this Processing resource for a new run.
   * Clears the reference to the last processed document so it can be
   * garbage-collected.
   */
  public void reset(){
    document = null;
  }
171
172 /** Parses one input line containing a tokeniser rule.
173 * This will create the necessary FSMState objects and the links
174 * between them.
175 *
176 * @param line the string containing the rule
177 */
178 void parseRule(String line)throws TokeniserException{
179 //ignore comments
180 if(line.startsWith("#")) return;
181
182 if(line.startsWith("//")) return;
183
184 StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
185 FSMState newState = new FSMState(this);
186
187 initialState.put(null, newState);
188 FSMState finalState = parseLHS(newState, st, LHStoRHS);
189 String rhs = "";
190
191 if(st.hasMoreTokens()) rhs = st.nextToken("\f");
192
193 if(rhs.length() > 0)finalState.setRhs(rhs);
194 } // parseRule
195
  /** Parses a part or the entire LHS.
   * Builds a small non-deterministic FSM fragment recognising the parsed
   * (part of the) regular expression, starting at the given state.
   *
   * @param startState a FSMState object representing the initial state for
   * the small FSM that will recognise the (part of the) rule parsed by this
   * method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until the string that marks the end of the section to be
   * recognised. This method will first be called by {@link
   * #parseRule(String)} with "&gt;" in order to parse the entire
   * LHS. When necessary it will make itself another call to {@link #parseLHS
   * parseLHS} to parse a region of the LHS (e.g. a
   * "(",")" enclosed part).
   * @return the final state of the FSM fragment built for this section
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
  throws TokeniserException{

    FSMState currentState = startState;
    boolean orFound = false;
    //end states of the alternatives accumulated while parsing "|" chains
    List orList = new LinkedList();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..)
        //parenthesised sub-expression: parse it recursively up to ")"
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type"
        //quoted Unicode category name, e.g. "UPPERCASE_LETTER"
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      } else {// a type with no quotes
        String sType = token;
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {
        //"|": remember this branch's end state and start a new alternative
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        //join all accumulated alternatives into a common end state via
        //unrestricted (null) transitions
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {
        //"+" (one or more): back edge to allow repetition
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {
        //"*" (zero or more): both a skip edge and a back edge
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS
300
301 /** Parses from the given string tokeniser until it finds a specific
302 * delimiter.
303 * One use for this method is to read everything until the first quote.
304 *
305 * @param st a {@link java.util.StringTokenizer StringTokenizer} that
306 * provides the input
307 * @param until a String representing the end delimiter.
308 */
309 String parseQuotedString(StringTokenizer st, String until)
310 throws TokeniserException {
311
312 String token;
313
314 if(st.hasMoreElements()) token = st.nextToken();
315 else return null;
316
317 ///String type = "";
318 StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
319
320 while(!token.equals(until)){
321 //type += token;
322 type.append(token);
323 if(st.hasMoreElements())token = st.nextToken();
324 else throw new InvalidRuleException("Tokeniser rule ended too soon!");
325 }
326 return type.toString();
327 } // parseQuotedString
328
329 /** Skips the ignorable tokens from the input returning the first significant
330 * token.
331 * The ignorable tokens are defined by {@link #ignoreTokens a set}
332 */
333 protected static String skipIgnoreTokens(StringTokenizer st){
334 Iterator ignorables;
335 boolean ignorableFound = false;
336 String currentToken;
337
338 while(true){
339 if(st.hasMoreTokens()){
340 currentToken = st.nextToken();
341 ignorables = ignoreTokens.iterator();
342 ignorableFound = false;
343
344 while(!ignorableFound && ignorables.hasNext()){
345 if(currentToken.equals((String)ignorables.next()))
346 ignorableFound = true;
347 }
348
349 if(!ignorableFound) return currentToken;
350 } else return null;
351 }
352 }//skipIgnoreTokens
353
354 /* Computes the lambda-closure (aka epsilon closure) of the given set of
355 * states, that is the set of states that are accessible from any of the
356 * states in the given set using only unrestricted transitions.
357 * @return a set containing all the states accessible from this state via
358 * transitions that bear no restrictions.
359 */
360 /**
361 * Converts the finite state machine to a deterministic one.
362 *
363 * @param s
364 */
365 private AbstractSet lambdaClosure(Set s){
366
367 //the stack/queue used by the algorithm
368 LinkedList list = new LinkedList(s);
369
370 //the set to be returned
371 AbstractSet lambdaClosure = new HashSet(s);
372
373 FSMState top;
374 FSMState currentState;
375 Set nextStates;
376 Iterator statesIter;
377
378 while(!list.isEmpty()) {
379 top = (FSMState)list.removeFirst();
380 nextStates = top.nextSet(null);
381
382 if(null != nextStates){
383 statesIter = nextStates.iterator();
384
385 while(statesIter.hasNext()) {
386 currentState = (FSMState)statesIter.next();
387 if(!lambdaClosure.contains(currentState)){
388 lambdaClosure.add(currentState);
389 list.addFirst(currentState);
390 }//if(!lambdaClosure.contains(currentState))
391 }//while(statesIter.hasNext())
392
393 }//if(null != nextStates)
394 }
395 return lambdaClosure;
396 } // lambdaClosure
397
  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions (classic subset
   * construction: each deterministic state corresponds to a lambda-closed
   * set of non-deterministic states).
   */
  void eliminateVoidTransitions() throws TokeniserException {

    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    Set sdStates = new HashSet();
    //deterministic states (as sets of FSMStates) not yet processed
    LinkedList unmarkedDStates = new LinkedList();
    DFSMState dCurrentState = new DFSMState(this);
    Set sdCurrentState = new HashSet();

    //the deterministic initial state is the closure of the ND initial state
    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    //find out if the new state is a final one
    Iterator innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set rhsClashSet = new HashSet();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = (FSMState)innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        //if several member states are final, the last RHS seen wins
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " + rhsClashSet +
                  "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs)dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set nextSet;

    while(!unmarkedDStates.isEmpty()){
      sdCurrentState = (Set)unmarkedDStates.removeFirst();
      //try every possible character-category id out of this state set
      for(int type = 0; type < maxTypeId; type++){
        //union of all states reachable on this type from any member state
        nextSet = new HashSet();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = (FSMState)innerStatesIter.next();
          Set tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }//while(innerStatesIter.hasNext())

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = (DFSMState)newStates.get(nextSet);

          if(dCurrentState == null){

            //we have a new DFSMState
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            //check to see whether the new state is a final one
            innerStatesIter = nextSet.iterator();
            newRhs =false;

            while(innerStatesIter.hasNext()){
              currentInnerState = (FSMState)innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                //again: last final member's RHS wins on a clash
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " + rhsClashSet +
                          "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs)dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          //link the source deterministic state to the target on this type
          ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState);
        } // if(!nextSet.isEmpty())

      } // for(int type = 0; type < maxTypeId; type++)

    } // while(!unmarkedDStates.isEmpty())

  } // eliminateVoidTransitions
499
500 /** Returns a string representation of the non-deterministic FSM graph using
501 * GML (Graph modelling language).
502 */
503 public String getFSMgml(){
504 String res = "graph[ \ndirected 1\n";
505 ///String nodes = "", edges = "";
506 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
507 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
508
509 Iterator fsmStatesIter = fsmStates.iterator();
510 while (fsmStatesIter.hasNext()){
511 FSMState currentState = (FSMState)fsmStatesIter.next();
512 int stateIndex = currentState.getIndex();
513 /*nodes += "node[ id " + stateIndex +
514 " label \"" + stateIndex;
515 */
516 nodes.append("node[ id ");
517 nodes.append(stateIndex);
518 nodes.append(" label \"");
519 nodes.append(stateIndex);
520
521 if(currentState.isFinal()){
522 ///nodes += ",F\\n" + currentState.getRhs();
523 nodes.append(",F\\n" + currentState.getRhs());
524 }
525 ///nodes += "\" ]\n";
526 nodes.append("\" ]\n");
527 ///edges += currentState.getEdgesGML();
528 edges.append(currentState.getEdgesGML());
529 }
530 res += nodes.toString() + edges.toString() + "]\n";
531 return res;
532 } // getFSMgml
533
534 /** Returns a string representation of the deterministic FSM graph using
535 * GML.
536 */
537 public String getDFSMgml() {
538 String res = "graph[ \ndirected 1\n";
539 ///String nodes = "", edges = "";
540 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
541 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
542
543 Iterator dfsmStatesIter = dfsmStates.iterator();
544 while (dfsmStatesIter.hasNext()) {
545 DFSMState currentState = (DFSMState)dfsmStatesIter.next();
546 int stateIndex = currentState.getIndex();
547 /* nodes += "node[ id " + stateIndex +
548 " label \"" + stateIndex;
549 */
550 nodes.append("node[ id ");
551 nodes.append(stateIndex);
552 nodes.append(" label \"");
553 nodes.append(stateIndex);
554
555 if(currentState.isFinal()){
556 /// nodes += ",F\\n" + currentState.getRhs();
557 nodes.append(",F\\n" + currentState.getRhs());
558 }
559 /// nodes += "\" ]\n";
560 nodes.append("\" ]\n");
561 /// edges += currentState.getEdgesGML();
562 edges.append(currentState.getEdgesGML());
563 }
564 res += nodes.toString() + edges.toString() + "]\n";
565 return res;
566 } // getDFSMgml
567
  //no doc required: javadoc will copy it from the interface
  /** Returns the features of this resource. */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Sets the features of this resource. */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
578
  /**
   * The method that does the actual tokenisation.
   * Walks the document content one character at a time through the
   * deterministic FSM; whenever the machine blocks, the longest match seen
   * so far is emitted as an annotation and matching restarts right after it.
   * Characters matching no rule at all are emitted as single-character
   * "DEFAULT_TOKEN" annotations.
   *
   * @throws ExecutionException if no document has been set
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to tokenise!"
      );
    }

    //empty/null set name means the default annotation set
    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged(
        "Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    char currentChar;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    //the index of the last character of the last token recognised
    int lastMatch = -1;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while(charIdx < length){
      currentChar = content.charAt(charIdx);
//      Out.println(
//      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
      //map the character's Unicode category to the tokeniser's internal id
      nextState = graphPosition.next(((Integer)typeIds.get(
                  new Integer(Character.getType(currentChar)))).intValue());

      if( null != nextState ) {
        //the FSM can advance; remember the position if this state is final
        graphPosition = nextState;
        if(graphPosition.isFinal()) {
          lastMatch = charIdx;
          lastMatchingState = graphPosition;
        }
        charIdx ++;
      } else {//we have a match!
        newTokenFm = Factory.newFeatureMap();

        if (null == lastMatchingState) {
          //no rule matched anything: emit one character as a default token
          tokenString = content.substring(tokenStart, tokenStart +1);
          newTokenFm.put("type","UNKNOWN");
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(tokenStart + 1),
                              "DEFAULT_TOKEN", newTokenFm);
          } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            ioe.printStackTrace(Err.getPrintWriter());
          }
          // Out.println("Default token: " + tokenStart +
          //             "->" + tokenStart + " :" + tokenString + ";");
          charIdx = tokenStart + 1;
        } else {
          //emit the longest match found, with the features from its rule RHS
          tokenString = content.substring(tokenStart, lastMatch + 1);
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          //index 0 holds the annotation type; the rest are attribute pairs
          for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                           lastMatchingState.getTokenDesc()[i][1]);
            //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
            //            lastMatchingState.getTokenDesc()[i][1]);
          }


          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(lastMatch + 1),
                              lastMatchingState.getTokenDesc()[0][0], newTokenFm);
          } catch(InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());
          }

          // Out.println(lastMatchingState.getTokenDesc()[0][0] +
          //             ": " + tokenStart + "->" + lastMatch +
          //             " :" + tokenString + ";");
          //backtrack: resume scanning right after the match
          charIdx = lastMatch + 1;
        }

        //restart matching from the initial state
        lastMatchingState = null;
        graphPosition = dInitialState;
        tokenStart = charIdx;
      }

      //report progress (and honour interruption) every 256 characters
      if((charIdx - oldCharIdx > 256)){
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException();
      }

    } // while(charIdx < length)

    //flush the pending match at end of input, if any
    if (null != lastMatchingState) {
      tokenString = content.substring(tokenStart, lastMatch + 1);
      newTokenFm = Factory.newFeatureMap();
      newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
      newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                     Integer.toString(tokenString.length()));

      for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
        newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                       lastMatchingState.getTokenDesc()[i][1]);
      }


      try {
        annotationSet.add(new Long(tokenStart),
                          new Long(lastMatch + 1),
                          lastMatchingState.getTokenDesc()[0][0], newTokenFm);
      } catch(InvalidOffsetException ioe) {
        //This REALLY shouldn't happen!
        throw new GateRuntimeException(ioe.toString());
      }

    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
  } // execute
723
  /**
   * Sets the value of the <code>rulesURL</code> property which holds an URL
   * to the file containing the rules for this tokeniser.
   * @param newRulesURL the URL to read the rules from
   */
  public void setRulesURL(java.net.URL newRulesURL) {
    rulesURL = newRulesURL;
  }
  /**
   * Gets the value of the <code>rulesURL</code> property which holds an
   * URL to the file containing the rules for this tokeniser.
   */
  public java.net.URL getRulesURL() {
    return rulesURL;
  }
  /** Sets the name of the annotation set where tokens will be added. */
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }
  /** Gets the name of the annotation set where tokens will be added. */
  public String getAnnotationSetName() {
    return annotationSetName;
  }
  /** Sets the name of the resource containing the tokeniser rules. */
  public void setRulesResourceName(String newRulesResourceName) {
    rulesResourceName = newRulesResourceName;
  }
  /** Gets the name of the resource containing the tokeniser rules. */
  public String getRulesResourceName() {
    return rulesResourceName;
  }
  /** Sets the character encoding used to read the rules file. */
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }
  /** Gets the character encoding used to read the rules file. */
  public String getEncoding() {
    return encoding;
  }
759
  /** The features of this resource. */
  protected FeatureMap features = null;

  /** The name of the annotation set where the new annotations will be added.
   */
  protected String annotationSetName;

  /** The initial state of the non-deterministic machine.
   */
  protected FSMState initialState;

  /** A set containing all the states of the non-deterministic machine.
   */
  protected Set fsmStates = new HashSet();

  /** The initial state of the deterministic machine.
   */
  protected DFSMState dInitialState;

  /** A set containing all the states of the deterministic machine.
   */
  protected Set dfsmStates = new HashSet();

  /** The separator from LHS to RHS.
   */
  static String LHStoRHS = ">";

  /** A set of strings representing tokens to be ignored (e.g. blanks).
   */
  static Set ignoreTokens;

  /** Maps from int (the static value on {@link java.lang.Character}) to int
   * (the internal value used by the tokeniser). The int values used by the
   * tokeniser are consecutive values, starting from 0 and going as high as
   * necessary.
   * They map all the public static int members of {@link java.lang.Character}.
   */
  public static Map typeIds;

  /** The maximum int value used internally as a type id.
   */
  public static int maxTypeId;

  /** Maps the internal type ids to the type names.
   */
  public static String[] typeMnemonics;

  /** Maps from type names to type internal ids.
   */
  public static Map stringTypeIds;

  /** The default location of the rules file, as a resource path. */
  static protected String defaultResourceName =
                         "creole/tokeniser/DefaultTokeniser.rules";

  /** The name of the resource holding the tokeniser rules. */
  private String rulesResourceName;

  /** URL to the file containing the rules for this tokeniser. */
  private java.net.URL rulesURL;

  /** The character encoding used to read the rules file. */
  private String encoding;

  /** Progress listeners; transient so they are not serialised. */
  private transient Vector progressListeners;

  //kalina: added this as method to minimise too many init() calls
  /** Maps sets of non-deterministic states to their deterministic state
   * during {@link #eliminateVoidTransitions()}. */
  protected transient Map newStates = new HashMap();
826
827
  /** The static initialiser will inspect the class {@link java.lang.Character}
   * using reflection to find all the public static members and will map them
   * to ids starting from 0.
   * After that it will build all the static data: {@link #typeIds}, {@link
   * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
   */
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      //java.lang.Character is always present; this can never happen
      throw new LuckyException("Could not find the java.lang.Character class!");
    }

    Collection staticFields = new LinkedList();
    // JDK 1.4 introduced directionality constants that have the same values as
    //character types; we need to skip those as well
    for(int i = 0; i< characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    typeIds = new HashMap();
    maxTypeId = staticFields.size() -1;
    typeMnemonics = new String[maxTypeId + 1];
    stringTypeIds = new HashMap();

    Iterator staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try {
      while(staticFieldsIter.hasNext()){
        currentField = (Field)staticFieldsIter.next();
        //the character-category constants on Character are of type byte;
        //other static fields (chars, ints, etc.) are skipped
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          typeIds.put(new Integer(currentField.getInt(null)),
                      new Integer(currentId));
          typeMnemonics[currentId] = fieldName;
          stringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      throw new LuckyException(e.toString());
    }

    //tokens that separate rule elements but carry no meaning themselves
    ignoreTokens = new HashSet();
    ignoreTokens.add(" ");
    ignoreTokens.add("\t");
    ignoreTokens.add("\f");
  }
882
} // class SimpleTokeniser
|