001 package gate.creole.annic.apache.lucene.analysis;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.util.*;
020
/** A Token is an occurrence of a term from the text of a field. It consists of
022 a term's text, the start and end offset of the term in the text of the field,
023 and a type string.
024
025 The start and end offsets permit applications to re-associate a token with
026 its source text, e.g., to display highlighted query terms in a document
027 browser, or to show matching text fragments in a KWIC (KeyWord In Context)
028 display, etc.
029
030 The type is an interned string, assigned by a lexical analyzer
031 (a.k.a. tokenizer), naming the lexical or syntactic class that the token
032 belongs to. For example an end of sentence marker token might be implemented
033 with type "eos". The default token type is "word". */
034
035 import java.io.Serializable;
036
/**
 * A single occurrence of a term within the text of a field.
 *
 * <p>A token bundles the term's text, the start and end offsets of the term in
 * the source text, and a lexical type string (which is {@code "word"} unless a
 * type is supplied at construction).  It additionally carries a position
 * increment (one by default) and a separately settable position value.
 */
public final class Token implements Serializable {
  // Package-private state: declared without an access modifier, so it is
  // visible to other classes in this package.
  String termText;        // the text of the term
  int startOffset;        // where the term starts in the source text
  int endOffset;          // where the term ends in the source text
  String type = "word";   // lexical type; stays "word" unless overridden
  int position;           // value stored by setPosition(), read by getPosition()

  private int positionIncrement = 1;

  /**
   * Creates a Token from its term text and its start and end offsets.
   * The type is left at its default, "word".
   *
   * @param text  the term text
   * @param start the start offset in the source text
   * @param end   the end offset in the source text
   */
  public Token(String text, int start, int end) {
    this.termText = text;
    this.startOffset = start;
    this.endOffset = end;
  }

  /**
   * Creates a Token from its term text, its start and end offsets, and an
   * explicit type.
   *
   * @param text  the term text
   * @param start the start offset in the source text
   * @param end   the end offset in the source text
   * @param typ   the lexical type, stored as given
   */
  public Token(String text, int start, int end, String typ) {
    this(text, start, end);
    this.type = typ;
  }

  /**
   * Sets the position increment, i.e. the position of this token relative to
   * the previous Token in a {@link TokenStream}, as used in phrase searching.
   *
   * <p>The default value is one.
   *
   * <p>Typical uses:<ul>
   *
   * <li>Zero places several terms at the same position, which helps when,
   * e.g., a word has multiple stems: a phrase search containing either stem
   * will match.  Only the first stem keeps an increment of one; every further
   * stem gets zero.  Repeating a token with an increment of zero can also be
   * used to boost the scores of matches on that token.
   *
   * <li>A value greater than one inhibits exact phrase matches.  For example,
   * to stop phrases from matching across removed stop words, a stop word
   * filter can drop the stop words and set the increment of each following
   * non-stop word to the number of stop words removed before it.  Exact phrase
   * queries then match only when the terms occur with no intervening stop
   * words.
   *
   * </ul>
   *
   * @param positionIncrement the new increment; must be zero or greater
   * @throws IllegalArgumentException if {@code positionIncrement} is negative
   * @see gate.creole.annic.apache.lucene.index.TermPositions
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0) {
      // Reject before touching the field so a failed call leaves state as-is.
      throw new IllegalArgumentException(
          "Increment must be zero or greater: " + positionIncrement);
    }
    this.positionIncrement = positionIncrement;
  }

  /**
   * Stores the given position value on this Token.
   *
   * @param pos the value to store; it is not validated
   */
  public void setPosition(int pos) {
    position = pos;
  }

  /**
   * Returns the position value last stored with {@link #setPosition}, or zero
   * if none has been stored.
   */
  public int getPosition() {
    return this.position;
  }

  /**
   * Returns the position increment of this Token.
   *
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return this.positionIncrement;
  }

  /** Returns the Token's term text. */
  public final String termText() {
    return this.termText;
  }

  /**
   * Returns this Token's starting offset, the position of the first character
   * corresponding to this token in the source text.
   *
   * <p>Note that {@code endOffset() - startOffset()} need not equal
   * {@code termText.length()}, because the term text may have been altered by
   * a stemmer or some other filter.
   */
  public final int startOffset() {
    return this.startOffset;
  }

  /**
   * Returns this Token's ending offset, one greater than the position of the
   * last character corresponding to this token in the source text.
   */
  public final int endOffset() {
    return this.endOffset;
  }

  /** Returns this Token's lexical type.  Defaults to "word". */
  public final String type() {
    return this.type;
  }

}
|