001 package gate.creole.annic.apache.lucene.document;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.Reader;
020 import java.util.Date;
021 import gate.creole.annic.apache.lucene.index.IndexReader; // for javadoc
022 import gate.creole.annic.apache.lucene.search.Similarity; // for javadoc
023 import gate.creole.annic.apache.lucene.search.Hits; // for javadoc
024
025 /**
026 A field is a section of a Document. Each field has two parts, a name and a
027 value. Values may be free text, provided as a String or as a Reader, or they
028 may be atomic keywords, which are not further processed. Such keywords may
029 be used to represent dates, urls, etc. Fields are optionally stored in the
030 index, so that they may be returned with hits on the document.
031 */
032
033 public final class Field implements java.io.Serializable {
034 private String name = "body";
035 private String stringValue = null;
036 private boolean storeTermVector = false;
037 private Reader readerValue = null;
038 private boolean isStored = false;
039 private boolean isIndexed = true;
040 private boolean isTokenized = true;
041
042 private float boost = 1.0f;
043
044 /** Sets the boost factor hits on this field. This value will be
045 * multiplied into the score of all hits on this this field of this
046 * document.
047 *
048 * <p>The boost is multiplied by {@link Document#getBoost()} of the document
049 * containing this field. If a document has multiple fields with the same
050 * name, all such values are multiplied together. This product is then
051 * multipled by the value {@link Similarity#lengthNorm(String,int)}, and
052 * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
053 * index. One should attempt to ensure that this product does not overflow
054 * the range of that encoding.
055 *
056 * @see Document#setBoost(float)
057 * @see Similarity#lengthNorm(String, int)
058 * @see Similarity#encodeNorm(float)
059 */
060 public void setBoost(float boost) {
061 this.boost = boost;
062 }
063
064 /** Returns the boost factor for hits on any field of this document.
065 *
066 * <p>The default value is 1.0.
067 *
068 * <p>Note: this value is not stored directly with the document in the index.
069 * Documents returned from {@link IndexReader#document(int)} and {@link
070 * Hits#doc(int)} may thus not have the same value present as when this field
071 * was indexed.
072 *
073 * @see #setBoost(float)
074 */
075 public float getBoost() {
076 return boost;
077 }
078
079 /** Constructs a String-valued Field that is not tokenized, but is indexed
080 and stored. Useful for non-text fields, e.g. date or url.
081 */
082 public static final Field Keyword(String name, String value) {
083 return new Field(name, value, true, true, false);
084 }
085
086 /** Constructs a String-valued Field that is not tokenized nor indexed,
087 but is stored in the index, for return with hits. */
088 public static final Field UnIndexed(String name, String value) {
089 return new Field(name, value, true, false, false);
090 }
091
092 /** Constructs a String-valued Field that is tokenized and indexed,
093 and is stored in the index, for return with hits. Useful for short text
094 fields, like "title" or "subject". Term vector will not be stored for this field. */
095 public static final Field Text(String name, String value) {
096 return Text(name, value, false);
097 }
098
099 /** Constructs a Date-valued Field that is not tokenized and is indexed,
100 and stored in the index, for return with hits. */
101 public static final Field Keyword(String name, Date value) {
102 return new Field(name, DateField.dateToString(value), true, true, false);
103 }
104
105 /** Constructs a String-valued Field that is tokenized and indexed,
106 and is stored in the index, for return with hits. Useful for short text
107 fields, like "title" or "subject". */
108 public static final Field Text(String name, String value, boolean storeTermVector) {
109 return new Field(name, value, true, true, true, storeTermVector);
110 }
111
112 /** Constructs a String-valued Field that is tokenized and indexed,
113 but that is not stored in the index. Term vector will not be stored for this field. */
114 public static final Field UnStored(String name, String value) {
115 return UnStored(name, value, false);
116 }
117
118 /** Constructs a String-valued Field that is tokenized and indexed,
119 but that is not stored in the index. */
120 public static final Field UnStored(String name, String value, boolean storeTermVector) {
121 return new Field(name, value, false, true, true, storeTermVector);
122 }
123
124 /** Constructs a Reader-valued Field that is tokenized and indexed, but is
125 not stored in the index verbatim. Useful for longer text fields, like
126 "body". Term vector will not be stored for this field. */
127 public static final Field Text(String name, Reader value) {
128 return Text(name, value, false);
129 }
130
131 /** Constructs a Reader-valued Field that is tokenized and indexed, but is
132 not stored in the index verbatim. Useful for longer text fields, like
133 "body". */
134 public static final Field Text(String name, Reader value, boolean storeTermVector) {
135 Field f = new Field(name, value);
136 f.storeTermVector = storeTermVector;
137 return f;
138 }
139
140 /** The name of the field (e.g., "date", "subject", "title", or "body")
141 as an interned string. */
142 public String name() { return name; }
143
144 /** The value of the field as a String, or null. If null, the Reader value
145 is used. Exactly one of stringValue() and readerValue() must be set. */
146 public String stringValue() { return stringValue; }
147 /** The value of the field as a Reader, or null. If null, the String value
148 is used. Exactly one of stringValue() and readerValue() must be set. */
149 public Reader readerValue() { return readerValue; }
150
151
152 /** Create a field by specifying all parameters except for <code>storeTermVector</code>,
153 * which is set to <code>false</code>.
154 */
155 public Field(String name, String string,
156 boolean store, boolean index, boolean token) {
157 this(name, string, store, index, token, false);
158 }
159
160 /**
161 *
162 * @param name The name of the field
163 * @param string The string to process
164 * @param store true if the field should store the string
165 * @param index true if the field should be indexed
166 * @param token true if the field should be tokenized
167 * @param storeTermVector true if we should store the Term Vector info
168 */
169 public Field(String name, String string,
170 boolean store, boolean index, boolean token, boolean storeTermVector) {
171 if (name == null)
172 throw new IllegalArgumentException("name cannot be null");
173 if (string == null)
174 throw new IllegalArgumentException("value cannot be null");
175 if (!index && storeTermVector)
176 throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed.");
177
178 this.name = name.intern(); // field names are interned
179 this.stringValue = string;
180 this.isStored = store;
181 this.isIndexed = index;
182 this.isTokenized = token;
183 this.storeTermVector = storeTermVector;
184 }
185
186 Field(String name, Reader reader) {
187 if (name == null)
188 throw new IllegalArgumentException("name cannot be null");
189 if (reader == null)
190 throw new IllegalArgumentException("value cannot be null");
191
192 this.name = name.intern(); // field names are interned
193 this.readerValue = reader;
194 }
195
196 /** True iff the value of the field is to be stored in the index for return
197 with search hits. It is an error for this to be true if a field is
198 Reader-valued. */
199 public final boolean isStored() { return isStored; }
200
201 /** True iff the value of the field is to be indexed, so that it may be
202 searched on. */
203 public final boolean isIndexed() { return isIndexed; }
204
205 /** True iff the value of the field should be tokenized as text prior to
206 indexing. Un-tokenized fields are indexed as a single word and may not be
207 Reader-valued. */
208 public final boolean isTokenized() { return isTokenized; }
209
210 /** True iff the term or terms used to index this field are stored as a term
211 * vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
212 * These methods do not provide access to the original content of the field,
213 * only to terms used to index it. If the original content must be
214 * preserved, use the <code>stored</code> attribute instead.
215 *
216 * @see IndexReader#getTermFreqVector(int, String)
217 */
218 public final boolean isTermVectorStored() { return storeTermVector; }
219
220 /** Prints a Field for human consumption. */
221 public final String toString() {
222 if (isStored && isIndexed && !isTokenized)
223 return "Keyword<" + name + ":" + stringValue + ">";
224 else if (isStored && !isIndexed && !isTokenized)
225 return "Unindexed<" + name + ":" + stringValue + ">";
226 else if (isStored && isIndexed && isTokenized && stringValue!=null)
227 return "Text<" + name + ":" + stringValue + ">";
228 else if (!isStored && isIndexed && isTokenized && readerValue!=null)
229 return "Text<" + name + ":" + readerValue + ">";
230 else if (!isStored && isIndexed && isTokenized)
231 {
232 return "UnStored<" + name + ">";
233 }
234 else
235 {
236 return super.toString();
237 }
238 }
239
240 }
|