001 package gate.creole.annic.apache.lucene.search;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020
021 import java.util.Collection;
022 import java.util.Iterator;
023
024 import gate.creole.annic.apache.lucene.index.Term;
025
026 import gate.creole.annic.apache.lucene.index.IndexReader; // for javadoc
027 import gate.creole.annic.apache.lucene.index.IndexWriter; // for javadoc
028 import gate.creole.annic.apache.lucene.document.Field; // for javadoc
029
030
031 /** Expert: Scoring API.
032 * <p>Subclasses implement search scoring.
033 *
034 * <p>The score of query <code>q</code> for document <code>d</code> is defined
035 * in terms of these methods as follows:
036 *
037 * <table cellpadding="0" cellspacing="0" border="0">
038 * <tr>
039 * <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
040 * <td valign="middle" align="center">
041 * <big><big><big><big><big>Σ</big></big></big></big></big></td>
042 * <td valign="middle"><small>
043 * {@link #tf(int) tf}(t in d) *
044 * {@link #idf(Term,Searcher) idf}(t) *
045 * {@link Field#getBoost getBoost}(t.field in d) *
046 * {@link #lengthNorm(String,int) lengthNorm}(t.field in d)
047 * </small></td>
048 * <td valign="middle" rowspan="2"> *
049 * {@link #coord(int,int) coord}(q,d) *
050 * {@link #queryNorm(float) queryNorm}(q)
051 * </td>
052 * </tr>
053 * <tr>
054 * <td valign="top" align="right">
055 * <small>t in q</small>
056 * </td>
057 * </tr>
058 * </table>
059 *
060 * @see #setDefault(Similarity)
061 * @see IndexWriter#setSimilarity(Similarity)
062 * @see Searcher#setSimilarity(Similarity)
063 */
064 public abstract class Similarity {
065 /** The Similarity implementation used by default. */
066 private static Similarity defaultImpl = new DefaultSimilarity();
067
068 /** Set the default Similarity implementation used by indexing and search
069 * code.
070 *
071 * @see Searcher#setSimilarity(Similarity)
072 * @see IndexWriter#setSimilarity(Similarity)
073 */
074 public static void setDefault(Similarity similarity) {
075 Similarity.defaultImpl = similarity;
076 }
077
078 /** Return the default Similarity implementation used by indexing and search
079 * code.
080 *
081 * <p>This is initially an instance of {@link DefaultSimilarity}.
082 *
083 * @see Searcher#setSimilarity(Similarity)
084 * @see IndexWriter#setSimilarity(Similarity)
085 */
086 public static Similarity getDefault() {
087 return Similarity.defaultImpl;
088 }
089
090 /** Cache of decoded bytes. */
091 private static final float[] NORM_TABLE = new float[256];
092
093 static {
094 for (int i = 0; i < 256; i++)
095 NORM_TABLE[i] = byteToFloat((byte)i);
096 }
097
098 /** Decodes a normalization factor stored in an index.
099 * @see #encodeNorm(float)
100 */
101 public static float decodeNorm(byte b) {
102 return NORM_TABLE[b & 0xFF];
103 }
104
105 /** Computes the normalization value for a field given the total number of
106 * terms contained in a field. These values, together with field boosts, are
107 * stored in an index and multipled into scores for hits on each field by the
108 * search code.
109 *
110 * <p>Matches in longer fields are less precise, so implemenations of this
111 * method usually return smaller values when <code>numTokens</code> is large,
112 * and larger values when <code>numTokens</code> is small.
113 *
114 * <p>That these values are computed under {@link
115 * IndexWriter#addDocument(Document)} and stored then using
116 * {#encodeNorm(float)}. Thus they have limited precision, and documents
117 * must be re-indexed if this method is altered.
118 *
119 * @param fieldName the name of the field
120 * @param numTokens the total number of tokens contained in fields named
121 * <i>fieldName</i> of <i>doc</i>.
122 * @return a normalization factor for hits on this field of this document
123 *
124 * @see Field#setBoost(float)
125 */
126 public abstract float lengthNorm(String fieldName, int numTokens);
127
128 /** Computes the normalization value for a query given the sum of the squared
129 * weights of each of the query terms. This value is then multipled into the
130 * weight of each query term.
131 *
132 * <p>This does not affect ranking, but rather just attempts to make scores
133 * from different queries comparable.
134 *
135 * @param sumOfSquaredWeights the sum of the squares of query term weights
136 * @return a normalization factor for query weights
137 */
138 public abstract float queryNorm(float sumOfSquaredWeights);
139
140 /** Encodes a normalization factor for storage in an index.
141 *
142 * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
143 * representing values from around 7x10^9 to 2x10^-9 with about one
144 * significant decimal digit of accuracy. Zero is also represented.
145 * Negative numbers are rounded up to zero. Values too large to represent
146 * are rounded down to the largest representable value. Positive values too
147 * small to represent are rounded up to the smallest positive representable
148 * value.
149 *
150 * @see Field#setBoost(float)
151 */
152 public static byte encodeNorm(float f) {
153 return floatToByte(f);
154 }
155
156 private static float byteToFloat(byte b) {
157 if (b == 0) // zero is a special case
158 return 0.0f;
159 int mantissa = b & 7;
160 int exponent = (b >> 3) & 31;
161 int bits = ((exponent+(63-15)) << 24) | (mantissa << 21);
162 return Float.intBitsToFloat(bits);
163 }
164
165 private static byte floatToByte(float f) {
166 if (f < 0.0f) // round negatives up to zero
167 f = 0.0f;
168
169 if (f == 0.0f) // zero is a special case
170 return 0;
171
172 int bits = Float.floatToIntBits(f); // parse float into parts
173 int mantissa = (bits & 0xffffff) >> 21;
174 int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
175
176 if (exponent > 31) { // overflow: use max value
177 exponent = 31;
178 mantissa = 7;
179 }
180
181 if (exponent < 0) { // underflow: use min value
182 exponent = 0;
183 mantissa = 1;
184 }
185
186 return (byte)((exponent << 3) | mantissa); // pack into a byte
187 }
188
189
190 /** Computes a score factor based on a term or phrase's frequency in a
191 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
192 * factor for each term in the query and these products are then summed to
193 * form the initial score for a document.
194 *
195 * <p>Terms and phrases repeated in a document indicate the topic of the
196 * document, so implementations of this method usually return larger values
197 * when <code>freq</code> is large, and smaller values when <code>freq</code>
198 * is small.
199 *
200 * <p>The default implementation calls {@link #tf(float)}.
201 *
202 * @param freq the frequency of a term within a document
203 * @return a score factor based on a term's within-document frequency
204 */
205 public float tf(int freq) {
206 return tf((float)freq);
207 }
208
209 /** Computes the amount of a sloppy phrase match, based on an edit distance.
210 * This value is summed for each sloppy phrase match in a document to form
211 * the frequency that is passed to {@link #tf(float)}.
212 *
213 * <p>A phrase match with a small edit distance to a document passage more
214 * closely matches the document, so implementations of this method usually
215 * return larger values when the edit distance is small and smaller values
216 * when it is large.
217 *
218 * @see PhraseQuery#setSlop(int)
219 * @param distance the edit distance of this sloppy phrase match
220 * @return the frequency increment for this match
221 */
222 public abstract float sloppyFreq(int distance);
223
224 /** Computes a score factor based on a term or phrase's frequency in a
225 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
226 * factor for each term in the query and these products are then summed to
227 * form the initial score for a document.
228 *
229 * <p>Terms and phrases repeated in a document indicate the topic of the
230 * document, so implemenations of this method usually return larger values
231 * when <code>freq</code> is large, and smaller values when <code>freq</code>
232 * is small.
233 *
234 * @param freq the frequency of a term within a document
235 * @return a score factor based on a term's within-document frequency
236 */
237 public abstract float tf(float freq);
238
239 /** Computes a score factor for a simple term.
240 *
241 * <p>The default implementation is:<pre>
242 * return idf(searcher.docFreq(term), searcher.maxDoc());
243 * </pre>
244 *
245 * Note that {@link Searcher#maxDoc()} is used instead of
246 * {@link IndexReader#numDocs()} because it is proportional to
247 * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
248 * so is the other, and in the same direction.
249 *
250 * @param term the term in question
251 * @param searcher the document collection being searched
252 * @return a score factor for the term
253 */
254 public float idf(Term term, Searcher searcher) throws IOException {
255 return idf(searcher.docFreq(term), searcher.maxDoc());
256 }
257
258 /** Computes a score factor for a phrase.
259 *
260 * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
261 * for each term in the phrase.
262 *
263 * @param terms the terms in the phrase
264 * @param searcher the document collection being searched
265 * @return a score factor for the phrase
266 */
267 public float idf(Collection terms, Searcher searcher) throws IOException {
268 float idf = 0.0f;
269 Iterator i = terms.iterator();
270 while (i.hasNext()) {
271 idf += idf((Term)i.next(), searcher);
272 }
273 return idf;
274 }
275
276 /** Computes a score factor based on a term's document frequency (the number
277 * of documents which contain the term). This value is multiplied by the
278 * {@link #tf(int)} factor for each term in the query and these products are
279 * then summed to form the initial score for a document.
280 *
281 * <p>Terms that occur in fewer documents are better indicators of topic, so
282 * implemenations of this method usually return larger values for rare terms,
283 * and smaller values for common terms.
284 *
285 * @param docFreq the number of documents which contain the term
286 * @param numDocs the total number of documents in the collection
287 * @return a score factor based on the term's document frequency
288 */
289 public abstract float idf(int docFreq, int numDocs);
290
291 /** Computes a score factor based on the fraction of all query terms that a
292 * document contains. This value is multiplied into scores.
293 *
294 * <p>The presence of a large portion of the query terms indicates a better
295 * match with the query, so implemenations of this method usually return
296 * larger values when the ratio between these parameters is large and smaller
297 * values when the ratio between them is small.
298 *
299 * @param overlap the number of query terms matched in the document
300 * @param maxOverlap the total number of terms in the query
301 * @return a score factor based on term overlap with the query
302 */
303 public abstract float coord(int overlap, int maxOverlap);
304 }
|