001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020 import java.util.Collection;
021 import java.util.HashSet;
022 import java.util.Hashtable;
023 import java.util.Iterator;
024 import java.util.Set;
025
026 import gate.creole.annic.apache.lucene.document.Document;
027 import gate.creole.annic.apache.lucene.store.Directory;
028
029 /** An IndexReader which reads multiple indexes, appending their content.
030 *
031 * @version $Id: MultiReader.java 529 2004-10-05 11:55:26Z niraj $
032 */
033 public class MultiReader extends IndexReader {
034 private IndexReader[] subReaders;
035 private int[] starts; // 1st docno for each segment
036 private Hashtable normsCache = new Hashtable();
037 private int maxDoc = 0;
038 private int numDocs = -1;
039 private boolean hasDeletions = false;
040
041 /**
042 * <p>Construct a MultiReader aggregating the named set of (sub)readers.
043 * Directory locking for delete, undeleteAll, and setNorm operations is
044 * left to the subreaders. </p>
045 * <p>Note that all subreaders are closed if this Multireader is closed.</p>
046 * @param subReaders set of (sub)readers
047 * @throws IOException
048 */
049 public MultiReader(IndexReader[] subReaders) throws IOException {
050 super(subReaders.length == 0 ? null : subReaders[0].directory());
051 initialize(subReaders);
052 }
053
054 /** Construct reading the named set of readers. */
055 MultiReader(Directory directory, SegmentInfos sis, boolean closeDirectory, IndexReader[] subReaders)
056 throws IOException {
057 super(directory, sis, closeDirectory);
058 initialize(subReaders);
059 }
060
061 private void initialize(IndexReader[] subReaders) throws IOException{
062 this.subReaders = subReaders;
063 starts = new int[subReaders.length + 1]; // build starts array
064 for (int i = 0; i < subReaders.length; i++) {
065 starts[i] = maxDoc;
066 maxDoc += subReaders[i].maxDoc(); // compute maxDocs
067
068 if (subReaders[i].hasDeletions())
069 hasDeletions = true;
070 }
071 starts[subReaders.length] = maxDoc;
072 }
073
074
075 /** Return an array of term frequency vectors for the specified document.
076 * The array contains a vector for each vectorized field in the document.
077 * Each vector vector contains term numbers and frequencies for all terms
078 * in a given vectorized field.
079 * If no such fields existed, the method returns null.
080 */
081 public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
082 int i = readerIndex(n); // find segment num
083 return subReaders[i].getTermFreqVectors(n - starts[i]); // dispatch to segment
084 }
085
086 public TermFreqVector getTermFreqVector(int n, String field)
087 throws IOException {
088 int i = readerIndex(n); // find segment num
089 return subReaders[i].getTermFreqVector(n - starts[i], field);
090 }
091
092 public synchronized int numDocs() {
093 if (numDocs == -1) { // check cache
094 int n = 0; // cache miss--recompute
095 for (int i = 0; i < subReaders.length; i++)
096 n += subReaders[i].numDocs(); // sum from readers
097 numDocs = n;
098 }
099 return numDocs;
100 }
101
102 public int maxDoc() {
103 return maxDoc;
104 }
105
106 public Document document(int n) throws IOException {
107 int i = readerIndex(n); // find segment num
108 return subReaders[i].document(n - starts[i]); // dispatch to segment reader
109 }
110
111 public boolean isDeleted(int n) {
112 int i = readerIndex(n); // find segment num
113 return subReaders[i].isDeleted(n - starts[i]); // dispatch to segment reader
114 }
115
116 public boolean hasDeletions() { return hasDeletions; }
117
118 protected void doDelete(int n) throws IOException {
119 numDocs = -1; // invalidate cache
120 int i = readerIndex(n); // find segment num
121 subReaders[i].delete(n - starts[i]); // dispatch to segment reader
122 hasDeletions = true;
123 }
124
125 protected void doUndeleteAll() throws IOException {
126 for (int i = 0; i < subReaders.length; i++)
127 subReaders[i].undeleteAll();
128 hasDeletions = false;
129 }
130
131 private int readerIndex(int n) { // find reader for doc n:
132 int lo = 0; // search starts array
133 int hi = subReaders.length - 1; // for first element less
134
135 while (hi >= lo) {
136 int mid = (lo + hi) >>> 1;
137 int midValue = starts[mid];
138 if (n < midValue)
139 hi = mid - 1;
140 else if (n > midValue)
141 lo = mid + 1;
142 else { // found a match
143 while (mid+1 < subReaders.length && starts[mid+1] == midValue) {
144 mid++; // scan to last match
145 }
146 return mid;
147 }
148 }
149 return hi;
150 }
151
152 public synchronized byte[] norms(String field) throws IOException {
153 byte[] bytes = (byte[])normsCache.get(field);
154 if (bytes != null)
155 return bytes; // cache hit
156
157 bytes = new byte[maxDoc()];
158 for (int i = 0; i < subReaders.length; i++)
159 subReaders[i].norms(field, bytes, starts[i]);
160 normsCache.put(field, bytes); // update cache
161 return bytes;
162 }
163
164 public synchronized void norms(String field, byte[] result, int offset)
165 throws IOException {
166 byte[] bytes = (byte[])normsCache.get(field);
167 if (bytes != null) // cache hit
168 System.arraycopy(bytes, 0, result, offset, maxDoc());
169
170 for (int i = 0; i < subReaders.length; i++) // read from segments
171 subReaders[i].norms(field, result, offset + starts[i]);
172 }
173
174 protected void doSetNorm(int n, String field, byte value)
175 throws IOException {
176 normsCache.remove(field); // clear cache
177 int i = readerIndex(n); // find segment num
178 subReaders[i].setNorm(n-starts[i], field, value); // dispatch
179 }
180
181 public TermEnum terms() throws IOException {
182 return new MultiTermEnum(subReaders, starts, null);
183 }
184
185 public TermEnum terms(Term term) throws IOException {
186 return new MultiTermEnum(subReaders, starts, term);
187 }
188
189 public int docFreq(Term t) throws IOException {
190 int total = 0; // sum freqs in segments
191 for (int i = 0; i < subReaders.length; i++)
192 total += subReaders[i].docFreq(t);
193 return total;
194 }
195
196 public TermDocs termDocs() throws IOException {
197 return new MultiTermDocs(subReaders, starts);
198 }
199
200 public TermPositions termPositions() throws IOException {
201 return new MultiTermPositions(subReaders, starts);
202 }
203
204 protected void doCommit() throws IOException {
205 for (int i = 0; i < subReaders.length; i++)
206 subReaders[i].commit();
207 }
208
209 protected synchronized void doClose() throws IOException {
210 for (int i = 0; i < subReaders.length; i++)
211 subReaders[i].close();
212 }
213
214 /**
215 * @see IndexReader#getFieldNames()
216 */
217 public Collection getFieldNames() throws IOException {
218 // maintain a unique set of field names
219 Set fieldSet = new HashSet();
220 for (int i = 0; i < subReaders.length; i++) {
221 IndexReader reader = subReaders[i];
222 Collection names = reader.getFieldNames();
223 // iterate through the field names and add them to the set
224 for (Iterator iterator = names.iterator(); iterator.hasNext();) {
225 String s = (String) iterator.next();
226 fieldSet.add(s);
227 }
228 }
229 return fieldSet;
230 }
231
232 /**
233 * @see IndexReader#getFieldNames(boolean)
234 */
235 public Collection getFieldNames(boolean indexed) throws IOException {
236 // maintain a unique set of field names
237 Set fieldSet = new HashSet();
238 for (int i = 0; i < subReaders.length; i++) {
239 IndexReader reader = subReaders[i];
240 Collection names = reader.getFieldNames(indexed);
241 fieldSet.addAll(names);
242 }
243 return fieldSet;
244 }
245
246 public Collection getIndexedFieldNames(boolean storedTermVector) {
247 // maintain a unique set of field names
248 Set fieldSet = new HashSet();
249 for (int i = 0; i < subReaders.length; i++) {
250 IndexReader reader = subReaders[i];
251 Collection names = reader.getIndexedFieldNames(storedTermVector);
252 fieldSet.addAll(names);
253 }
254 return fieldSet;
255 }
256
257 }
258
259 class MultiTermEnum extends TermEnum {
260 private SegmentMergeQueue queue;
261
262 private Term term;
263 private int docFreq;
264
265 public MultiTermEnum(IndexReader[] readers, int[] starts, Term t)
266 throws IOException {
267 queue = new SegmentMergeQueue(readers.length);
268 for (int i = 0; i < readers.length; i++) {
269 IndexReader reader = readers[i];
270 TermEnum termEnum;
271
272 if (t != null) {
273 termEnum = reader.terms(t);
274 } else
275 termEnum = reader.terms();
276
277 SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
278 if (t == null ? smi.next() : termEnum.term() != null)
279 queue.put(smi); // initialize queue
280 else
281 smi.close();
282 }
283
284 if (t != null && queue.size() > 0) {
285 next();
286 }
287 }
288
289 public boolean next() throws IOException {
290 SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
291 if (top == null) {
292 term = null;
293 return false;
294 }
295
296 term = top.term;
297 docFreq = 0;
298 while (top != null && term.indexCompareTo(top.term) == 0) {
299 queue.pop();
300 docFreq += top.termEnum.docFreq(); // increment freq
301 if (top.next())
302 queue.put(top); // restore queue
303 else
304 top.close(); // done with a segment
305 top = (SegmentMergeInfo)queue.top();
306 }
307 return true;
308 }
309
310 public Term term() {
311 return term;
312 }
313
314 public int docFreq() {
315 return docFreq;
316 }
317
318 public void close() throws IOException {
319 queue.close();
320 }
321 }
322
323 class MultiTermDocs implements TermDocs {
324 protected IndexReader[] readers;
325 protected int[] starts;
326 protected Term term;
327
328 protected int base = 0;
329 protected int pointer = 0;
330
331 private TermDocs[] readerTermDocs;
332 protected TermDocs current; // == readerTermDocs[pointer]
333
334 public MultiTermDocs(IndexReader[] r, int[] s) {
335 readers = r;
336 starts = s;
337
338 readerTermDocs = new TermDocs[r.length];
339 }
340
341 public int doc() {
342 return base + current.doc();
343 }
344 public int freq() {
345 return current.freq();
346 }
347
348 public void seek(Term term) {
349 this.term = term;
350 this.base = 0;
351 this.pointer = 0;
352 this.current = null;
353 }
354
355 public void seek(TermEnum termEnum) throws IOException {
356 seek(termEnum.term());
357 }
358
359 public boolean next() throws IOException {
360 if (current != null && current.next()) {
361 return true;
362 } else if (pointer < readers.length) {
363 base = starts[pointer];
364 current = termDocs(pointer++);
365 return next();
366 } else
367 return false;
368 }
369
370 /** Optimized implementation. */
371 public int read(final int[] docs, final int[] freqs) throws IOException {
372 while (true) {
373 while (current == null) {
374 if (pointer < readers.length) { // try next segment
375 base = starts[pointer];
376 current = termDocs(pointer++);
377 } else {
378 return 0;
379 }
380 }
381 int end = current.read(docs, freqs);
382 if (end == 0) { // none left in segment
383 current = null;
384 } else { // got some
385 final int b = base; // adjust doc numbers
386 for (int i = 0; i < end; i++)
387 docs[i] += b;
388 return end;
389 }
390 }
391 }
392
393 /** As yet unoptimized implementation. */
394 public boolean skipTo(int target) throws IOException {
395 do {
396 if (!next())
397 return false;
398 } while (target > doc());
399 return true;
400 }
401
402 private TermDocs termDocs(int i) throws IOException {
403 if (term == null)
404 return null;
405 TermDocs result = readerTermDocs[i];
406 if (result == null)
407 result = readerTermDocs[i] = termDocs(readers[i]);
408 result.seek(term);
409 return result;
410 }
411
412 protected TermDocs termDocs(IndexReader reader)
413 throws IOException {
414 return reader.termDocs();
415 }
416
417 public void close() throws IOException {
418 for (int i = 0; i < readerTermDocs.length; i++) {
419 if (readerTermDocs[i] != null)
420 readerTermDocs[i].close();
421 }
422 }
423 }
424
425 class MultiTermPositions extends MultiTermDocs implements TermPositions {
426 public MultiTermPositions(IndexReader[] r, int[] s) {
427 super(r,s);
428 }
429
430 protected TermDocs termDocs(IndexReader reader) throws IOException {
431 return (TermDocs)reader.termPositions();
432 }
433
434 public int nextPosition() throws IOException {
435 return ((TermPositions)current).nextPosition();
436 }
437
438 }
|