001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020 import java.io.File;
021 import java.util.Collection;
022
023 import gate.creole.annic.apache.lucene.store.Directory;
024 import gate.creole.annic.apache.lucene.store.FSDirectory;
025 import gate.creole.annic.apache.lucene.store.Lock;
026 import gate.creole.annic.apache.lucene.document.Document;
027 import gate.creole.annic.apache.lucene.document.Field; // for javadoc
028 import gate.creole.annic.apache.lucene.search.Similarity;
029
030 /** IndexReader is an abstract class, providing an interface for accessing an
031 index. Search of an index is done entirely through this abstract interface,
032 so that any subclass which implements it is searchable.
033
034 <p> Concrete subclasses of IndexReader are usually constructed with a call to
035 the static method {@link #open}.
036
037 <p> For efficiency, in this API documents are often referred to via
038 <i>document numbers</i>, non-negative integers which each name a unique
039 document in the index. These document numbers are ephemeral--they may change
040 as documents are added to and deleted from an index. Clients should thus not
041 rely on a given document having the same number between sessions.
042
043 @author Doug Cutting
044 @version $Id: IndexReader.java 529 2004-10-05 11:55:26Z niraj $
045 */
046 public abstract class IndexReader {
047
048 /**
049 * Constructor used if IndexReader is not owner of its directory.
050 * This is used for IndexReaders that are used within other IndexReaders that take care or locking directories.
051 *
052 * @param directory Directory where IndexReader files reside.
053 */
054 protected IndexReader(Directory directory) {
055 this.directory = directory;
056 segmentInfos = null;
057 directoryOwner = false;
058 closeDirectory = false;
059 stale = false;
060 hasChanges = false;
061 writeLock = null;
062 }
063
064 /**
065 * Constructor used if IndexReader is owner of its directory.
066 * If IndexReader is owner of its directory, it locks its directory in case of write operations.
067 *
068 * @param directory Directory where IndexReader files reside.
069 * @param segmentInfos Used for write-l
070 * @param closeDirectory
071 */
072 IndexReader(Directory directory, SegmentInfos segmentInfos, boolean closeDirectory) {
073 this.directory = directory;
074 this.segmentInfos = segmentInfos;
075 directoryOwner = true;
076 this.closeDirectory = closeDirectory;
077 stale = false;
078 hasChanges = false;
079 writeLock = null;
080 }
081
082 final private Directory directory;
083
084 final private boolean directoryOwner;
085 final private SegmentInfos segmentInfos;
086 private Lock writeLock;
087 private boolean stale;
088 private boolean hasChanges;
089
090 final private boolean closeDirectory;
091
092 /** Returns an IndexReader reading the index in an FSDirectory in the named
093 path. */
094 public static IndexReader open(String path) throws IOException {
095 return open(FSDirectory.getDirectory(path, false), true);
096 }
097
098 /** Returns an IndexReader reading the index in an FSDirectory in the named
099 path. */
100 public static IndexReader open(File path) throws IOException {
101 return open(FSDirectory.getDirectory(path, false), true);
102 }
103
104 /** Returns an IndexReader reading the index in the given Directory. */
105 public static IndexReader open(final Directory directory) throws IOException {
106 return open(directory, false);
107 }
108
109 private static IndexReader open(final Directory directory, final boolean closeDirectory) throws IOException {
110 synchronized (directory) { // in- & inter-process sync
111 return (IndexReader)new Lock.With(
112 directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
113 IndexWriter.COMMIT_LOCK_TIMEOUT) {
114 public Object doBody() throws IOException {
115 SegmentInfos infos = new SegmentInfos();
116 infos.read(directory);
117 if (infos.size() == 1) { // index is optimized
118 return new SegmentReader(infos, infos.info(0), closeDirectory);
119 } else {
120 IndexReader[] readers = new IndexReader[infos.size()];
121 for (int i = 0; i < infos.size(); i++)
122 readers[i] = new SegmentReader(infos.info(i));
123 return new MultiReader(directory, infos, closeDirectory, readers);
124 }
125 }
126 }.run();
127 }
128 }
129
130 /** Returns the directory this index resides in. */
131 public Directory directory() { return directory; }
132
133 /**
134 * Returns the time the index in the named directory was last modified.
135 *
136 * <p>Synchronization of IndexReader and IndexWriter instances is
137 * no longer done via time stamps of the segments file since the time resolution
138 * depends on the hardware platform. Instead, a version number is maintained
139 * within the segments file, which is incremented everytime when the index is
140 * changed.</p>
141 *
142 * @deprecated Replaced by {@link #getCurrentVersion(String)}
143 * */
144 public static long lastModified(String directory) throws IOException {
145 return lastModified(new File(directory));
146 }
147
148 /**
149 * Returns the time the index in the named directory was last modified.
150 *
151 * <p>Synchronization of IndexReader and IndexWriter instances is
152 * no longer done via time stamps of the segments file since the time resolution
153 * depends on the hardware platform. Instead, a version number is maintained
154 * within the segments file, which is incremented everytime when the index is
155 * changed.</p>
156 *
157 * @deprecated Replaced by {@link #getCurrentVersion(File)}
158 * */
159 public static long lastModified(File directory) throws IOException {
160 return FSDirectory.fileModified(directory, "segments");
161 }
162
163 /**
164 * Returns the time the index in the named directory was last modified.
165 *
166 * <p>Synchronization of IndexReader and IndexWriter instances is
167 * no longer done via time stamps of the segments file since the time resolution
168 * depends on the hardware platform. Instead, a version number is maintained
169 * within the segments file, which is incremented everytime when the index is
170 * changed.</p>
171 *
172 * @deprecated Replaced by {@link #getCurrentVersion(Directory)}
173 * */
174 public static long lastModified(Directory directory) throws IOException {
175 return directory.fileModified("segments");
176 }
177
178 /**
179 * Reads version number from segments files. The version number counts the
180 * number of changes of the index.
181 *
182 * @param directory where the index resides.
183 * @return version number.
184 * @throws IOException if segments file cannot be read
185 */
186 public static long getCurrentVersion(String directory) throws IOException {
187 return getCurrentVersion(new File(directory));
188 }
189
190 /**
191 * Reads version number from segments files. The version number counts the
192 * number of changes of the index.
193 *
194 * @param directory where the index resides.
195 * @return version number.
196 * @throws IOException if segments file cannot be read
197 */
198 public static long getCurrentVersion(File directory) throws IOException {
199 Directory dir = FSDirectory.getDirectory(directory, false);
200 long version = getCurrentVersion(dir);
201 dir.close();
202 return version;
203 }
204
205 /**
206 * Reads version number from segments files. The version number counts the
207 * number of changes of the index.
208 *
209 * @param directory where the index resides.
210 * @return version number.
211 * @throws IOException if segments file cannot be read.
212 */
213 public static long getCurrentVersion(Directory directory) throws IOException {
214 return SegmentInfos.readCurrentVersion(directory);
215 }
216
217 /** Return an array of term frequency vectors for the specified document.
218 * The array contains a vector for each vectorized field in the document.
219 * Each vector contains terms and frequencies for all terms
220 * in a given vectorized field.
221 * If no such fields existed, the method returns null.
222 *
223 * @see Field#isTermVectorStored()
224 */
225 abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
226 throws IOException;
227
228 /** Return a term frequency vector for the specified document and field. The
229 * vector returned contains terms and frequencies for those terms in
230 * the specified field of this document, if the field had storeTermVector
231 * flag set. If the flag was not set, the method returns null.
232 *
233 * @see Field#isTermVectorStored()
234 */
235 abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
236 throws IOException;
237
238 /**
239 * Returns <code>true</code> if an index exists at the specified directory.
240 * If the directory does not exist or if there is no index in it.
241 * <code>false</code> is returned.
242 * @param directory the directory to check for an index
243 * @return <code>true</code> if an index exists; <code>false</code> otherwise
244 */
245 public static boolean indexExists(String directory) {
246 return (new File(directory, "segments")).exists();
247 }
248
249 /**
250 * Returns <code>true</code> if an index exists at the specified directory.
251 * If the directory does not exist or if there is no index in it.
252 * @param directory the directory to check for an index
253 * @return <code>true</code> if an index exists; <code>false</code> otherwise
254 */
255 public static boolean indexExists(File directory) {
256 return (new File(directory, "segments")).exists();
257 }
258
259 /**
260 * Returns <code>true</code> if an index exists at the specified directory.
261 * If the directory does not exist or if there is no index in it.
262 * @param directory the directory to check for an index
263 * @return <code>true</code> if an index exists; <code>false</code> otherwise
264 * @throws IOException if there is a problem with accessing the index
265 */
266 public static boolean indexExists(Directory directory) throws IOException {
267 return directory.fileExists("segments");
268 }
269
270 /** Returns the number of documents in this index. */
271 public abstract int numDocs();
272
273 /** Returns one greater than the largest possible document number.
274 This may be used to, e.g., determine how big to allocate an array which
275 will have an element for every document number in an index.
276 */
277 public abstract int maxDoc();
278
279 /** Returns the stored fields of the <code>n</code><sup>th</sup>
280 <code>Document</code> in this index. */
281 public abstract Document document(int n) throws IOException;
282
283 /** Returns true if document <i>n</i> has been deleted */
284 public abstract boolean isDeleted(int n);
285
286 /** Returns true if any documents have been deleted */
287 public abstract boolean hasDeletions();
288
289 /** Returns the byte-encoded normalization factor for the named field of
290 * every document. This is used by the search code to score documents.
291 *
292 * @see Field#setBoost(float)
293 */
294 public abstract byte[] norms(String field) throws IOException;
295
296 /** Reads the byte-encoded normalization factor for the named field of every
297 * document. This is used by the search code to score documents.
298 *
299 * @see Field#setBoost(float)
300 */
301 public abstract void norms(String field, byte[] bytes, int offset)
302 throws IOException;
303
304 /** Expert: Resets the normalization factor for the named field of the named
305 * document. The norm represents the product of the field's {@link
306 * Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
307 * int) length normalization}. Thus, to preserve the length normalization
308 * values when resetting this, one should base the new value upon the old.
309 *
310 * @see #norms(String)
311 * @see Similarity#decodeNorm(byte)
312 */
313 public final synchronized void setNorm(int doc, String field, byte value)
314 throws IOException{
315 if(directoryOwner)
316 aquireWriteLock();
317 doSetNorm(doc, field, value);
318 hasChanges = true;
319 }
320
321 /** Implements setNorm in subclass.*/
322 protected abstract void doSetNorm(int doc, String field, byte value)
323 throws IOException;
324
325 /** Expert: Resets the normalization factor for the named field of the named
326 * document.
327 *
328 * @see #norms(String)
329 * @see Similarity#decodeNorm(byte)
330 */
331 public void setNorm(int doc, String field, float value)
332 throws IOException {
333 setNorm(doc, field, Similarity.encodeNorm(value));
334 }
335
336
337 /** Returns an enumeration of all the terms in the index.
338 The enumeration is ordered by Term.compareTo(). Each term
339 is greater than all that precede it in the enumeration.
340 */
341 public abstract TermEnum terms() throws IOException;
342
343 /** Returns an enumeration of all terms after a given term.
344 The enumeration is ordered by Term.compareTo(). Each term
345 is greater than all that precede it in the enumeration.
346 */
347 public abstract TermEnum terms(Term t) throws IOException;
348
349 /** Returns the number of documents containing the term <code>t</code>. */
350 public abstract int docFreq(Term t) throws IOException;
351
352 /** Returns an enumeration of all the documents which contain
353 <code>term</code>. For each document, the document number, the frequency of
354 the term in that document is also provided, for use in search scoring.
355 Thus, this method implements the mapping:
356 <p><ul>
357 Term => <docNum, freq><sup>*</sup>
358 </ul>
359 <p>The enumeration is ordered by document number. Each document number
360 is greater than all that precede it in the enumeration.
361 */
362 public TermDocs termDocs(Term term) throws IOException {
363 TermDocs termDocs = termDocs();
364 termDocs.seek(term);
365 return termDocs;
366 }
367
368 /** Returns an unpositioned {@link TermDocs} enumerator. */
369 public abstract TermDocs termDocs() throws IOException;
370
371 /** Returns an enumeration of all the documents which contain
372 <code>term</code>. For each document, in addition to the document number
373 and frequency of the term in that document, a list of all of the ordinal
374 positions of the term in the document is available. Thus, this method
375 implements the mapping:
376
377 <p><ul>
378 Term => <docNum, freq,
379 <pos<sub>1</sub>, pos<sub>2</sub>, ...
380 pos<sub>freq-1</sub>>
381 ><sup>*</sup>
382 </ul>
383 <p> This positional information faciliates phrase and proximity searching.
384 <p>The enumeration is ordered by document number. Each document number is
385 greater than all that precede it in the enumeration.
386 */
387 public TermPositions termPositions(Term term) throws IOException {
388 TermPositions termPositions = termPositions();
389 termPositions.seek(term);
390 return termPositions;
391 }
392
393 /** Returns an unpositioned {@link TermPositions} enumerator. */
394 public abstract TermPositions termPositions() throws IOException;
395
396 /**
397 * Trys to acquire the WriteLock on this directory.
398 * this method is only valid if this IndexReader is directory owner.
399 *
400 * @throws IOException If WriteLock cannot be acquired.
401 */
402 private void aquireWriteLock() throws IOException {
403 if (stale)
404 throw new IOException("IndexReader out of date and no longer valid for delete, undelete, or setNorm operations");
405
406 if (writeLock == null) {
407 Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
408 if (!writeLock.obtain(IndexWriter.WRITE_LOCK_TIMEOUT)) // obtain write lock
409 throw new IOException("Index locked for write: " + writeLock);
410 this.writeLock = writeLock;
411
412 // we have to check whether index has changed since this reader was opened.
413 // if so, this reader is no longer valid for deletion
414 if (SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) {
415 stale = true;
416 this.writeLock.release();
417 this.writeLock = null;
418 throw new IOException("IndexReader out of date and no longer valid for delete, undelete, or setNorm operations");
419 }
420 }
421 }
422
423 /** Deletes the document numbered <code>docNum</code>. Once a document is
424 deleted it will not appear in TermDocs or TermPostitions enumerations.
425 Attempts to read its field with the {@link #document}
426 method will result in an error. The presence of this document may still be
427 reflected in the {@link #docFreq} statistic, though
428 this will be corrected eventually as the index is further modified.
429 */
430 public final synchronized void delete(int docNum) throws IOException {
431 if(directoryOwner)
432 aquireWriteLock();
433 doDelete(docNum);
434 hasChanges = true;
435 }
436
437 /** Implements deletion of the document numbered <code>docNum</code>.
438 * Applications should call {@link #delete(int)} or {@link #delete(Term)}.
439 */
440 protected abstract void doDelete(int docNum) throws IOException;
441
442 /** Deletes all documents containing <code>term</code>.
443 This is useful if one uses a document field to hold a unique ID string for
444 the document. Then to delete such a document, one merely constructs a
445 term with the appropriate field and the unique ID string as its text and
446 passes it to this method. Returns the number of documents deleted.
447 */
448 public final int delete(Term term) throws IOException {
449 TermDocs docs = termDocs(term);
450 if (docs == null) return 0;
451 int n = 0;
452 try {
453 while (docs.next()) {
454 delete(docs.doc());
455 n++;
456 }
457 } finally {
458 docs.close();
459 }
460 return n;
461 }
462
463 /** Undeletes all documents currently marked as deleted in this index.*/
464 public final synchronized void undeleteAll() throws IOException{
465 if(directoryOwner)
466 aquireWriteLock();
467 doUndeleteAll();
468 hasChanges = true;
469 }
470
471 /** Implements actual undeleteAll() in subclass. */
472 protected abstract void doUndeleteAll() throws IOException;
473
474 /**
475 * Commit changes resulting from delete, undeleteAll, or setNorm operations
476 *
477 * @throws IOException
478 */
479 protected final synchronized void commit() throws IOException{
480 if(hasChanges){
481 if(directoryOwner){
482 synchronized (directory) { // in- & inter-process sync
483 new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
484 IndexWriter.COMMIT_LOCK_TIMEOUT) {
485 public Object doBody() throws IOException {
486 doCommit();
487 segmentInfos.write(directory);
488 return null;
489 }
490 }.run();
491 }
492 if (writeLock != null) {
493 writeLock.release(); // release write lock
494 writeLock = null;
495 }
496 }
497 else
498 doCommit();
499 }
500 hasChanges = false;
501 }
502
503 /** Implements commit. */
504 protected abstract void doCommit() throws IOException;
505
506 /**
507 * Closes files associated with this index.
508 * Also saves any new deletions to disk.
509 * No other methods should be called after this has been called.
510 */
511 public final synchronized void close() throws IOException {
512 commit();
513 doClose();
514 if(closeDirectory)
515 directory.close();
516 }
517
518 /** Implements close. */
519 protected abstract void doClose() throws IOException;
520
521 /** Release the write lock, if needed. */
522 protected final void finalize() throws IOException {
523 if (writeLock != null) {
524 writeLock.release(); // release write lock
525 writeLock = null;
526 }
527 }
528
529 /**
530 * Returns a list of all unique field names that exist in the index pointed
531 * to by this IndexReader.
532 * @return Collection of Strings indicating the names of the fields
533 * @throws IOException if there is a problem with accessing the index
534 */
535 public abstract Collection getFieldNames() throws IOException;
536
537 /**
538 * Returns a list of all unique field names that exist in the index pointed
539 * to by this IndexReader. The boolean argument specifies whether the fields
540 * returned are indexed or not.
541 * @param indexed <code>true</code> if only indexed fields should be returned;
542 * <code>false</code> if only unindexed fields should be returned.
543 * @return Collection of Strings indicating the names of the fields
544 * @throws IOException if there is a problem with accessing the index
545 */
546 public abstract Collection getFieldNames(boolean indexed) throws IOException;
547
548 /**
549 *
550 * @param storedTermVector if true, returns only Indexed fields that have term vector info,
551 * else only indexed fields without term vector info
552 * @return Collection of Strings indicating the names of the fields
553 */
554 public abstract Collection getIndexedFieldNames(boolean storedTermVector);
555
556 /**
557 * Returns <code>true</code> iff the index in the named directory is
558 * currently locked.
559 * @param directory the directory to check for a lock
560 * @throws IOException if there is a problem with accessing the index
561 */
562 public static boolean isLocked(Directory directory) throws IOException {
563 return
564 directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
565 directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
566
567 }
568
569 /**
570 * Returns <code>true</code> iff the index in the named directory is
571 * currently locked.
572 * @param directory the directory to check for a lock
573 * @throws IOException if there is a problem with accessing the index
574 */
575 public static boolean isLocked(String directory) throws IOException {
576 Directory dir = FSDirectory.getDirectory(directory, false);
577 boolean result = isLocked(dir);
578 dir.close();
579 return result;
580 }
581
582 /**
583 * Forcibly unlocks the index in the named directory.
584 * <P>
585 * Caution: this should only be used by failure recovery code,
586 * when it is known that no other process nor thread is in fact
587 * currently accessing this index.
588 */
589 public static void unlock(Directory directory) throws IOException {
590 directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
591 directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).release();
592 }
593 }
|