001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020 import java.io.File;
021 import java.io.PrintStream;
022 import java.util.Vector;
023
024 import gate.creole.annic.apache.lucene.store.Directory;
025 import gate.creole.annic.apache.lucene.store.RAMDirectory;
026 import gate.creole.annic.apache.lucene.store.FSDirectory;
027 import gate.creole.annic.apache.lucene.store.Lock;
028 import gate.creole.annic.apache.lucene.store.InputStream;
029 import gate.creole.annic.apache.lucene.store.OutputStream;
030 import gate.creole.annic.apache.lucene.search.Similarity;
031 import gate.creole.annic.apache.lucene.document.Document;
032 import gate.creole.annic.apache.lucene.analysis.Analyzer;
033
034
035 /**
036 An IndexWriter creates and maintains an index.
037
038 The third argument to the <a href="#IndexWriter"><b>constructor</b></a>
039 determines whether a new index is created, or whether an existing index is
040 opened for the addition of new documents.
041
042 In either case, documents are added with the <a
043 href="#addDocument"><b>addDocument</b></a> method. When finished adding
044 documents, <a href="#close"><b>close</b></a> should be called.
045
046 If an index will not have more documents added for a while and optimal search
047 performance is desired, then the <a href="#optimize"><b>optimize</b></a>
048 method should be called before the index is closed.
049 */
050
051 public class IndexWriter {
052
053 /**
054 * Default value is 1000. Use <code>gate.creole.annic.apache.lucene.writeLockTimeout</code>
055 * system property to override.
056 */
057 public static long WRITE_LOCK_TIMEOUT =
058 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.writeLockTimeout",
059 "1000"));
060
061 /**
062 * Default value is 10000. Use <code>gate.creole.annic.apache.lucene.commitLockTimeout</code>
063 * system property to override.
064 */
065 public static long COMMIT_LOCK_TIMEOUT =
066 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.commitLockTimeout",
067 "10000"));
068
069 public static final String WRITE_LOCK_NAME = "write.lock";
070 public static final String COMMIT_LOCK_NAME = "commit.lock";
071
072 /**
073 * Default value is 10. Use <code>gate.creole.annic.apache.lucene.mergeFactor</code>
074 * system property to override.
075 */
076 public static final int DEFAULT_MERGE_FACTOR =
077 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.mergeFactor",
078 "10"));
079
080 /**
081 * Default value is 10. Use <code>gate.creole.annic.apache.lucene.minMergeDocs</code>
082 * system property to override.
083 */
084 public static final int DEFAULT_MIN_MERGE_DOCS =
085 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.minMergeDocs",
086 "10"));
087
088 /**
089 * Default value is {@link Integer#MAX_VALUE}.
090 * Use <code>gate.creole.annic.apache.lucene.maxMergeDocs</code> system property to override.
091 */
092 public static final int DEFAULT_MAX_MERGE_DOCS =
093 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.maxMergeDocs",
094 String.valueOf(Integer.MAX_VALUE)));
095
096 /**
097 * Default value is 10000. Use <code>gate.creole.annic.apache.lucene.maxFieldLength</code>
098 * system property to override.
099 */
100 public static final int DEFAULT_MAX_FIELD_LENGTH =
101 Integer.parseInt(System.getProperty("gate.creole.annic.apache.lucene.maxFieldLength",
102 "300000"));
103
104
105 private Directory directory; // where this index resides
106 private Analyzer analyzer; // how to analyze text
107
108 private Similarity similarity = Similarity.getDefault(); // how to normalize
109
110 private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
111 private final Directory ramDirectory = new RAMDirectory(); // for temp segs
112
113 private Lock writeLock;
114
115 /** Use compound file setting. Defaults to true, minimizing the number of
116 * files used. Setting this to false may improve indexing performance, but
117 * may also cause file handle problems.
118 */
119 private boolean useCompoundFile = true;
120
121 private boolean closeDir;
122
123 /** Setting to turn on usage of a compound file. When on, multiple files
124 * for each segment are merged into a single file once the segment creation
125 * is finished. This is done regardless of what directory is in use.
126 */
127 public boolean getUseCompoundFile() {
128 return useCompoundFile;
129 }
130
131 /** Setting to turn on usage of a compound file. When on, multiple files
132 * for each segment are merged into a single file once the segment creation
133 * is finished. This is done regardless of what directory is in use.
134 */
135 public void setUseCompoundFile(boolean value) {
136 useCompoundFile = value;
137 }
138
139
140 /** Expert: Set the Similarity implementation used by this IndexWriter.
141 *
142 * @see Similarity#setDefault(Similarity)
143 */
144 public void setSimilarity(Similarity similarity) {
145 this.similarity = similarity;
146 }
147
148 /** Expert: Return the Similarity implementation used by this IndexWriter.
149 *
150 * <p>This defaults to the current value of {@link Similarity#getDefault()}.
151 */
152 public Similarity getSimilarity() {
153 return this.similarity;
154 }
155
156 /**
157 * Constructs an IndexWriter for the index in <code>path</code>.
158 * Text will be analyzed with <code>a</code>. If <code>create</code>
159 * is true, then a new, empty index will be created in
160 * <code>path</code>, replacing the index already there, if any.
161 *
162 * @param path the path to the index directory
163 * @param a the analyzer to use
164 * @param create <code>true</code> to create the index or overwrite
165 * the existing one; <code>false</code> to append to the existing
166 * index
167 * @throws IOException if the directory cannot be read/written to, or
168 * if it does not exist, and <code>create</code> is
169 * <code>false</code>
170 */
171 public IndexWriter(String path, Analyzer a, boolean create)
172 throws IOException {
173 this(FSDirectory.getDirectory(path, create), a, create, true);
174 }
175
176 /**
177 * Constructs an IndexWriter for the index in <code>path</code>.
178 * Text will be analyzed with <code>a</code>. If <code>create</code>
179 * is true, then a new, empty index will be created in
180 * <code>path</code>, replacing the index already there, if any.
181 *
182 * @param path the path to the index directory
183 * @param a the analyzer to use
184 * @param create <code>true</code> to create the index or overwrite
185 * the existing one; <code>false</code> to append to the existing
186 * index
187 * @throws IOException if the directory cannot be read/written to, or
188 * if it does not exist, and <code>create</code> is
189 * <code>false</code>
190 */
191 public IndexWriter(File path, Analyzer a, boolean create)
192 throws IOException {
193 this(FSDirectory.getDirectory(path, create), a, create, true);
194 }
195
196 /**
197 * Constructs an IndexWriter for the index in <code>d</code>.
198 * Text will be analyzed with <code>a</code>. If <code>create</code>
199 * is true, then a new, empty index will be created in
200 * <code>d</code>, replacing the index already there, if any.
201 *
202 * @param d the index directory
203 * @param a the analyzer to use
204 * @param create <code>true</code> to create the index or overwrite
205 * the existing one; <code>false</code> to append to the existing
206 * index
207 * @throws IOException if the directory cannot be read/written to, or
208 * if it does not exist, and <code>create</code> is
209 * <code>false</code>
210 */
211 public IndexWriter(Directory d, Analyzer a, boolean create)
212 throws IOException {
213 this(d, a, create, false);
214 }
215
216 private IndexWriter(Directory d, Analyzer a, final boolean create, boolean closeDir)
217 throws IOException {
218 this.closeDir = closeDir;
219 directory = d;
220 analyzer = a;
221
222 Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
223 if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
224 throw new IOException("Index locked for write: " + writeLock);
225 this.writeLock = writeLock; // save it
226
227 synchronized (directory) { // in- & inter-process sync
228 new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
229 public Object doBody() throws IOException {
230 if (create)
231 segmentInfos.write(directory);
232 else
233 segmentInfos.read(directory);
234 return null;
235 }
236 }.run();
237 }
238 }
239
240 /** Flushes all changes to an index and closes all associated files. */
241 public synchronized void close() throws IOException {
242 flushRamSegments();
243 ramDirectory.close();
244 writeLock.release(); // release write lock
245 writeLock = null;
246 if(closeDir)
247 directory.close();
248 }
249
250 /** Release the write lock, if needed. */
251 protected void finalize() throws IOException {
252 if (writeLock != null) {
253 writeLock.release(); // release write lock
254 writeLock = null;
255 }
256 }
257
258 /** Returns the analyzer used by this index. */
259 public Analyzer getAnalyzer() {
260 return analyzer;
261 }
262
263
264 /** Returns the number of documents currently in this index. */
265 public synchronized int docCount() {
266 int count = 0;
267 for (int i = 0; i < segmentInfos.size(); i++) {
268 SegmentInfo si = segmentInfos.info(i);
269 count += si.docCount;
270 }
271 return count;
272 }
273
274 /**
275 * The maximum number of terms that will be indexed for a single field in a
276 * document. This limits the amount of memory required for indexing, so that
277 * collections with very large files will not crash the indexing process by
278 * running out of memory.<p/>
279 * Note that this effectively truncates large documents, excluding from the
280 * index terms that occur further in the document. If you know your source
281 * documents are large, be sure to set this value high enough to accomodate
282 * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
283 * is your memory, but you should anticipate an OutOfMemoryError.<p/>
284 * By default, no more than 10,000 terms will be indexed for a field.
285 */
286 public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
287
288 /**
289 * Adds a document to this index. If the document contains more than
290 * {@link #maxFieldLength} terms for a given field, the remainder are
291 * discarded.
292 */
293 public void addDocument(Document doc) throws IOException {
294 addDocument(doc, analyzer);
295 }
296
297 /**
298 * Adds a document to this index, using the provided analyzer instead of the
299 * value of {@link #getAnalyzer()}. If the document contains more than
300 * {@link #maxFieldLength} terms for a given field, the remainder are
301 * discarded.
302 */
303 public void addDocument(Document doc, Analyzer analyzer) throws IOException {
304 DocumentWriter dw =
305 new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
306 String segmentName = newSegmentName();
307 dw.addDocument(segmentName, doc);
308 synchronized (this) {
309 segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
310 maybeMergeSegments();
311 }
312 }
313
314 final int getSegmentsCounter(){
315 return segmentInfos.counter;
316 }
317
318 private final synchronized String newSegmentName() {
319 return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
320 }
321
322 /** Determines how often segment indices are merged by addDocument(). With
323 * smaller values, less RAM is used while indexing, and searches on
324 * unoptimized indices are faster, but indexing speed is slower. With larger
325 * values, more RAM is used during indexing, and while searches on unoptimized
326 * indices are slower, indexing is faster. Thus larger values (> 10) are best
327 * for batch index creation, and smaller values (< 10) for indices that are
328 * interactively maintained.
329 *
330 * <p>This must never be less than 2. The default value is 10.*/
331 public int mergeFactor = DEFAULT_MERGE_FACTOR;
332
333 /** Determines the minimal number of documents required before the buffered
334 * in-memory documents are merging and a new Segment is created.
335 * Since Documents are merged in a {@link gate.creole.annic.apache.lucene.store.RAMDirectory},
336 * large value gives faster indexing. At the same time, mergeFactor limits
337 * the number of files open in a FSDirectory.
338 *
339 * <p> The default value is 10.*/
340 public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;
341
342
343 /** Determines the largest number of documents ever merged by addDocument().
344 * Small values (e.g., less than 10,000) are best for interactive indexing,
345 * as this limits the length of pauses while indexing to a few seconds.
346 * Larger values are best for batched indexing and speedier searches.
347 *
348 * <p>The default value is {@link Integer#MAX_VALUE}. */
349 public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
350
351 /** If non-null, information about merges will be printed to this. */
352 public PrintStream infoStream = null;
353
354 /** Merges all segments together into a single segment, optimizing an index
355 for search. */
356 public synchronized void optimize() throws IOException {
357 flushRamSegments();
358 while (segmentInfos.size() > 1 ||
359 (segmentInfos.size() == 1 &&
360 (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
361 segmentInfos.info(0).dir != directory ||
362 (useCompoundFile &&
363 (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
364 SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
365 int minSegment = segmentInfos.size() - mergeFactor;
366 mergeSegments(minSegment < 0 ? 0 : minSegment);
367 }
368 }
369
370 /** Merges all segments from an array of indexes into this index.
371 *
372 * <p>This may be used to parallelize batch indexing. A large document
373 * collection can be broken into sub-collections. Each sub-collection can be
374 * indexed in parallel, on a different thread, process or machine. The
375 * complete index can then be created by merging sub-collection indexes
376 * with this method.
377 *
378 * <p>After this completes, the index is optimized. */
379 public synchronized void addIndexes(Directory[] dirs)
380 throws IOException {
381 optimize(); // start with zero or 1 seg
382 for (int i = 0; i < dirs.length; i++) {
383 SegmentInfos sis = new SegmentInfos(); // read infos from dir
384 sis.read(dirs[i]);
385 for (int j = 0; j < sis.size(); j++) {
386 segmentInfos.addElement(sis.info(j)); // add each info
387 }
388 }
389 optimize(); // final cleanup
390 }
391
392 /** Merges the provided indexes into this index.
393 * <p>After this completes, the index is optimized. </p>
394 * <p>The provided IndexReaders are not closed.</p>
395 */
396 public synchronized void addIndexes(IndexReader[] readers)
397 throws IOException {
398
399 optimize(); // start with zero or 1 seg
400
401 String mergedName = newSegmentName();
402 SegmentMerger merger = new SegmentMerger(directory, mergedName, false);
403
404 if (segmentInfos.size() == 1) // add existing index, if any
405 merger.add(new SegmentReader(segmentInfos.info(0)));
406
407 for (int i = 0; i < readers.length; i++) // add new indexes
408 merger.add(readers[i]);
409
410 int docCount = merger.merge(); // merge 'em
411
412 segmentInfos.setSize(0); // pop old infos & add new
413 segmentInfos.addElement(new SegmentInfo(mergedName, docCount, directory));
414
415 synchronized (directory) { // in- & inter-process sync
416 new Lock.With(directory.makeLock("commit.lock"), COMMIT_LOCK_TIMEOUT) {
417 public Object doBody() throws IOException {
418 segmentInfos.write(directory); // commit changes
419 return null;
420 }
421 }.run();
422 }
423 }
424
425 /** Merges all RAM-resident segments. */
426 private final void flushRamSegments() throws IOException {
427 int minSegment = segmentInfos.size()-1;
428 int docCount = 0;
429 while (minSegment >= 0 &&
430 (segmentInfos.info(minSegment)).dir == ramDirectory) {
431 docCount += segmentInfos.info(minSegment).docCount;
432 minSegment--;
433 }
434 if (minSegment < 0 || // add one FS segment?
435 (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
436 !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
437 minSegment++;
438 if (minSegment >= segmentInfos.size())
439 return; // none to merge
440 mergeSegments(minSegment);
441 }
442
443 /** Incremental segment merger. */
444 private final void maybeMergeSegments() throws IOException {
445 long targetMergeDocs = minMergeDocs;
446 while (targetMergeDocs <= maxMergeDocs) {
447 // find segments smaller than current target size
448 int minSegment = segmentInfos.size();
449 int mergeDocs = 0;
450 while (--minSegment >= 0) {
451 SegmentInfo si = segmentInfos.info(minSegment);
452 if (si.docCount >= targetMergeDocs)
453 break;
454 mergeDocs += si.docCount;
455 }
456
457 if (mergeDocs >= targetMergeDocs) // found a merge to do
458 mergeSegments(minSegment+1);
459 else
460 break;
461
462 targetMergeDocs *= mergeFactor; // increase target size
463 }
464 }
465
466 /** Pops segments off of segmentInfos stack down to minSegment, merges them,
467 and pushes the merged index onto the top of the segmentInfos stack. */
468 private final void mergeSegments(int minSegment)
469 throws IOException {
470 String mergedName = newSegmentName();
471 if (infoStream != null) infoStream.print("merging segments");
472 SegmentMerger merger =
473 new SegmentMerger(directory, mergedName, useCompoundFile);
474
475 final Vector segmentsToDelete = new Vector();
476 for (int i = minSegment; i < segmentInfos.size(); i++) {
477 SegmentInfo si = segmentInfos.info(i);
478 if (infoStream != null)
479 infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
480 IndexReader reader = new SegmentReader(si);
481 merger.add(reader);
482 if ((reader.directory() == this.directory) || // if we own the directory
483 (reader.directory() == this.ramDirectory))
484 segmentsToDelete.addElement(reader); // queue segment for deletion
485 }
486
487 int mergedDocCount = merger.merge();
488
489 if (infoStream != null) {
490 infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
491 }
492
493 segmentInfos.setSize(minSegment); // pop old infos & add new
494 segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
495 directory));
496
497 // close readers before we attempt to delete now-obsolete segments
498 merger.closeReaders();
499
500 synchronized (directory) { // in- & inter-process sync
501 new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
502 public Object doBody() throws IOException {
503 segmentInfos.write(directory); // commit before deleting
504 deleteSegments(segmentsToDelete); // delete now-unused segments
505 return null;
506 }
507 }.run();
508 }
509
510 }
511
512 /* Some operating systems (e.g. Windows) don't permit a file to be deleted
513 while it is opened for read (e.g. by another process or thread). So we
514 assume that when a delete fails it is because the file is open in another
515 process, and queue the file for subsequent deletion. */
516
517 private final void deleteSegments(Vector segments) throws IOException {
518 Vector deletable = new Vector();
519
520 deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
521
522 for (int i = 0; i < segments.size(); i++) {
523 SegmentReader reader = (SegmentReader)segments.elementAt(i);
524 if (reader.directory() == this.directory)
525 deleteFiles(reader.files(), deletable); // try to delete our files
526 else
527 deleteFiles(reader.files(), reader.directory()); // delete other files
528 }
529
530 writeDeleteableFiles(deletable); // note files we can't delete
531 }
532
533 private final void deleteFiles(Vector files, Directory directory)
534 throws IOException {
535 for (int i = 0; i < files.size(); i++) {
536 directory.deleteFile((String)files.elementAt(i));
537 }
538 }
539
540 private final void deleteFiles(Vector files, Vector deletable)
541 throws IOException {
542 for (int i = 0; i < files.size(); i++) {
543 String file = (String)files.elementAt(i);
544 try {
545 directory.deleteFile(file); // try to delete each file
546 } catch (IOException e) { // if delete fails
547 if (directory.fileExists(file)) {
548 if (infoStream != null)
549 infoStream.println(e.getMessage() + "; Will re-try later.");
550 deletable.addElement(file); // add to deletable
551 }
552 }
553 }
554 }
555
556 private final Vector readDeleteableFiles() throws IOException {
557 Vector result = new Vector();
558 if (!directory.fileExists("deletable"))
559 return result;
560
561 InputStream input = directory.openFile("deletable");
562 try {
563 for (int i = input.readInt(); i > 0; i--) // read file names
564 result.addElement(input.readString());
565 } finally {
566 input.close();
567 }
568 return result;
569 }
570
571 private final void writeDeleteableFiles(Vector files) throws IOException {
572 OutputStream output = directory.createFile("deleteable.new");
573 try {
574 output.writeInt(files.size());
575 for (int i = 0; i < files.size(); i++)
576 output.writeString((String)files.elementAt(i));
577 } finally {
578 output.close();
579 }
580 directory.renameFile("deleteable.new", "deletable");
581 }
582 }
|