001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.util.Vector;
020 import java.util.ArrayList;
021 import java.util.Iterator;
022 import java.io.IOException;
023
024 import gate.creole.annic.apache.lucene.store.Directory;
025 import gate.creole.annic.apache.lucene.store.OutputStream;
026 import gate.creole.annic.apache.lucene.store.RAMOutputStream;
027
/**
 * The SegmentMerger class combines two or more Segments, each represented by
 * an IndexReader ({@link #add}), into a single Segment. After adding the
 * appropriate readers, call the {@link #merge} method to combine the
 * segments.
 *<P>
 * If the compoundFile flag is set, then the segments will be merged into a compound file.
 *
 * @see #merge
 * @see #add
 */
039 final class SegmentMerger {
040 private boolean useCompoundFile;
041 private Directory directory;
042 private String segment;
043
044 private Vector readers = new Vector();
045 private FieldInfos fieldInfos;
046
047 // File extensions of old-style index files
048 private static final String COMPOUND_EXTENSIONS[] = new String[] {
049 "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
050 };
051 private static final String VECTOR_EXTENSIONS[] = new String[] {
052 "tvx", "tvd", "tvf"
053 };
054
055 /**
056 *
057 * @param dir The Directory to merge the other segments into
058 * @param name The name of the new segment
059 * @param compoundFile true if the new segment should use a compoundFile
060 */
061 SegmentMerger(Directory dir, String name, boolean compoundFile) {
062 directory = dir;
063 segment = name;
064 useCompoundFile = compoundFile;
065 }
066
067 /**
068 * Add an IndexReader to the collection of readers that are to be merged
069 * @param reader
070 */
071 final void add(IndexReader reader) {
072 readers.addElement(reader);
073 }
074
075 /**
076 *
077 * @param i The index of the reader to return
078 * @return The ith reader to be merged
079 */
080 final IndexReader segmentReader(int i) {
081 return (IndexReader) readers.elementAt(i);
082 }
083
084 /**
085 * Merges the readers specified by the {@link #add} method into the directory passed to the constructor
086 * @return The number of documents that were merged
087 * @throws IOException
088 */
089 final int merge() throws IOException {
090 int value;
091
092 value = mergeFields();
093 mergeTerms();
094 mergeNorms();
095
096 if (fieldInfos.hasVectors())
097 mergeVectors();
098
099 if (useCompoundFile)
100 createCompoundFile();
101
102 return value;
103 }
104
105 /**
106 * close all IndexReaders that have been added.
107 * Should not be called before merge().
108 * @throws IOException
109 */
110 final void closeReaders() throws IOException {
111 for (int i = 0; i < readers.size(); i++) { // close readers
112 IndexReader reader = (IndexReader) readers.elementAt(i);
113 reader.close();
114 }
115 }
116
117 private final void createCompoundFile()
118 throws IOException {
119 CompoundFileWriter cfsWriter =
120 new CompoundFileWriter(directory, segment + ".cfs");
121
122 ArrayList files =
123 new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size());
124
125 // Basic files
126 for (int i = 0; i < COMPOUND_EXTENSIONS.length; i++) {
127 files.add(segment + "." + COMPOUND_EXTENSIONS[i]);
128 }
129
130 // Field norm files
131 for (int i = 0; i < fieldInfos.size(); i++) {
132 FieldInfo fi = fieldInfos.fieldInfo(i);
133 if (fi.isIndexed) {
134 files.add(segment + ".f" + i);
135 }
136 }
137
138 // Vector files
139 if (fieldInfos.hasVectors()) {
140 for (int i = 0; i < VECTOR_EXTENSIONS.length; i++) {
141 files.add(segment + "." + VECTOR_EXTENSIONS[i]);
142 }
143 }
144
145 // Now merge all added files
146 Iterator it = files.iterator();
147 while (it.hasNext()) {
148 cfsWriter.addFile((String) it.next());
149 }
150
151 // Perform the merge
152 cfsWriter.close();
153
154 // Now delete the source files
155 it = files.iterator();
156 while (it.hasNext()) {
157 directory.deleteFile((String) it.next());
158 }
159 }
160
161 /**
162 *
163 * @return The number of documents in all of the readers
164 * @throws IOException
165 */
166 private final int mergeFields() throws IOException {
167 fieldInfos = new FieldInfos(); // merge field names
168 int docCount = 0;
169 for (int i = 0; i < readers.size(); i++) {
170 IndexReader reader = (IndexReader) readers.elementAt(i);
171 fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true);
172 fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false);
173 fieldInfos.add(reader.getFieldNames(false), false);
174 }
175 fieldInfos.write(directory, segment + ".fnm");
176
177 FieldsWriter fieldsWriter = // merge field values
178 new FieldsWriter(directory, segment, fieldInfos);
179 try {
180 for (int i = 0; i < readers.size(); i++) {
181 IndexReader reader = (IndexReader) readers.elementAt(i);
182 int maxDoc = reader.maxDoc();
183 for (int j = 0; j < maxDoc; j++)
184 if (!reader.isDeleted(j)) { // skip deleted docs
185 fieldsWriter.addDocument(reader.document(j));
186 docCount++;
187 }
188 }
189 } finally {
190 fieldsWriter.close();
191 }
192 return docCount;
193 }
194
195 /**
196 * Merge the TermVectors from each of the segments into the new one.
197 * @throws IOException
198 */
199 private final void mergeVectors() throws IOException {
200 TermVectorsWriter termVectorsWriter =
201 new TermVectorsWriter(directory, segment, fieldInfos);
202
203 try {
204 for (int r = 0; r < readers.size(); r++) {
205 IndexReader reader = (IndexReader) readers.elementAt(r);
206 int maxDoc = reader.maxDoc();
207 for (int docNum = 0; docNum < maxDoc; docNum++) {
208 // skip deleted docs
209 if (reader.isDeleted(docNum)) {
210 continue;
211 }
212 termVectorsWriter.openDocument();
213
214 // get all term vectors
215 TermFreqVector[] sourceTermVector =
216 reader.getTermFreqVectors(docNum);
217
218 if (sourceTermVector != null) {
219 for (int f = 0; f < sourceTermVector.length; f++) {
220 // translate field numbers
221 TermFreqVector termVector = sourceTermVector[f];
222 termVectorsWriter.openField(termVector.getField());
223 String [] terms = termVector.getTerms();
224 int [] freqs = termVector.getTermFrequencies();
225
226 for (int t = 0; t < terms.length; t++) {
227 termVectorsWriter.addTerm(terms[t], freqs[t]);
228 }
229 }
230 termVectorsWriter.closeDocument();
231 }
232 }
233 }
234 } finally {
235 termVectorsWriter.close();
236 }
237 }
238
239 private OutputStream freqOutput = null;
240 private OutputStream proxOutput = null;
241 private TermInfosWriter termInfosWriter = null;
242 private int skipInterval;
243 private SegmentMergeQueue queue = null;
244
245 private final void mergeTerms() throws IOException {
246 try {
247 freqOutput = directory.createFile(segment + ".frq");
248 proxOutput = directory.createFile(segment + ".prx");
249 termInfosWriter =
250 new TermInfosWriter(directory, segment, fieldInfos);
251 skipInterval = termInfosWriter.skipInterval;
252 queue = new SegmentMergeQueue(readers.size());
253
254 mergeTermInfos();
255
256 } finally {
257 if (freqOutput != null) freqOutput.close();
258 if (proxOutput != null) proxOutput.close();
259 if (termInfosWriter != null) termInfosWriter.close();
260 if (queue != null) queue.close();
261 }
262 }
263
264 private final void mergeTermInfos() throws IOException {
265 int base = 0;
266 for (int i = 0; i < readers.size(); i++) {
267 IndexReader reader = (IndexReader) readers.elementAt(i);
268 TermEnum termEnum = reader.terms();
269 SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
270 base += reader.numDocs();
271 if (smi.next())
272 queue.put(smi); // initialize queue
273 else
274 smi.close();
275 }
276
277 SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
278
279 while (queue.size() > 0) {
280 int matchSize = 0; // pop matching terms
281 match[matchSize++] = (SegmentMergeInfo) queue.pop();
282 Term term = match[0].term;
283 SegmentMergeInfo top = (SegmentMergeInfo) queue.top();
284
285 while (top != null && term.compareTo(top.term) == 0) {
286 match[matchSize++] = (SegmentMergeInfo) queue.pop();
287 top = (SegmentMergeInfo) queue.top();
288 }
289
290 mergeTermInfo(match, matchSize); // add new TermInfo
291
292 while (matchSize > 0) {
293 SegmentMergeInfo smi = match[--matchSize];
294 if (smi.next())
295 queue.put(smi); // restore queue
296 else
297 smi.close(); // done with a segment
298 }
299 }
300 }
301
302 private final TermInfo termInfo = new TermInfo(); // minimize consing
303
304 /** Merge one term found in one or more segments. The array <code>smis</code>
305 * contains segments that are positioned at the same term. <code>N</code>
306 * is the number of cells in the array actually occupied.
307 *
308 * @param smis array of segments
309 * @param n number of cells in the array actually occupied
310 */
311 private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
312 throws IOException {
313 long freqPointer = freqOutput.getFilePointer();
314 long proxPointer = proxOutput.getFilePointer();
315
316 int df = appendPostings(smis, n); // append posting data
317
318 long skipPointer = writeSkip();
319
320 if (df > 0) {
321 // add an entry to the dictionary with pointers to prox and freq files
322 termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
323 termInfosWriter.add(smis[0].term, termInfo);
324 }
325 }
326
327 /** Process postings from multiple segments all positioned on the
328 * same term. Writes out merged entries into freqOutput and
329 * the proxOutput streams.
330 *
331 * @param smis array of segments
332 * @param n number of cells in the array actually occupied
333 * @return number of documents across all segments where this term was found
334 */
335 private final int appendPostings(SegmentMergeInfo[] smis, int n)
336 throws IOException {
337 int lastDoc = 0;
338 int df = 0; // number of docs w/ term
339 resetSkip();
340 for (int i = 0; i < n; i++) {
341 SegmentMergeInfo smi = smis[i];
342 TermPositions postings = smi.postings;
343 int base = smi.base;
344 int[] docMap = smi.docMap;
345 postings.seek(smi.termEnum);
346 while (postings.next()) {
347 int doc = postings.doc();
348 if (docMap != null)
349 doc = docMap[doc]; // map around deletions
350 doc += base; // convert to merged space
351
352 if (doc < lastDoc)
353 throw new IllegalStateException("docs out of order");
354
355 df++;
356
357 if ((df % skipInterval) == 0) {
358 bufferSkip(lastDoc);
359 }
360
361 int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
362 lastDoc = doc;
363
364 int freq = postings.freq();
365 if (freq == 1) {
366 freqOutput.writeVInt(docCode | 1); // write doc & freq=1
367 } else {
368 freqOutput.writeVInt(docCode); // write doc
369 freqOutput.writeVInt(freq); // write frequency in doc
370 }
371
372 int lastPosition = 0; // write position deltas
373 for (int j = 0; j < freq; j++) {
374 int position = postings.nextPosition();
375 proxOutput.writeVInt(position - lastPosition);
376 lastPosition = position;
377 }
378 }
379 }
380 return df;
381 }
382
383 private RAMOutputStream skipBuffer = new RAMOutputStream();
384 private int lastSkipDoc;
385 private long lastSkipFreqPointer;
386 private long lastSkipProxPointer;
387
388 private void resetSkip() throws IOException {
389 skipBuffer.reset();
390 lastSkipDoc = 0;
391 lastSkipFreqPointer = freqOutput.getFilePointer();
392 lastSkipProxPointer = proxOutput.getFilePointer();
393 }
394
395 private void bufferSkip(int doc) throws IOException {
396 long freqPointer = freqOutput.getFilePointer();
397 long proxPointer = proxOutput.getFilePointer();
398
399 skipBuffer.writeVInt(doc - lastSkipDoc);
400 skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
401 skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
402
403 lastSkipDoc = doc;
404 lastSkipFreqPointer = freqPointer;
405 lastSkipProxPointer = proxPointer;
406 }
407
408 private long writeSkip() throws IOException {
409 long skipPointer = freqOutput.getFilePointer();
410 skipBuffer.writeTo(freqOutput);
411 return skipPointer;
412 }
413
414 private void mergeNorms() throws IOException {
415 for (int i = 0; i < fieldInfos.size(); i++) {
416 FieldInfo fi = fieldInfos.fieldInfo(i);
417 if (fi.isIndexed) {
418 OutputStream output = directory.createFile(segment + ".f" + i);
419 try {
420 for (int j = 0; j < readers.size(); j++) {
421 IndexReader reader = (IndexReader) readers.elementAt(j);
422 byte[] input = reader.norms(fi.name);
423 int maxDoc = reader.maxDoc();
424 for (int k = 0; k < maxDoc; k++) {
425 byte norm = input != null ? input[k] : (byte) 0;
426 if (!reader.isDeleted(k)) {
427 output.writeByte(norm);
428 }
429 }
430 }
431 } finally {
432 output.close();
433 }
434 }
435 }
436 }
437
438 }
|