001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020 import java.util.Collection;
021 import java.util.Enumeration;
022 import java.util.HashSet;
023 import java.util.Hashtable;
024 import java.util.Set;
025 import java.util.Vector;
026
027 import gate.creole.annic.apache.lucene.document.Document;
028 import gate.creole.annic.apache.lucene.store.InputStream;
029 import gate.creole.annic.apache.lucene.store.OutputStream;
030 import gate.creole.annic.apache.lucene.store.Lock;
031 import gate.creole.annic.apache.lucene.store.Directory;
032 import gate.creole.annic.apache.lucene.util.BitVector;
033
/**
 * An {@link IndexReader} over a single on-disk index segment, providing
 * access to its stored fields, term dictionary, postings, norms, deletions
 * and term vectors.
 *
 * @version $Id: SegmentReader.java 529 2004-10-05 11:55:26Z niraj $
 */
039 final class SegmentReader extends IndexReader {
  // Name of the segment this reader serves; used to build all file names.
  private String segment;

  FieldInfos fieldInfos;            // per-field metadata (names, flags)
  private FieldsReader fieldsReader; // stored-field access; also defines maxDoc()

  TermInfosReader tis;               // term dictionary
  TermVectorsReader termVectorsReader; // non-null only when fieldInfos.hasVectors()

  // Deletion state: bit vector of deleted doc numbers (null = no deletions),
  // plus dirty flags consumed by doCommit().
  BitVector deletedDocs = null;
  private boolean deletedDocsDirty = false;
  private boolean normsDirty = false;
  private boolean undeleteAll = false;

  InputStream freqStream; // ".frq" postings stream, kept open for the reader's lifetime
  InputStream proxStream; // ".prx" positions stream, kept open for the reader's lifetime

  // Compound File Reader when based on a compound file segment
  CompoundFileReader cfsReader;
058
  /**
   * Per-field norm state: the open stream on the field's norm file, a
   * lazily-loaded byte cache, and a dirty flag that triggers a re-write
   * on commit.
   */
  private class Norm {
    public Norm(InputStream in, int number)
    {
      this.in = in;
      this.number = number;
    }

    private InputStream in;   // open stream on the "<segment>.f<number>" file
    private byte[] bytes;     // cached norm bytes; null until first read
    private boolean dirty;    // true when bytes differ from what is on disk
    private int number;       // field number; names the norm file

    /**
     * Writes the cached norm bytes to a ".tmp" file, renames it over
     * "<segment>.f<number>", then clears the dirty flag.
     */
    private void reWrite() throws IOException {
      // NOTE: norms are re-written in regular directory, not cfs
      OutputStream out = directory().createFile(segment + ".tmp");
      try {
        out.writeBytes(bytes, maxDoc());
      } finally {
        out.close();
      }
      String fileName = segment + ".f" + number;
      directory().renameFile(segment + ".tmp", fileName);
      this.dirty = false;
    }
  }
084
085 private Hashtable norms = new Hashtable();
086
  /**
   * Opens the segment described by si, handing the owning segment table and
   * the closeDir flag to the IndexReader superclass.
   */
  SegmentReader(SegmentInfos sis, SegmentInfo si, boolean closeDir)
    throws IOException {
    super(si.dir, sis, closeDir);
    initialize(si);
  }
092
  /** Opens the segment described by si. */
  SegmentReader(SegmentInfo si) throws IOException {
    super(si.dir);
    initialize(si);
  }
097
  /**
   * Opens all per-segment files: field infos, stored fields, term
   * dictionary, deletions, postings streams, norms and (when present)
   * term vectors.
   */
  private void initialize(SegmentInfo si) throws IOException
  {
    segment = si.name;

    // Use compound file directory for some files, if it exists
    Directory cfsDir = directory();
    if (directory().fileExists(segment + ".cfs")) {
      cfsReader = new CompoundFileReader(directory(), segment + ".cfs");
      cfsDir = cfsReader;
    }

    // cfsDir is either the compound-file reader or the plain directory;
    // the same code path serves both formats from here on.
    fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
    fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos);

    tis = new TermInfosReader(cfsDir, segment, fieldInfos);

    // NOTE: the bitvector is stored using the regular directory, not cfs
    if (hasDeletions(si))
      deletedDocs = new BitVector(directory(), segment + ".del");

    // make sure that all index files have been read or are kept open
    // so that if an index update removes them we'll still have them
    freqStream = cfsDir.openFile(segment + ".frq");
    proxStream = cfsDir.openFile(segment + ".prx");
    openNorms(cfsDir);

    if (fieldInfos.hasVectors()) { // open term vector files only as needed
      termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos);
    }
  }
129
130 protected final void doCommit() throws IOException {
131 if (deletedDocsDirty) { // re-write deleted
132 deletedDocs.write(directory(), segment + ".tmp");
133 directory().renameFile(segment + ".tmp", segment + ".del");
134 }
135 if(undeleteAll && directory().fileExists(segment + ".del")){
136 directory().deleteFile(segment + ".del");
137 }
138 if (normsDirty) { // re-write norms
139 Enumeration values = norms.elements();
140 while (values.hasMoreElements()) {
141 Norm norm = (Norm) values.nextElement();
142 if (norm.dirty) {
143 norm.reWrite();
144 }
145 }
146 }
147 deletedDocsDirty = false;
148 normsDirty = false;
149 undeleteAll = false;
150 }
151
  /**
   * Releases everything opened by initialize(): stored-fields reader, term
   * dictionary, postings streams, norms, term vectors and, for compound
   * segments, the compound file reader.
   */
  protected final void doClose() throws IOException {
    fieldsReader.close();
    tis.close();

    // Null checks guard against streams that were never opened.
    if (freqStream != null)
      freqStream.close();
    if (proxStream != null)
      proxStream.close();

    closeNorms();
    if (termVectorsReader != null) termVectorsReader.close();

    if (cfsReader != null)
      cfsReader.close();
  }
167
168 static final boolean hasDeletions(SegmentInfo si) throws IOException {
169 return si.dir.fileExists(si.name + ".del");
170 }
171
172 public boolean hasDeletions() {
173 return deletedDocs != null;
174 }
175
176
177 static final boolean usesCompoundFile(SegmentInfo si) throws IOException {
178 return si.dir.fileExists(si.name + ".cfs");
179 }
180
181 static final boolean hasSeparateNorms(SegmentInfo si) throws IOException {
182 String[] result = si.dir.list();
183 String pattern = si.name + ".f";
184 int patternLength = pattern.length();
185 for(int i = 0; i < 0; i++){
186 if(result[i].startsWith(pattern) && Character.isDigit(result[i].charAt(patternLength)))
187 return true;
188 }
189 return false;
190 }
191
192 protected final void doDelete(int docNum) throws IOException {
193 if (deletedDocs == null)
194 deletedDocs = new BitVector(maxDoc());
195 deletedDocsDirty = true;
196 undeleteAll = false;
197 deletedDocs.set(docNum);
198 }
199
200 protected final void doUndeleteAll() throws IOException {
201 deletedDocs = null;
202 deletedDocsDirty = false;
203 undeleteAll = true;
204 }
205
206 final Vector files() throws IOException {
207 Vector files = new Vector(16);
208 final String ext[] = new String[]{
209 "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
210 "tvx", "tvd", "tvf", "tvp" };
211
212 for (int i = 0; i < ext.length; i++) {
213 String name = segment + "." + ext[i];
214 if (directory().fileExists(name))
215 files.addElement(name);
216 }
217
218 for (int i = 0; i < fieldInfos.size(); i++) {
219 FieldInfo fi = fieldInfos.fieldInfo(i);
220 if (fi.isIndexed)
221 files.addElement(segment + ".f" + i);
222 }
223 return files;
224 }
225
226 public final TermEnum terms() throws IOException {
227 return tis.terms();
228 }
229
230 public final TermEnum terms(Term t) throws IOException {
231 return tis.terms(t);
232 }
233
234 public final synchronized Document document(int n) throws IOException {
235 if (isDeleted(n))
236 throw new IllegalArgumentException
237 ("attempt to access a deleted document");
238 return fieldsReader.doc(n);
239 }
240
241 public final synchronized boolean isDeleted(int n) {
242 return (deletedDocs != null && deletedDocs.get(n));
243 }
244
245 public final TermDocs termDocs() throws IOException {
246 return new SegmentTermDocs(this);
247 }
248
249 public final TermPositions termPositions() throws IOException {
250 return new SegmentTermPositions(this);
251 }
252
253 public final int docFreq(Term t) throws IOException {
254 TermInfo ti = tis.get(t);
255 if (ti != null)
256 return ti.docFreq;
257 else
258 return 0;
259 }
260
261 public final int numDocs() {
262 int n = maxDoc();
263 if (deletedDocs != null)
264 n -= deletedDocs.count();
265 return n;
266 }
267
268 public final int maxDoc() {
269 return fieldsReader.size();
270 }
271
272 /**
273 * @see IndexReader#getFieldNames()
274 */
275 public Collection getFieldNames() throws IOException {
276 // maintain a unique set of field names
277 Set fieldSet = new HashSet();
278 for (int i = 0; i < fieldInfos.size(); i++) {
279 FieldInfo fi = fieldInfos.fieldInfo(i);
280 fieldSet.add(fi.name);
281 }
282 return fieldSet;
283 }
284
285 /**
286 * @see IndexReader#getFieldNames(boolean)
287 */
288 public Collection getFieldNames(boolean indexed) throws IOException {
289 // maintain a unique set of field names
290 Set fieldSet = new HashSet();
291 for (int i = 0; i < fieldInfos.size(); i++) {
292 FieldInfo fi = fieldInfos.fieldInfo(i);
293 if (fi.isIndexed == indexed)
294 fieldSet.add(fi.name);
295 }
296 return fieldSet;
297 }
298
299 /**
300 *
301 * @param storedTermVector if true, returns only Indexed fields that have term vector info,
302 * else only indexed fields without term vector info
303 * @return Collection of Strings indicating the names of the fields
304 */
305 public Collection getIndexedFieldNames(boolean storedTermVector) {
306 // maintain a unique set of field names
307 Set fieldSet = new HashSet();
308 for (int i = 0; i < fieldInfos.size(); i++) {
309 FieldInfo fi = fieldInfos.fieldInfo(i);
310 if (fi.isIndexed == true && fi.storeTermVector == storedTermVector){
311 fieldSet.add(fi.name);
312 }
313 }
314 return fieldSet;
315
316 }
317
318 public synchronized byte[] norms(String field) throws IOException {
319 Norm norm = (Norm) norms.get(field);
320 if (norm == null) // not an indexed field
321 return null;
322 if (norm.bytes == null) { // value not yet read
323 byte[] bytes = new byte[maxDoc()];
324 norms(field, bytes, 0);
325 norm.bytes = bytes; // cache it
326 }
327 return norm.bytes;
328 }
329
330 protected final void doSetNorm(int doc, String field, byte value)
331 throws IOException {
332 Norm norm = (Norm) norms.get(field);
333 if (norm == null) // not an indexed field
334 return;
335 norm.dirty = true; // mark it dirty
336 normsDirty = true;
337
338 norms(field)[doc] = value; // set the value
339 }
340
  /**
   * Read norms into a pre-allocated array, starting at offset. For a
   * non-indexed field the array is left untouched (its zeros stand in).
   */
  public synchronized void norms(String field, byte[] bytes, int offset)
    throws IOException {

    Norm norm = (Norm) norms.get(field);
    if (norm == null)
      return; // use zeros in array

    if (norm.bytes != null) { // can copy from cache
      System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
      return;
    }

    // Work on a clone so the shared stream's file position is untouched;
    // the clone is seeked to 0 and closed independently.
    InputStream normStream = (InputStream) norm.in.clone();
    try { // read from disk
      normStream.seek(0);
      normStream.readBytes(bytes, offset, maxDoc());
    } finally {
      normStream.close();
    }
  }
362
  /**
   * Opens one norm stream per indexed field. A separately re-written norm
   * file in the main directory takes precedence over the copy in the
   * compound file.
   */
  private final void openNorms(Directory cfsDir) throws IOException {
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed) {
        String fileName = segment + ".f" + fi.number;
        // look first for re-written file, then in compound format
        Directory d = directory().fileExists(fileName) ? directory() : cfsDir;
        norms.put(fi.name, new Norm(d.openFile(fileName), fi.number));
      }
    }
  }
374
  /** Closes every open norm stream, synchronizing on the norms table. */
  private final void closeNorms() throws IOException {
    synchronized (norms) {
      Enumeration enumerator = norms.elements();
      while (enumerator.hasMoreElements()) {
        Norm norm = (Norm) enumerator.nextElement();
        norm.in.close();
      }
    }
  }
384
385 /** Return a term frequency vector for the specified document and field. The
386 * vector returned contains term numbers and frequencies for all terms in
387 * the specified field of this document, if the field had storeTermVector
388 * flag set. If the flag was not set, the method returns null.
389 */
390 public TermFreqVector getTermFreqVector(int docNumber, String field)
391 throws IOException {
392 // Check if this field is invalid or has no stored term vector
393 FieldInfo fi = fieldInfos.fieldInfo(field);
394 if (fi == null || !fi.storeTermVector) return null;
395
396 return termVectorsReader.get(docNumber, field);
397 }
398
399
400 /** Return an array of term frequency vectors for the specified document.
401 * The array contains a vector for each vectorized field in the document.
402 * Each vector vector contains term numbers and frequencies for all terms
403 * in a given vectorized field.
404 * If no such fields existed, the method returns null.
405 */
406 public TermFreqVector[] getTermFreqVectors(int docNumber)
407 throws IOException {
408 if (termVectorsReader == null)
409 return null;
410
411 return termVectorsReader.get(docNumber);
412 }
413 }
|