001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019
020 import java.io.IOException;
021 import gate.creole.annic.apache.lucene.store.OutputStream;
022 import gate.creole.annic.apache.lucene.store.Directory;
023 import gate.creole.annic.apache.lucene.util.StringHelper;
024
025 /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
026 Directory. A TermInfos can be written once, in order. */
027
028 final class TermInfosWriter {
029 /** The file format version, a negative number. */
030 public static final int FORMAT = -2;
031
032 private FieldInfos fieldInfos;
033 private OutputStream output;
034 private Term lastTerm = new Term("", "", ""/*, 0*/);
035 private TermInfo lastTi = new TermInfo();
036 private long size = 0;
037
038 // TODO: the default values for these two parameters should be settable from
039 // IndexWriter. However, once that's done, folks will start setting them to
040 // ridiculous values and complaining that things don't work well, as with
041 // mergeFactor. So, let's wait until a number of folks find that alternate
042 // values work better. Note that both of these values are stored in the
043 // segment, so that it's safe to change these w/o rebuilding all indexes.
044
045 /** Expert: The fraction of terms in the "dictionary" which should be stored
046 * in RAM. Smaller values use more memory, but make searching slightly
047 * faster, while larger values use less memory and make searching slightly
048 * slower. Searching is typically not dominated by dictionary lookup, so
049 * tweaking this is rarely useful.*/
050 int indexInterval = 128;
051
052 /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
053 * used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in
054 * smaller indexes, greater acceleration, but fewer accelerable cases, while
055 * smaller values result in bigger indexes, less acceleration and more
056 * accelerable cases. More detailed experiments would be useful here. */
057 int skipInterval = 16;
058
059 private long lastIndexPointer = 0;
060 private boolean isIndex = false;
061
062 private TermInfosWriter other = null;
063
064 TermInfosWriter(Directory directory, String segment, FieldInfos fis)
065 throws IOException {
066 initialize(directory, segment, fis, false);
067 other = new TermInfosWriter(directory, segment, fis, true);
068 other.other = this;
069 }
070
071 private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
072 boolean isIndex) throws IOException {
073 initialize(directory, segment, fis, isIndex);
074 }
075
076 private void initialize(Directory directory, String segment, FieldInfos fis,
077 boolean isi) throws IOException {
078 fieldInfos = fis;
079 isIndex = isi;
080 output = directory.createFile(segment + (isIndex ? ".tii" : ".tis"));
081 output.writeInt(FORMAT); // write format
082 output.writeLong(0); // leave space for size
083 output.writeInt(indexInterval); // write indexInterval
084 output.writeInt(skipInterval); // write skipInterval
085 }
086
087 /** Adds a new <Term, TermInfo> pair to the set.
088 Term must be lexicographically greater than all previous Terms added.
089 TermInfo pointers must be positive and greater than all previous.*/
090 final void add(Term term, TermInfo ti)
091 throws IOException {
092 int compareResult = term.compareTo(lastTerm);
093 if (!isIndex && compareResult <= 0) {
094 throw new IOException("term out of order");
095 }
096 if (ti.freqPointer < lastTi.freqPointer)
097 throw new IOException("freqPointer out of order");
098 if (ti.proxPointer < lastTi.proxPointer)
099 throw new IOException("proxPointer out of order");
100
101 if (!isIndex && size % indexInterval == 0)
102 other.add(lastTerm, lastTi); // add an index term
103
104 writeTerm(term); // write term
105 output.writeVInt(ti.docFreq); // write doc freq
106 output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
107 output.writeVLong(ti.proxPointer - lastTi.proxPointer);
108
109 if (ti.docFreq >= skipInterval) {
110 output.writeVInt(ti.skipOffset);
111 }
112
113 if (isIndex) {
114 output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
115 lastIndexPointer = other.output.getFilePointer(); // write pointer
116 }
117
118 lastTi.set(ti);
119 size++;
120 }
121
122 private final void writeTerm(Term term)
123 throws IOException {
124 //int start = StringHelper.stringDifference(lastTerm.text, term.text);
125 //int length = term.text.length() - start;
126 //int start = 0;
127 int length = term.text.length();
128
129 //output.writeVInt(start); // write shared prefix length
130 output.writeVInt(length); // write delta length
131 output.writeChars(term.text, /*start*/0, length); // write delta chars
132 /* Niraj */
133 if(term.type == null)
134 term.type = "word";
135
136 //start = StringHelper.stringDifference(lastTerm.type, term.type);
137 length = term.type.length();
138 output.writeVInt(length);
139 output.writeChars(term.type, 0, length);
140 /*output.writeVInt(term.position);*/
141 /* End*/
142 output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num
143 //System.out.println("Term Written");
144 lastTerm = term;
145 }
146
147
148
149 /** Called to complete TermInfos creation. */
150 final void close() throws IOException {
151 output.seek(4); // write size after format
152 output.writeLong(size);
153 output.close();
154
155 if (!isIndex)
156 other.close();
157 }
158
159 }
|