001 package gate.creole.annic.apache.lucene.index;
002
003 import gate.creole.annic.apache.lucene.store.Directory;
004 import gate.creole.annic.apache.lucene.store.OutputStream;
005 import gate.creole.annic.apache.lucene.util.StringHelper;
006
007 import java.io.IOException;
008 import java.util.Vector;
009
010 /**
011 * Writer works by opening a document and then opening the fields within the document and then
012 * writing out the vectors for each field.
013 *
014 * Rough usage:
015 *
016 <CODE>
017 for each document
018 {
019 writer.openDocument();
020 for each field on the document
021 {
022 writer.openField(field);
023 for all of the terms
024 {
025 writer.addTerm(...)
026 }
027 writer.closeField
028 }
029 writer.closeDocument()
030 }
031 </CODE>
032 */
033 final class TermVectorsWriter {
034 public static final int FORMAT_VERSION = 1;
035 //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
036 public static final int FORMAT_SIZE = 4;
037
038 //TODO: Figure out how to write with or w/o position information and read back in
039 public static final String TVX_EXTENSION = ".tvx";
040 public static final String TVD_EXTENSION = ".tvd";
041 public static final String TVF_EXTENSION = ".tvf";
042 private OutputStream tvx = null, tvd = null, tvf = null;
043 private Vector fields = null;
044 private Vector terms = null;
045 private FieldInfos fieldInfos;
046
047 private TVField currentField = null;
048 private long currentDocPointer = -1;
049
050 /** Create term vectors writer for the specified segment in specified
051 * directory. A new TermVectorsWriter should be created for each
052 * segment. The parameter <code>maxFields</code> indicates how many total
053 * fields are found in this document. Not all of these fields may require
054 * termvectors to be stored, so the number of calls to
055 * <code>openField</code> is less or equal to this number.
056 */
057 public TermVectorsWriter(Directory directory, String segment,
058 FieldInfos fieldInfos)
059 throws IOException {
060 // Open files for TermVector storage
061 tvx = directory.createFile(segment + TVX_EXTENSION);
062 tvx.writeInt(FORMAT_VERSION);
063 tvd = directory.createFile(segment + TVD_EXTENSION);
064 tvd.writeInt(FORMAT_VERSION);
065 tvf = directory.createFile(segment + TVF_EXTENSION);
066 tvf.writeInt(FORMAT_VERSION);
067
068 this.fieldInfos = fieldInfos;
069 fields = new Vector(fieldInfos.size());
070 terms = new Vector();
071 }
072
073
074 public final void openDocument()
075 throws IOException {
076 closeDocument();
077
078 currentDocPointer = tvd.getFilePointer();
079 }
080
081
082 public final void closeDocument()
083 throws IOException {
084 if (isDocumentOpen()) {
085 closeField();
086 writeDoc();
087 fields.clear();
088 currentDocPointer = -1;
089 }
090 }
091
092
093 public final boolean isDocumentOpen() {
094 return currentDocPointer != -1;
095 }
096
097
098 /** Start processing a field. This can be followed by a number of calls to
099 * addTerm, and a final call to closeField to indicate the end of
100 * processing of this field. If a field was previously open, it is
101 * closed automatically.
102 */
103 public final void openField(String field)
104 throws IOException {
105 if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");
106
107 closeField();
108 currentField = new TVField(fieldInfos.fieldNumber(field));
109 }
110
111 /** Finished processing current field. This should be followed by a call to
112 * openField before future calls to addTerm.
113 */
114 public final void closeField()
115 throws IOException {
116 if (isFieldOpen()) {
117 /* DEBUG */
118 //System.out.println("closeField()");
119 /* DEBUG */
120
121 // save field and terms
122 writeField();
123 fields.add(currentField);
124 terms.clear();
125 currentField = null;
126 }
127 }
128
129 /** Return true if a field is currently open. */
130 public final boolean isFieldOpen() {
131 return currentField != null;
132 }
133
134 /** Add term to the field's term vector. Field must already be open
135 * of NullPointerException is thrown. Terms should be added in
136 * increasing order of terms, one call per unique termNum. ProxPointer
137 * is a pointer into the TermPosition file (prx). Freq is the number of
138 * times this term appears in this field, in this document.
139 */
140 public final void addTerm(String termText, int freq) {
141 if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
142 if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");
143
144 addTermInternal(termText, freq);
145 }
146
147 private final void addTermInternal(String termText, int freq) {
148 currentField.length += freq;
149 TVTerm term = new TVTerm();
150 term.termText = termText;
151 term.freq = freq;
152 terms.add(term);
153 }
154
155
156 /** Add specified vectors to the document.
157 */
158 public final void addVectors(TermFreqVector[] vectors)
159 throws IOException {
160 if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
161 if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");
162
163 for (int i = 0; i < vectors.length; i++) {
164 addTermFreqVector(vectors[i]);
165 }
166 }
167
168
169 /** Add specified vector to the document. Document must be open but no field
170 * should be open or exception is thrown. The same document can have <code>addTerm</code>
171 * and <code>addVectors</code> calls mixed, however a given field must either be
172 * populated with <code>addTerm</code> or with <code>addVector</code>. *
173 */
174 public final void addTermFreqVector(TermFreqVector vector)
175 throws IOException {
176 if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
177 if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
178 addTermFreqVectorInternal(vector);
179 }
180
181 private final void addTermFreqVectorInternal(TermFreqVector vector)
182 throws IOException {
183 openField(vector.getField());
184 for (int i = 0; i < vector.size(); i++) {
185 addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
186 }
187 closeField();
188 }
189
190
191
192
193 /** Close all streams. */
194 final void close() throws IOException {
195 try {
196 closeDocument();
197 } finally {
198 // make an effort to close all streams we can but remember and re-throw
199 // the first exception encountered in this process
200 IOException keep = null;
201 if (tvx != null)
202 try {
203 tvx.close();
204 } catch (IOException e) {
205 if (keep == null) keep = e;
206 }
207 if (tvd != null)
208 try {
209 tvd.close();
210 } catch (IOException e) {
211 if (keep == null) keep = e;
212 }
213 if (tvf != null)
214 try {
215 tvf.close();
216 } catch (IOException e) {
217 if (keep == null) keep = e;
218 }
219 if (keep != null) throw (IOException) keep.fillInStackTrace();
220 }
221 }
222
223
224
225 private void writeField() throws IOException {
226 // remember where this field is written
227 currentField.tvfPointer = tvf.getFilePointer();
228 //System.out.println("Field Pointer: " + currentField.tvfPointer);
229 final int size;
230
231 tvf.writeVInt(size = terms.size());
232 tvf.writeVInt(currentField.length - size);
233 String lastTermText = "";
234 // write term ids and positions
235 for (int i = 0; i < size; i++) {
236 TVTerm term = (TVTerm) terms.elementAt(i);
237 //tvf.writeString(term.termText);
238 int start = StringHelper.stringDifference(lastTermText, term.termText);
239 int length = term.termText.length() - start;
240 tvf.writeVInt(start); // write shared prefix length
241 tvf.writeVInt(length); // write delta length
242 tvf.writeChars(term.termText, start, length); // write delta chars
243 tvf.writeVInt(term.freq);
244 lastTermText = term.termText;
245 }
246 }
247
248
249
250
251 private void writeDoc() throws IOException {
252 if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");
253 //System.out.println("Writing doc pointer: " + currentDocPointer);
254 // write document index record
255 tvx.writeLong(currentDocPointer);
256
257 // write document data record
258 final int size;
259
260 // write the number of fields
261 tvd.writeVInt(size = fields.size());
262
263 // write field numbers
264 int lastFieldNumber = 0;
265 for (int i = 0; i < size; i++) {
266 TVField field = (TVField) fields.elementAt(i);
267 tvd.writeVInt(field.number - lastFieldNumber);
268
269 lastFieldNumber = field.number;
270 }
271
272 // write field pointers
273 long lastFieldPointer = 0;
274 for (int i = 0; i < size; i++) {
275 TVField field = (TVField) fields.elementAt(i);
276 tvd.writeVLong(field.tvfPointer - lastFieldPointer);
277
278 lastFieldPointer = field.tvfPointer;
279 }
280 //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
281 }
282
283
284 private static class TVField {
285 int number;
286 long tvfPointer = 0;
287 int length = 0; // number of distinct term positions
288
289 TVField(int number) {
290 this.number = number;
291 }
292 }
293
294 private static class TVTerm {
295 String termText;
296 /* Niraj */
297 String type;
298 int position;
299 /* End */
300 int freq = 0;
301 //int positions[] = null;
302 }
303
304
305 }
|