001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import java.io.IOException;
020 import java.io.Reader;
021 import java.io.StringReader;
022 import java.util.Hashtable;
023 import java.util.Enumeration;
024 import java.util.Arrays;
025
026 import gate.creole.annic.apache.lucene.document.Document;
027 import gate.creole.annic.apache.lucene.document.Field;
028 import gate.creole.annic.apache.lucene.analysis.Analyzer;
029 import gate.creole.annic.apache.lucene.analysis.TokenStream;
030 import gate.creole.annic.apache.lucene.analysis.Token;
031 import gate.creole.annic.apache.lucene.store.Directory;
032 import gate.creole.annic.apache.lucene.store.OutputStream;
033 import gate.creole.annic.apache.lucene.search.Similarity;
034
035 final class DocumentWriter {
036 private Analyzer analyzer;
037 private Directory directory;
038 private Similarity similarity;
039 private FieldInfos fieldInfos;
040 private int maxFieldLength;
041
042 /**
043 *
044 * @param directory The directory to write the document information to
045 * @param analyzer The analyzer to use for the document
046 * @param similarity The Similarity function
047 * @param maxFieldLength The maximum number of tokens a field may have
048 */
049 DocumentWriter(Directory directory, Analyzer analyzer,
050 Similarity similarity, int maxFieldLength) {
051 this.directory = directory;
052 this.analyzer = analyzer;
053 this.similarity = similarity;
054 this.maxFieldLength = maxFieldLength;
055 }
056
057 final void addDocument(String segment, Document doc)
058 throws IOException {
059 // write field names
060 fieldInfos = new FieldInfos();
061 fieldInfos.add(doc);
062 fieldInfos.write(directory, segment + ".fnm");
063
064 // write field values
065 FieldsWriter fieldsWriter =
066 new FieldsWriter(directory, segment, fieldInfos);
067 try {
068 fieldsWriter.addDocument(doc);
069 } finally {
070 fieldsWriter.close();
071 }
072
073 // invert doc into postingTable
074 postingTable.clear(); // clear postingTable
075 fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
076 fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
077
078 fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
079 Arrays.fill(fieldBoosts, doc.getBoost());
080
081 invertDocument(doc);
082
083 // sort postingTable into an array
084 Posting[] postings = sortPostingTable();
085
086 /*
087 for (int i = 0; i < postings.length; i++) {
088 Posting posting = postings[i];
089 System.out.print(posting.term);
090 System.out.print(" freq=" + posting.freq);
091 System.out.print(" pos=");
092 System.out.print(posting.positions[0]);
093 for (int j = 1; j < posting.freq; j++)
094 System.out.print("," + posting.positions[j]);
095 System.out.println("");
096 }
097 */
098
099 // write postings
100 writePostings(postings, segment);
101
102 // write norms of indexed fields
103 writeNorms(doc, segment);
104
105 }
106
107 // Keys are Terms, values are Postings.
108 // Used to buffer a document before it is written to the index.
109 private final Hashtable postingTable = new Hashtable();
110 private int[] fieldLengths;
111 private int[] fieldPositions;
112 private float[] fieldBoosts;
113
114 // Tokenizes the fields of a document into Postings.
115 private final void invertDocument(Document doc)
116 throws IOException {
117 Enumeration fields = doc.fields();
118 while (fields.hasMoreElements()) {
119 Field field = (Field) fields.nextElement();
120 String fieldName = field.name();
121 int fieldNumber = fieldInfos.fieldNumber(fieldName);
122
123 int length = fieldLengths[fieldNumber]; // length of field
124 int position = fieldPositions[fieldNumber]; // position in field
125
126 if (field.isIndexed()) {
127 if (!field.isTokenized()) { // un-tokenized field
128 addPosition(fieldName, field.stringValue(), "Field"/*, 1*/, position++);
129 length++;
130 } else {
131 Reader reader; // find or make Reader
132 if (field.readerValue() != null)
133 reader = field.readerValue();
134 else if (field.stringValue() != null)
135 reader = new StringReader(field.stringValue());
136 else
137 throw new IllegalArgumentException
138 ("field must have either String or Reader value");
139
140 // Tokenize field and add to postingTable
141 TokenStream stream = analyzer.tokenStream(fieldName, reader);
142 try {
143 for (Token t = stream.next(); t != null; t = stream.next()) {
144 position += (t.getPositionIncrement() - 1);
145 if(t.type() == null)
146 addPosition(fieldName, t.termText(), "*"/*, t.getPositionIncrement()*/, position++);
147 else
148 addPosition(fieldName, t.termText(), t.type()/*, t.getPositionIncrement()*/, position++);
149 if (++length > maxFieldLength) break;
150 }
151 } finally {
152 stream.close();
153 }
154 }
155
156 fieldLengths[fieldNumber] = length; // save field length
157 fieldPositions[fieldNumber] = position; // save field position
158 fieldBoosts[fieldNumber] *= field.getBoost();
159 }
160 }
161 }
162
163 private final Term termBuffer = new Term("", "", "")/*, 0)*/; // avoid consing
164
165 private final void addPosition(String field, String text, String type/*, int posIncrement*/, int position) {
166
167 termBuffer.set(field, text, type/*, posIncrement*/);
168
169 Posting ti = (Posting) postingTable.get(termBuffer);
170 if (ti != null) { // word seen before
171 int freq = ti.freq;
172 if (ti.positions.length == freq) { // positions array is full
173 int[] newPositions = new int[freq * 2]; // double size
174 int[] positions = ti.positions;
175 for (int i = 0; i < freq; i++) // copy old positions to new
176 newPositions[i] = positions[i];
177 ti.positions = newPositions;
178 }
179 ti.positions[freq] = position; // add new position
180 ti.freq = freq + 1; // update frequency
181 } else { // word not seen before
182 Term term = new Term(field, text, type/*, posIncrement*/, false);
183 postingTable.put(term, new Posting(term, position));
184 }
185 }
186
187 private final Posting[] sortPostingTable() {
188 // copy postingTable into an array
189 Posting[] array = new Posting[postingTable.size()];
190 Enumeration postings = postingTable.elements();
191 for (int i = 0; postings.hasMoreElements(); i++)
192 array[i] = (Posting) postings.nextElement();
193
194 // sort the array
195 quickSort(array, 0, array.length - 1);
196
197 return array;
198 }
199
200 private static final void quickSort(Posting[] postings, int lo, int hi) {
201 if (lo >= hi)
202 return;
203
204 int mid = (lo + hi) >>> 1;
205
206 if (postings[lo].term.compareTo(postings[mid].term) > 0) {
207 Posting tmp = postings[lo];
208 postings[lo] = postings[mid];
209 postings[mid] = tmp;
210 }
211
212 if (postings[mid].term.compareTo(postings[hi].term) > 0) {
213 Posting tmp = postings[mid];
214 postings[mid] = postings[hi];
215 postings[hi] = tmp;
216
217 if (postings[lo].term.compareTo(postings[mid].term) > 0) {
218 Posting tmp2 = postings[lo];
219 postings[lo] = postings[mid];
220 postings[mid] = tmp2;
221 }
222 }
223
224 int left = lo + 1;
225 int right = hi - 1;
226
227 if (left >= right)
228 return;
229
230 Term partition = postings[mid].term;
231
232 for (; ;) {
233 while (postings[right].term.compareTo(partition) > 0)
234 --right;
235
236 while (left < right && postings[left].term.compareTo(partition) <= 0)
237 ++left;
238
239 if (left < right) {
240 Posting tmp = postings[left];
241 postings[left] = postings[right];
242 postings[right] = tmp;
243 --right;
244 } else {
245 break;
246 }
247 }
248
249 quickSort(postings, lo, left);
250 quickSort(postings, left + 1, hi);
251 }
252
253 private final void writePostings(Posting[] postings, String segment)
254 throws IOException {
255 OutputStream freq = null, prox = null;
256 TermInfosWriter tis = null;
257 TermVectorsWriter termVectorWriter = null;
258 try {
259 //open files for inverse index storage
260 freq = directory.createFile(segment + ".frq");
261 prox = directory.createFile(segment + ".prx");
262 tis = new TermInfosWriter(directory, segment, fieldInfos);
263 TermInfo ti = new TermInfo();
264 String currentField = null;
265 for (int i = 0; i < postings.length; i++) {
266 Posting posting = postings[i];
267
268 // add an entry to the dictionary with pointers to prox and freq files
269 ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
270 //System.out.println("Adding term");
271 tis.add(posting.term, ti);
272
273 // add an entry to the freq file
274 int postingFreq = posting.freq;
275 if (postingFreq == 1) // optimize freq=1
276 freq.writeVInt(1); // set low bit of doc num.
277 else {
278 freq.writeVInt(0); // the document number
279 freq.writeVInt(postingFreq); // frequency in doc
280 }
281
282 int lastPosition = 0; // write positions
283 int[] positions = posting.positions;
284 for (int j = 0; j < postingFreq; j++) { // use delta-encoding
285 int position = positions[j];
286 prox.writeVInt(position - lastPosition);
287 lastPosition = position;
288 }
289 // check to see if we switched to a new field
290 String termField = posting.term.field();
291 if (currentField != termField) {
292 // changing field - see if there is something to save
293 currentField = termField;
294 FieldInfo fi = fieldInfos.fieldInfo(currentField);
295 if (fi.storeTermVector) {
296 if (termVectorWriter == null) {
297 termVectorWriter =
298 new TermVectorsWriter(directory, segment, fieldInfos);
299 termVectorWriter.openDocument();
300 }
301 termVectorWriter.openField(currentField);
302 } else if (termVectorWriter != null) {
303 termVectorWriter.closeField();
304 }
305 }
306
307 if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
308 /* Niraj */ termVectorWriter.addTerm(posting.term.text(), postingFreq);
309 //termVerctorWriter.addTerm(posting.term,postingFreq);
310 }
311 }
312 if (termVectorWriter != null)
313 termVectorWriter.closeDocument();
314 } catch (Exception e) {
315 e.printStackTrace();
316 }
317 finally {
318 // make an effort to close all streams we can but remember and re-throw
319 // the first exception encountered in this process
320 IOException keep = null;
321 if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
322 if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
323 if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
324 if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
325 if (keep != null) throw (IOException) keep.fillInStackTrace();
326 }
327 }
328
329 private final void writeNorms(Document doc, String segment) throws IOException {
330 for(int n = 0; n < fieldInfos.size(); n++){
331 FieldInfo fi = fieldInfos.fieldInfo(n);
332 if(fi.isIndexed){
333 float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
334 OutputStream norms = directory.createFile(segment + ".f" + n);
335 try {
336 norms.writeByte(similarity.encodeNorm(norm));
337 } finally {
338 norms.close();
339 }
340 }
341 }
342 }
343 }
344
345 final class Posting { // info about a Term in a doc
346 Term term; // the Term
347 int freq; // its frequency in doc
348 int[] positions; // positions it occurs at
349
350 Posting(Term t, int position) {
351 term = t;
352 freq = 1;
353 positions = new int[1];
354 positions[0] = position;
355 }
356 }
|