// DocumentWriter.java
package gate.creole.annic.apache.lucene.index;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;

import gate.creole.annic.apache.lucene.analysis.Analyzer;
import gate.creole.annic.apache.lucene.analysis.Token;
import gate.creole.annic.apache.lucene.analysis.TokenStream;
import gate.creole.annic.apache.lucene.document.Document;
import gate.creole.annic.apache.lucene.document.Field;
import gate.creole.annic.apache.lucene.search.Similarity;
import gate.creole.annic.apache.lucene.store.Directory;
import gate.creole.annic.apache.lucene.store.OutputStream;

035 final class DocumentWriter {
036   private Analyzer analyzer;
037   private Directory directory;
038   private Similarity similarity;
039   private FieldInfos fieldInfos;
040   private int maxFieldLength;
041 
042   /**
043    *
044    @param directory The directory to write the document information to
045    @param analyzer The analyzer to use for the document
046    @param similarity The Similarity function
047    @param maxFieldLength The maximum number of tokens a field may have
048    */
049   DocumentWriter(Directory directory, Analyzer analyzer,
050                  Similarity similarity, int maxFieldLength) {
051     this.directory = directory;
052     this.analyzer = analyzer;
053     this.similarity = similarity;
054     this.maxFieldLength = maxFieldLength;
055   }
056 
057   final void addDocument(String segment, Document doc)
058           throws IOException {
059     // write field names
060     fieldInfos = new FieldInfos();
061     fieldInfos.add(doc);
062     fieldInfos.write(directory, segment + ".fnm");
063 
064     // write field values
065     FieldsWriter fieldsWriter =
066             new FieldsWriter(directory, segment, fieldInfos);
067     try {
068       fieldsWriter.addDocument(doc);
069     finally {
070       fieldsWriter.close();
071     }
072 
073     // invert doc into postingTable
074     postingTable.clear();        // clear postingTable
075     fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
076     fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
077 
078     fieldBoosts = new float[fieldInfos.size()];    // init fieldBoosts
079     Arrays.fill(fieldBoosts, doc.getBoost());
080 
081     invertDocument(doc);
082 
083     // sort postingTable into an array
084     Posting[] postings = sortPostingTable();
085 
086     /*
087     for (int i = 0; i < postings.length; i++) {
088       Posting posting = postings[i];
089       System.out.print(posting.term);
090       System.out.print(" freq=" + posting.freq);
091       System.out.print(" pos=");
092       System.out.print(posting.positions[0]);
093       for (int j = 1; j < posting.freq; j++)
094   System.out.print("," + posting.positions[j]);
095       System.out.println("");
096     }
097     */
098 
099     // write postings
100     writePostings(postings, segment);
101 
102     // write norms of indexed fields
103     writeNorms(doc, segment);
104 
105   }
106 
107   // Keys are Terms, values are Postings.
108   // Used to buffer a document before it is written to the index.
109   private final Hashtable postingTable = new Hashtable();
110   private int[] fieldLengths;
111   private int[] fieldPositions;
112   private float[] fieldBoosts;
113 
114   // Tokenizes the fields of a document into Postings.
115   private final void invertDocument(Document doc)
116           throws IOException {
117     Enumeration fields = doc.fields();
118     while (fields.hasMoreElements()) {
119       Field field = (Fieldfields.nextElement();
120       String fieldName = field.name();
121       int fieldNumber = fieldInfos.fieldNumber(fieldName);
122 
123       int length = fieldLengths[fieldNumber];     // length of field
124       int position = fieldPositions[fieldNumber]// position in field
125 
126       if (field.isIndexed()) {
127         if (!field.isTokenized()) {      // un-tokenized field
128           addPosition(fieldName, field.stringValue()"Field"/*, 1*/, position++);
129           length++;
130         else {
131           Reader reader;        // find or make Reader
132           if (field.readerValue() != null)
133             reader = field.readerValue();
134           else if (field.stringValue() != null)
135             reader = new StringReader(field.stringValue());
136           else
137             throw new IllegalArgumentException
138                     ("field must have either String or Reader value");
139 
140           // Tokenize field and add to postingTable
141           TokenStream stream = analyzer.tokenStream(fieldName, reader);
142           try {
143             for (Token t = stream.next(); t != null; t = stream.next()) {
144               position += (t.getPositionIncrement() 1);
145               if(t.type() == null)
146                 addPosition(fieldName, t.termText()"*"/*, t.getPositionIncrement()*/, position++);
147               else
148                 addPosition(fieldName, t.termText(), t.type()/*, t.getPositionIncrement()*/, position++);
149               if (++length > maxFieldLengthbreak;
150             }
151           finally {
152             stream.close();
153           }
154         }
155 
156         fieldLengths[fieldNumber= length;    // save field length
157         fieldPositions[fieldNumber= position;    // save field position
158         fieldBoosts[fieldNumber*= field.getBoost();
159       }
160     }
161   }
162 
163   private final Term termBuffer = new Term("""""")/*, 0)*/// avoid consing
164 
165   private final void addPosition(String field, String text, String type/*, int posIncrement*/int position) {
166 
167     termBuffer.set(field, text, type/*, posIncrement*/);
168 
169     Posting ti = (PostingpostingTable.get(termBuffer);
170     if (ti != null) {          // word seen before
171       int freq = ti.freq;
172       if (ti.positions.length == freq) {    // positions array is full
173         int[] newPositions = new int[freq * 2];    // double size
174         int[] positions = ti.positions;
175         for (int i = 0; i < freq; i++)      // copy old positions to new
176           newPositions[i= positions[i];
177         ti.positions = newPositions;
178       }
179       ti.positions[freq= position;      // add new position
180       ti.freq = freq + 1;        // update frequency
181     else {            // word not seen before
182       Term term = new Term(field, text, type/*, posIncrement*/false);
183       postingTable.put(term, new Posting(term, position));
184     }
185   }
186 
187   private final Posting[] sortPostingTable() {
188     // copy postingTable into an array
189     Posting[] array = new Posting[postingTable.size()];
190     Enumeration postings = postingTable.elements();
191     for (int i = 0; postings.hasMoreElements(); i++)
192       array[i(Postingpostings.nextElement();
193 
194     // sort the array
195     quickSort(array, 0, array.length - 1);
196 
197     return array;
198   }
199 
200   private static final void quickSort(Posting[] postings, int lo, int hi) {
201     if (lo >= hi)
202       return;
203 
204     int mid = (lo + hi>>> 1;
205 
206     if (postings[lo].term.compareTo(postings[mid].term0) {
207       Posting tmp = postings[lo];
208       postings[lo= postings[mid];
209       postings[mid= tmp;
210     }
211 
212     if (postings[mid].term.compareTo(postings[hi].term0) {
213       Posting tmp = postings[mid];
214       postings[mid= postings[hi];
215       postings[hi= tmp;
216 
217       if (postings[lo].term.compareTo(postings[mid].term0) {
218         Posting tmp2 = postings[lo];
219         postings[lo= postings[mid];
220         postings[mid= tmp2;
221       }
222     }
223 
224     int left = lo + 1;
225     int right = hi - 1;
226 
227     if (left >= right)
228       return;
229 
230     Term partition = postings[mid].term;
231 
232     for (; ;) {
233       while (postings[right].term.compareTo(partition0)
234         --right;
235 
236       while (left < right && postings[left].term.compareTo(partition<= 0)
237         ++left;
238 
239       if (left < right) {
240         Posting tmp = postings[left];
241         postings[left= postings[right];
242         postings[right= tmp;
243         --right;
244       else {
245         break;
246       }
247     }
248 
249     quickSort(postings, lo, left);
250     quickSort(postings, left + 1, hi);
251   }
252 
253   private final void writePostings(Posting[] postings, String segment)
254           throws IOException {
255     OutputStream freq = null, prox = null;
256     TermInfosWriter tis = null;
257     TermVectorsWriter termVectorWriter = null;
258     try {
259       //open files for inverse index storage
260       freq = directory.createFile(segment + ".frq");
261       prox = directory.createFile(segment + ".prx");
262       tis = new TermInfosWriter(directory, segment, fieldInfos);
263       TermInfo ti = new TermInfo();
264       String currentField = null;
265       for (int i = 0; i < postings.length; i++) {
266         Posting posting = postings[i];
267 
268         // add an entry to the dictionary with pointers to prox and freq files
269         ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
270         //System.out.println("Adding term");
271         tis.add(posting.term, ti);
272 
273         // add an entry to the freq file
274         int postingFreq = posting.freq;
275         if (postingFreq == 1)          // optimize freq=1
276           freq.writeVInt(1);        // set low bit of doc num.
277         else {
278           freq.writeVInt(0);        // the document number
279           freq.writeVInt(postingFreq);        // frequency in doc
280         }
281 
282         int lastPosition = 0;        // write positions
283         int[] positions = posting.positions;
284         for (int j = 0; j < postingFreq; j++) {      // use delta-encoding
285           int position = positions[j];
286           prox.writeVInt(position - lastPosition);
287           lastPosition = position;
288         }
289         // check to see if we switched to a new field
290         String termField = posting.term.field();
291         if (currentField != termField) {
292           // changing field - see if there is something to save
293           currentField = termField;
294           FieldInfo fi = fieldInfos.fieldInfo(currentField);
295           if (fi.storeTermVector) {
296             if (termVectorWriter == null) {
297               termVectorWriter =
298                 new TermVectorsWriter(directory, segment, fieldInfos);
299               termVectorWriter.openDocument();
300             }
301             termVectorWriter.openField(currentField);
302           else if (termVectorWriter != null) {
303             termVectorWriter.closeField();
304           }
305         }
306 
307         if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
308           /* Niraj */ termVectorWriter.addTerm(posting.term.text(), postingFreq);
309           //termVerctorWriter.addTerm(posting.term,postingFreq);
310         }
311       }
312       if (termVectorWriter != null)
313         termVectorWriter.closeDocument();
314     catch (Exception e) {
315       e.printStackTrace();
316     }
317     finally {
318       // make an effort to close all streams we can but remember and re-throw
319       // the first exception encountered in this process
320       IOException keep = null;
321       if (freq != nulltry freq.close()catch (IOException e) { if (keep == nullkeep = e; }
322       if (prox != nulltry prox.close()catch (IOException e) { if (keep == nullkeep = e; }
323       if (tis  != nulltry {  tis.close()catch (IOException e) { if (keep == nullkeep = e; }
324       if (termVectorWriter  != nulltry {  termVectorWriter.close()catch (IOException e) { if (keep == nullkeep = e; }
325       if (keep != nullthrow (IOExceptionkeep.fillInStackTrace();
326     }
327   }
328 
329   private final void writeNorms(Document doc, String segmentthrows IOException {
330     for(int n = 0; n < fieldInfos.size(); n++){
331       FieldInfo fi = fieldInfos.fieldInfo(n);
332       if(fi.isIndexed){
333         float norm = fieldBoosts[n* similarity.lengthNorm(fi.name, fieldLengths[n]);
334         OutputStream norms = directory.createFile(segment + ".f" + n);
335         try {
336           norms.writeByte(similarity.encodeNorm(norm));
337         finally {
338           norms.close();
339         }
340       }
341     }
342   }
343 }
344 
345 final class Posting {          // info about a Term in a doc
346   Term term;            // the Term
347   int freq;            // its frequency in doc
348   int[] positions;          // positions it occurs at
349 
350   Posting(Term t, int position) {
351     term = t;
352     freq = 1;
353     positions = new int[1];
354     positions[0= position;
355   }
356 }