001 package gate.creole.annic.apache.lucene.index;
002
003 import gate.creole.annic.apache.lucene.store.Directory;
004 import gate.creole.annic.apache.lucene.store.InputStream;
005
006 import java.io.IOException;
007
008 /** TODO: relax synchro!
009 */
010 class TermVectorsReader {
011 private FieldInfos fieldInfos;
012
013 private InputStream tvx;
014 private InputStream tvd;
015 private InputStream tvf;
016 private int size;
017
018 TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
019 throws IOException {
020 if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
021 tvx = d.openFile(segment + TermVectorsWriter.TVX_EXTENSION);
022 checkValidFormat(tvx);
023 tvd = d.openFile(segment + TermVectorsWriter.TVD_EXTENSION);
024 checkValidFormat(tvd);
025 tvf = d.openFile(segment + TermVectorsWriter.TVF_EXTENSION);
026 checkValidFormat(tvf);
027 size = (int) tvx.length() / 8;
028 }
029
030 this.fieldInfos = fieldInfos;
031 }
032
033 private void checkValidFormat(InputStream in) throws IOException
034 {
035 int format = in.readInt();
036 if (format > TermVectorsWriter.FORMAT_VERSION)
037 {
038 throw new IOException("Incompatible format version: " + format + " expected "
039 + TermVectorsWriter.FORMAT_VERSION + " or less");
040 }
041
042 }
043
044 synchronized void close() throws IOException {
045 // why don't we trap the exception and at least make sure that
046 // all streams that we can close are closed?
047 if (tvx != null) tvx.close();
048 if (tvd != null) tvd.close();
049 if (tvf != null) tvf.close();
050 }
051
052 /**
053 *
054 * @return The number of documents in the reader
055 */
056 int size() {
057 return size;
058 }
059
060 /**
061 * Retrieve the term vector for the given document and field
062 * @param docNum The document number to retrieve the vector for
063 * @param field The field within the document to retrieve
064 * @return The TermFreqVector for the document and field or null
065 */
066 synchronized TermFreqVector get(int docNum, String field) {
067 // Check if no term vectors are available for this segment at all
068 int fieldNumber = fieldInfos.fieldNumber(field);
069 TermFreqVector result = null;
070 if (tvx != null) {
071 try {
072 //We need to account for the FORMAT_SIZE at when seeking in the tvx
073 //We don't need to do this in other seeks because we already have the file pointer
074 //that was written in another file
075 tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
076 //System.out.println("TVX Pointer: " + tvx.getFilePointer());
077 long position = tvx.readLong();
078
079 tvd.seek(position);
080 int fieldCount = tvd.readVInt();
081 //System.out.println("Num Fields: " + fieldCount);
082 // There are only a few fields per document. We opt for a full scan
083 // rather then requiring that they be ordered. We need to read through
084 // all of the fields anyway to get to the tvf pointers.
085 int number = 0;
086 int found = -1;
087 for (int i = 0; i < fieldCount; i++) {
088 number += tvd.readVInt();
089 if (number == fieldNumber) found = i;
090 }
091
092 // This field, although valid in the segment, was not found in this document
093 if (found != -1) {
094 // Compute position in the tvf file
095 position = 0;
096 for (int i = 0; i <= found; i++)
097 {
098 position += tvd.readVLong();
099 }
100 result = readTermVector(field, position);
101 }
102 else {
103 //System.out.println("Field not found");
104 }
105
106 } catch (Exception e) {
107 //e.printStackTrace();
108 }
109 }
110 else
111 {
112 System.out.println("No tvx file");
113 }
114 return result;
115 }
116
117
118 /** Return all term vectors stored for this document or null if the could not be read in. */
119 synchronized TermFreqVector[] get(int docNum) {
120 TermFreqVector[] result = null;
121 // Check if no term vectors are available for this segment at all
122 if (tvx != null) {
123 try {
124 //We need to offset by
125 tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
126 long position = tvx.readLong();
127
128 tvd.seek(position);
129 int fieldCount = tvd.readVInt();
130
131 // No fields are vectorized for this document
132 if (fieldCount != 0) {
133 int number = 0;
134 String[] fields = new String[fieldCount];
135
136 for (int i = 0; i < fieldCount; i++) {
137 number += tvd.readVInt();
138 fields[i] = fieldInfos.fieldName(number);
139 }
140
141 // Compute position in the tvf file
142 position = 0;
143 long[] tvfPointers = new long[fieldCount];
144 for (int i = 0; i < fieldCount; i++) {
145 position += tvd.readVLong();
146 tvfPointers[i] = position;
147 }
148
149 result = readTermVectors(fields, tvfPointers);
150 }
151 } catch (IOException e) {
152 e.printStackTrace();
153 }
154 }
155 else
156 {
157 System.out.println("No tvx file");
158 }
159 return result;
160 }
161
162
163 private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
164 throws IOException {
165 SegmentTermVector res[] = new SegmentTermVector[fields.length];
166 for (int i = 0; i < fields.length; i++) {
167 res[i] = readTermVector(fields[i], tvfPointers[i]);
168 }
169 return res;
170 }
171
172 /**
173 *
174 * @param fieldNum The field to read in
175 * @param tvfPointer The pointer within the tvf file where we should start reading
176 * @return The TermVector located at that position
177 * @throws IOException
178 */
179 private SegmentTermVector readTermVector(String field, long tvfPointer)
180 throws IOException {
181
182 // Now read the data from specified position
183 //We don't need to offset by the FORMAT here since the pointer already includes the offset
184 tvf.seek(tvfPointer);
185
186 int numTerms = tvf.readVInt();
187 //System.out.println("Num Terms: " + numTerms);
188 // If no terms - return a constant empty termvector
189 if (numTerms == 0) return new SegmentTermVector(field, null, null);
190
191 int length = numTerms + tvf.readVInt();
192
193 String terms[] = new String[numTerms];
194
195 int termFreqs[] = new int[numTerms];
196
197 int start = 0;
198 int deltaLength = 0;
199 int totalLength = 0;
200 char [] buffer = {};
201 String previousString = "";
202 for (int i = 0; i < numTerms; i++) {
203 start = tvf.readVInt();
204 deltaLength = tvf.readVInt();
205 totalLength = start + deltaLength;
206 if (buffer.length < totalLength)
207 {
208 buffer = new char[totalLength];
209 for (int j = 0; j < previousString.length(); j++) // copy contents
210 buffer[j] = previousString.charAt(j);
211 }
212 tvf.readChars(buffer, start, deltaLength);
213 terms[i] = new String(buffer, 0, totalLength);
214 previousString = terms[i];
215 termFreqs[i] = tvf.readVInt();
216 }
217 SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
218 return tv;
219 }
220
221 }
|