001 package gate.creole.annic.apache.lucene.index;
002
003 /**
004 * Copyright 2004 The Apache Software Foundation
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 import gate.creole.annic.apache.lucene.store.Directory;
020 import gate.creole.annic.apache.lucene.store.OutputStream;
021 import gate.creole.annic.apache.lucene.store.InputStream;
022 import java.util.LinkedList;
023 import java.util.HashSet;
024 import java.util.Iterator;
025 import java.io.IOException;
026
027
028 /**
029 * Combines multiple files into a single compound file.
030 * The file format:<br>
031 * <ul>
032 * <li>VInt fileCount</li>
033 * <li>{Directory}
034 * fileCount entries with the following structure:</li>
035 * <ul>
036 * <li>long dataOffset</li>
037 * <li>UTFString extension</li>
038 * </ul>
039 * <li>{File Data}
040 * fileCount entries with the raw data of the corresponding file</li>
041 * </ul>
042 *
 * The fileCount integer indicates how many files are contained in this compound
 * file. The {directory} that follows has that many entries. Each directory entry
 * contains a long pointer to the start of that file's data section, followed by
 * a UTF String identifying that file.
047 *
048 * @author Dmitry Serebrennikov
049 * @version $Id: CompoundFileWriter.java 529 2004-10-05 11:55:26Z niraj $
050 */
051 final class CompoundFileWriter {
052
053 private static final class FileEntry {
054 /** source file */
055 String file;
056
057 /** temporary holder for the start of directory entry for this file */
058 long directoryOffset;
059
060 /** temporary holder for the start of this file's data section */
061 long dataOffset;
062 }
063
064
065 private Directory directory;
066 private String fileName;
067 private HashSet ids;
068 private LinkedList entries;
069 private boolean merged = false;
070
071
072 /** Create the compound stream in the specified file. The file name is the
073 * entire name (no extensions are added).
074 */
075 public CompoundFileWriter(Directory dir, String name) {
076 if (dir == null)
077 throw new IllegalArgumentException("Missing directory");
078 if (name == null)
079 throw new IllegalArgumentException("Missing name");
080
081 directory = dir;
082 fileName = name;
083 ids = new HashSet();
084 entries = new LinkedList();
085 }
086
087 /** Returns the directory of the compound file. */
088 public Directory getDirectory() {
089 return directory;
090 }
091
092 /** Returns the name of the compound file. */
093 public String getName() {
094 return fileName;
095 }
096
097 /** Add a source stream. If sourceDir is null, it is set to the
098 * same value as the directory where this compound stream exists.
099 * The id is the string by which the sub-stream will be know in the
100 * compound stream. The caller must ensure that the ID is unique. If the
101 * id is null, it is set to the name of the source file.
102 */
103 public void addFile(String file) {
104 if (merged)
105 throw new IllegalStateException(
106 "Can't add extensions after merge has been called");
107
108 if (file == null)
109 throw new IllegalArgumentException(
110 "Missing source file");
111
112 if (! ids.add(file))
113 throw new IllegalArgumentException(
114 "File " + file + " already added");
115
116 FileEntry entry = new FileEntry();
117 entry.file = file;
118 entries.add(entry);
119 }
120
121 /** Merge files with the extensions added up to now.
122 * All files with these extensions are combined sequentially into the
123 * compound stream. After successful merge, the source files
124 * are deleted.
125 */
126 public void close() throws IOException {
127 if (merged)
128 throw new IllegalStateException(
129 "Merge already performed");
130
131 if (entries.isEmpty())
132 throw new IllegalStateException(
133 "No entries to merge have been defined");
134
135 merged = true;
136
137 // open the compound stream
138 OutputStream os = null;
139 try {
140 os = directory.createFile(fileName);
141
142 // Write the number of entries
143 os.writeVInt(entries.size());
144
145 // Write the directory with all offsets at 0.
146 // Remember the positions of directory entries so that we can
147 // adjust the offsets later
148 Iterator it = entries.iterator();
149 while(it.hasNext()) {
150 FileEntry fe = (FileEntry) it.next();
151 fe.directoryOffset = os.getFilePointer();
152 os.writeLong(0); // for now
153 os.writeString(fe.file);
154 }
155
156 // Open the files and copy their data into the stream.
157 // Remeber the locations of each file's data section.
158 byte buffer[] = new byte[1024];
159 it = entries.iterator();
160 while(it.hasNext()) {
161 FileEntry fe = (FileEntry) it.next();
162 fe.dataOffset = os.getFilePointer();
163 copyFile(fe, os, buffer);
164 }
165
166 // Write the data offsets into the directory of the compound stream
167 it = entries.iterator();
168 while(it.hasNext()) {
169 FileEntry fe = (FileEntry) it.next();
170 os.seek(fe.directoryOffset);
171 os.writeLong(fe.dataOffset);
172 }
173
174 // Close the output stream. Set the os to null before trying to
175 // close so that if an exception occurs during the close, the
176 // finally clause below will not attempt to close the stream
177 // the second time.
178 OutputStream tmp = os;
179 os = null;
180 tmp.close();
181
182 } finally {
183 if (os != null) try { os.close(); } catch (IOException e) { }
184 }
185 }
186
187 /** Copy the contents of the file with specified extension into the
188 * provided output stream. Use the provided buffer for moving data
189 * to reduce memory allocation.
190 */
191 private void copyFile(FileEntry source, OutputStream os, byte buffer[])
192 throws IOException
193 {
194 InputStream is = null;
195 try {
196 long startPtr = os.getFilePointer();
197
198 is = directory.openFile(source.file);
199 long length = is.length();
200 long remainder = length;
201 int chunk = buffer.length;
202
203 while(remainder > 0) {
204 int len = (int) Math.min(chunk, remainder);
205 is.readBytes(buffer, 0, len);
206 os.writeBytes(buffer, len);
207 remainder -= len;
208 }
209
210 // Verify that remainder is 0
211 if (remainder != 0)
212 throw new IOException(
213 "Non-zero remainder length after copying: " + remainder
214 + " (id: " + source.file + ", length: " + length
215 + ", buffer size: " + chunk + ")");
216
217 // Verify that the output length diff is equal to original file
218 long endPtr = os.getFilePointer();
219 long diff = endPtr - startPtr;
220 if (diff != length)
221 throw new IOException(
222 "Difference in the output file offsets " + diff
223 + " does not match the original file length " + length);
224
225 } finally {
226 if (is != null) is.close();
227 }
228 }
229 }
|