001 /*
002 * LuceneIndexer.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: LuceneIndexer.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic.lucene;
009
010 import java.io.File;
011 import java.io.IOException;
012 import java.net.URISyntaxException;
013 import java.net.URL;
014 import java.util.ArrayList;
015 import java.util.HashMap;
016 import java.util.HashSet;
017 import java.util.Iterator;
018 import java.util.List;
019 import java.util.Map;
020 import java.util.Set;
021
022 import gate.creole.annic.Constants;
023 import gate.creole.annic.IndexException;
024 import gate.creole.annic.Indexer;
025 import gate.creole.annic.apache.lucene.document.Document;
026 import gate.creole.annic.apache.lucene.index.IndexReader;
027 import gate.creole.annic.apache.lucene.index.IndexWriter;
028 import gate.creole.annic.apache.lucene.index.Term;
029 import gate.creole.annic.apache.lucene.search.Hits;
030 import gate.creole.annic.apache.lucene.search.IndexSearcher;
031 import gate.creole.annic.apache.lucene.search.TermQuery;
032 import gate.Corpus;
033 import gate.util.Files;
034
035 /**
036 * This class provides a Lucene based implementation for the Indexer
037 * interface. It asks users to provide various required parameters and
038 * creates the Lucene Index.
039 *
040 * @author niraj
041 *
042 */
043 public class LuceneIndexer implements Indexer {
044
045 protected boolean DEBUG = false;
046
047 /** An corpus for indexing */
048 protected Corpus corpus;
049
050 /**
051 * Various parameters such as location of the Index etc.
052 */
053 protected Map parameters;
054
055 /**
056 * Constructor
057 *
058 * @param indexLocationUrl
059 * @throws IOException
060 */
061 public LuceneIndexer(URL indexLocationUrl) throws IOException {
062 if(indexLocationUrl != null) {
063 readParametersFromDisk(indexLocationUrl);
064 }
065 }
066
067 /**
068 * Checks the Index Parameters to see if they are all compatible
069 */
070 protected void checkIndexParameters(Map parameters) throws IndexException {
071 this.parameters = parameters;
072
073 if(parameters == null) {
074 throw new IndexException("No parameters provided!");
075 }
076
077 URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
078 if(indexLocation == null)
079 throw new IndexException("You must provide a URL for INDEX_LOCATION");
080
081 if(!indexLocation.getProtocol().equalsIgnoreCase("file")) {
082 throw new IndexException(
083 "Index Output Directory must be set to the empty directory on the file system");
084 }
085
086 File file = null;
087 try {
088 file = new File(indexLocation.toURI());
089 } catch(URISyntaxException use) {
090 file = Files.fileFromURL(indexLocation);
091 }
092
093 if(file.exists()) {
094 if(!file.isDirectory()) {
095 throw new IndexException("Path doesn't exist");
096 }
097 }
098
099 String baseTokenAnnotationType = (String)parameters
100 .get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
101 if(baseTokenAnnotationType.indexOf(".") > -1 || baseTokenAnnotationType.indexOf("=") > -1
102 || baseTokenAnnotationType.indexOf(";") > -1 || baseTokenAnnotationType.indexOf(",") > -1) {
103 throw new IndexException(
104 "Base token annotation type cannot have '.' , '=', ',' or ';; in it");
105 }
106
107 String indexUnitAnnotationType = (String)parameters
108 .get(Constants.INDEX_UNIT_ANNOTATION_TYPE);
109
110 if(DEBUG) {
111 System.out.println("BTAT : " + baseTokenAnnotationType);
112 System.out.println("IUAT : " + indexUnitAnnotationType);
113 }
114
115 if(baseTokenAnnotationType == null
116 || baseTokenAnnotationType.trim().length() == 0) {
117 baseTokenAnnotationType = Constants.ANNIC_TOKEN;
118 parameters.put(Constants.BASE_TOKEN_ANNOTATION_TYPE,
119 Constants.ANNIC_TOKEN);
120 }
121 }
122
123 /**
124 * Returns the indexing parameters
125 *
126 * @return
127 */
128 protected Map getIndexParameters() {
129 return this.parameters;
130 }
131
132 /**
133 * Creates index directory and indexing all documents in the corpus.
134 *
135 * @param indexParameters This is a map containing various values
136 * required to create an index In case of LuceneIndexManager
137 * following are the values required
138 * <P>
139 * INDEX_LOCATION_URL - this is a URL where the Index be
140 * created
141 * <P>
142 * BASE_TOKEN_ANNOTATION_TYPE
143 * <P>
144 * INDEX_UNIT_ANNOTATION_TYPE
145 * <P>
146 * FEATURES_TO_EXCLUDE
147 * <P>
148 * FEATURES_TO_INCLUDE
149 * <P>
150 *
151 */
152 public void createIndex(Map indexParameters) throws IndexException {
153 checkIndexParameters(indexParameters);
154 URL indexLocation = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
155
156 try {
157 File file = null;
158 try {
159 file = new File(indexLocation.toURI());
160 } catch(URISyntaxException use) {
161 file = Files.fileFromURL(indexLocation);
162 }
163
164
165 // create an instance of Index Writer
166 IndexWriter writer = new IndexWriter(file.getAbsolutePath(),
167 new LuceneAnalyzer(), true);
168
169 try {
170 if(corpus != null) {
171 // load documents and add them one by one
172 for(int i = 0; i < corpus.size(); i++) {
173 gate.Document gateDoc = (gate.Document)corpus.get(i);
174 String idToUse = gateDoc.getLRPersistenceId() == null ? gateDoc
175 .getName() : gateDoc.getLRPersistenceId().toString();
176
177 System.out.print("Indexing : " + idToUse + " ...");
178 String corpusName = corpus.getLRPersistenceId() == null ? corpus
179 .getName() : corpus.getLRPersistenceId().toString();
180
181 List<gate.creole.annic.apache.lucene.document.Document> luceneDocs = getLuceneDocuments(
182 corpusName, gateDoc, indexLocation.toString());
183
184 if(luceneDocs != null) {
185 for(int j = 0; j < luceneDocs.size(); j++) {
186 if(luceneDocs.get(j) != null) {
187 writer.addDocument(luceneDocs.get(j));
188 }
189 }
190 }
191 if(gateDoc.getLRPersistenceId() != null) {
192 gate.Factory.deleteResource(gateDoc);
193 }
194 System.out.println("Done");
195 }
196 }// for (all documents)
197 }
198 finally {
199 writer.close();
200 }
201 writeParametersToDisk();
202 }
203 catch(java.io.IOException ioe) {
204 throw new IndexException(ioe);
205 }
206 }
207
208 /** Optimize existing index. */
209 public void optimizeIndex() throws IndexException {
210 try {
211 String location = ((URL)parameters.get(Constants.INDEX_LOCATION_URL))
212 .toString();
213 IndexWriter writer = new IndexWriter(location,
214 new gate.creole.annic.lucene.LuceneAnalyzer(), false);
215 try {
216 writer.optimize();
217 }
218 finally {
219 writer.close();
220 }
221 }
222 catch(java.io.IOException ioe) {
223 throw new IndexException(ioe);
224 }
225 }
226
227 /** Deletes the index. */
228 public void deleteIndex() throws IndexException {
229 boolean isDeleted = true;
230 if(parameters == null) return;
231 File dir = null;
232 try {
233 dir = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
234 .toURI());
235 } catch(URISyntaxException use) {
236 dir = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
237 .getFile());
238 }
239
240 if(dir.exists() && dir.isDirectory()) {
241 File[] files = dir.listFiles();
242 for(int i = 0; i < files.length; i++) {
243 File f = files[i];
244 if(f.isDirectory()) {
245 File[] subFiles = f.listFiles();
246 for(int j = 0; j < subFiles.length; j++) {
247 File sf = subFiles[j];
248 sf.delete();
249 }
250 }
251 f.delete();
252 }
253 }
254 isDeleted = dir.delete();
255 if(!isDeleted) {
256 throw new IndexException("Can't delete directory" + dir.getAbsolutePath());
257 }
258 }
259
260 /**
261 * Add new documents to Index
262 *
263 * @param corpusPersistenceID
264 * @param addedDocuments
265 * @throws IndexException
266 */
267 public void add(String corpusPersistenceID, List<gate.Document> added)
268 throws IndexException {
269
270 String location = null;
271 try {
272 location = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
273 .toURI()).getAbsolutePath();
274 } catch(URISyntaxException use) {
275 location = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
276 .getFile()).getAbsolutePath();
277 }
278
279 try {
280 IndexWriter writer = new IndexWriter(location, new LuceneAnalyzer(), false);
281
282 try {
283 if(added != null) {
284 for(int i = 0; i < added.size(); i++) {
285
286 gate.Document gateDoc = added.get(i);
287
288 String idToUse = gateDoc.getLRPersistenceId() == null ? gateDoc
289 .getName() : gateDoc.getLRPersistenceId().toString();
290 System.out.print("Indexing : " + idToUse + " ...");
291 List<gate.creole.annic.apache.lucene.document.Document> docs = getLuceneDocuments(
292 corpusPersistenceID, gateDoc, location);
293 if(docs == null) {
294 System.out.println("Done");
295 continue;
296 }
297 for(int j = 0; j < docs.size(); j++) {
298 writer.addDocument(docs.get(j));
299 }
300 System.out.println("Done");
301 }// for (add all added documents)
302 }
303 }
304 finally {
305 // make sure we close the writer, whatever happens
306 writer.close();
307 }
308 }
309 catch(java.io.IOException ioe) {
310 throw new IndexException(ioe);
311 }
312 }
313
314 private String getCompatibleName(String name) {
315 return name.replaceAll("[\\/:\\*\\?\"<>|]", "_");
316 }
317
318
319 /**
320 * remove documents from the Index
321 *
322 * @param removedDocumentPersistenceIds - when documents are not
323 * peristed, Persistence IDs will not be available In that
324 * case provide the document Names instead of their IDs
325 * @throws Exception
326 */
327 public void remove(List removedIDs) throws IndexException {
328
329 String location = null;
330 try {
331 location = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
332 .toURI()).getAbsolutePath();
333 } catch(URISyntaxException use) {
334 location = new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
335 .getFile()).getAbsolutePath();
336
337 }
338
339 try {
340
341 IndexReader reader = IndexReader.open(location);
342
343 try {
344 // let us first remove the documents which need to be removed
345 if(removedIDs != null) {
346 for(int i = 0; i < removedIDs.size(); i++) {
347 String id = removedIDs.get(i).toString();
348
349 Set<String> serializedFilesIDs = getNamesOfSerializedFiles(id);
350
351 if(serializedFilesIDs.size() > 0) {
352 System.out.print("Removing => " + id + "...");
353
354 id = getCompatibleName(id);
355 File file = new File(location, Constants.SERIALIZED_FOLDER_NAME);
356 file = new File(file, id);
357
358 for(String serializedFileID : serializedFilesIDs) {
359 gate.creole.annic.apache.lucene.index.Term term = new gate.creole.annic.apache.lucene.index.Term(
360 Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, serializedFileID);
361 reader.delete(term);
362 serializedFileID = getCompatibleName(serializedFileID);
363 // deleting them from the disk as well
364 // we have a subfolder for each document
365
366 File toDelete = new File(file, serializedFileID
367 + ".annic");
368 if(toDelete.exists()) toDelete.delete();
369 }
370
371 if(file.exists() && file.isDirectory()) {
372 file.delete();
373 }
374
375 System.out.println("Done ");
376 }
377 }// for (remove all removed documents)
378 }
379 }
380 finally {
381 reader.close();
382 }
383 }
384 catch(java.io.IOException ioe) {
385 throw new IndexException(ioe);
386 }
387
388 }
389
390 /**
391 * We create a separate Lucene document for each index unit available
392 * in the gate document. An array of Lucene document is returned as a
393 * call to this method. It uses various indexing parameters set
394 * earlier.
395 *
396 * @param corpusPersistenceID
397 * @param gateDoc
398 * @param location
399 * @return
400 * @throws IndexException
401 */
402 private List<gate.creole.annic.apache.lucene.document.Document> getLuceneDocuments(
403 String corpusPersistenceID, gate.Document gateDoc, String location)
404 throws IndexException {
405 ArrayList sets_to_include = new ArrayList((List)parameters
406 .get(Constants.ANNOTATION_SETS_NAMES_TO_INCLUDE));
407 ArrayList sets_to_exclude = new ArrayList((List)parameters
408 .get(Constants.ANNOTATION_SETS_NAMES_TO_EXCLUDE));
409
410 String baseTokenAnnotationType = (String)parameters
411 .get(Constants.BASE_TOKEN_ANNOTATION_TYPE);
412
413 String indexUnitAnnotationType = (String)parameters
414 .get(Constants.INDEX_UNIT_ANNOTATION_TYPE);
415
416 ArrayList featuresToExclude = new ArrayList((List)parameters
417 .get(Constants.FEATURES_TO_EXCLUDE));
418
419 ArrayList featuresToInclude = new ArrayList((List)parameters
420 .get(Constants.FEATURES_TO_INCLUDE));
421
422 ArrayList annotationSetsToExclude = new ArrayList((List)parameters
423 .get(Constants.ANNOTATION_SETS_NAMES_TO_EXCLUDE));
424
425 ArrayList annotationSetsToInclude = new ArrayList((List)parameters
426 .get(Constants.ANNOTATION_SETS_NAMES_TO_INCLUDE));
427
428 Boolean createTokensAutomatically = (Boolean) parameters.get(Constants.CREATE_TOKENS_AUTOMATICALLY);
429 if(createTokensAutomatically == null) createTokensAutomatically = new Boolean(true);
430
431 String idToUse = gateDoc.getLRPersistenceId() == null
432 ? gateDoc.getName()
433 : gateDoc.getLRPersistenceId().toString();
434
435 return new gate.creole.annic.lucene.LuceneDocument().createDocuments(
436 corpusPersistenceID, gateDoc, idToUse, annotationSetsToInclude,
437 annotationSetsToExclude, featuresToInclude, featuresToExclude,
438 location, baseTokenAnnotationType, createTokensAutomatically, indexUnitAnnotationType);
439 }
440
441 /**
442 * Returns the corpus.
443 */
444 public Corpus getCorpus() {
445 return corpus;
446 }
447
448 /**
449 * Sets the corpus.
450 */
451 public void setCorpus(Corpus corpus) throws IndexException {
452 this.corpus = corpus;
453 if(corpus == null) {
454 throw new IndexException("Corpus is not initialized");
455 }
456
457 // we would add a feature to the corpus
458 // which will tell us if this corpus was index by the ANNIC
459 corpus.getFeatures().put(Constants.CORPUS_INDEX_FEATURE,
460 Constants.CORPUS_INDEX_FEATURE_VALUE);
461 }
462
463 /**
464 * This method, searchers for the LuceneIndexDefinition.xml file at
465 * the provided location. The file is supposed to contain all the
466 * required parameters which are used to create an index.
467 *
468 * @param indexLocationUrl
469 * @throws IOException
470 */
471 private void readParametersFromDisk(URL indexLocationUrl) throws IOException {
472 // we create a hashmap to store index definition in the index
473 // directory
474
475 File file = null;
476 try {
477 file = new File(new File(indexLocationUrl.toURI()), "LuceneIndexDefinition.xml");
478 } catch(URISyntaxException use) {
479 file = new File(indexLocationUrl.getFile(), "LuceneIndexDefinition.xml");
480 }
481
482 if(!file.exists()) return;
483
484 java.io.FileReader fileReader = new java.io.FileReader(file);
485
486 try {
487 // other wise read this and
488 com.thoughtworks.xstream.XStream xstream = new com.thoughtworks.xstream.XStream(
489 new com.thoughtworks.xstream.io.xml.StaxDriver());
490
491 // Saving is accomplished just using XML serialization of the map.
492 this.parameters = (HashMap)xstream.fromXML(fileReader);
493 // setting the index location URL
494 this.parameters.put(Constants.INDEX_LOCATION_URL, indexLocationUrl);
495 }
496 finally {
497 fileReader.close();
498 }
499 }
500
501 /**
502 * All Index parameters are stored on a disc at the
503 * index_location_url/LuceneIndexDefinition.xml file.
504 *
505 * @throws IOException
506 */
507 private void writeParametersToDisk() throws IOException {
508 // we create a hashmap to store index definition in the index
509 // directory
510 URL location = (URL)parameters.get(Constants.INDEX_LOCATION_URL);
511 File file = null;
512 try {
513 file = new File(new File(location.toURI()), "LuceneIndexDefinition.xml");
514 } catch(URISyntaxException use) {
515 file = new File(location.getFile(), "LuceneIndexDefinition.xml");
516 }
517
518 java.io.FileWriter fileWriter = new java.io.FileWriter(file);
519 HashMap indexInformation = new HashMap();
520 Iterator iter = parameters.keySet().iterator();
521 while(iter.hasNext()) {
522 Object key = iter.next();
523 if(key.equals(Constants.INDEX_LOCATION_URL)) continue;
524 indexInformation.put(key, parameters.get(key));
525 }
526
527 indexInformation.put(Constants.CORPUS_INDEX_FEATURE,
528 Constants.CORPUS_INDEX_FEATURE_VALUE);
529 if(corpus != null)
530 indexInformation.put(Constants.CORPUS_SIZE, new Integer(corpus
531 .getDocumentNames().size()));
532
533 // we would use XStream library to store annic patterns
534 com.thoughtworks.xstream.XStream xstream = new com.thoughtworks.xstream.XStream();
535
536 // Saving is accomplished just using XML serialization of
537 // the map.
538 try {
539 xstream.toXML(indexInformation, fileWriter);
540 }
541 finally {
542 fileWriter.close();
543 }
544 }
545
546 /**
547 * Returns the set parameters
548 */
549 public Map getParameters() {
550 return this.parameters;
551 }
552
553 /**
554 * This method returns a set of annotation set names that are indexed.
555 *
556 * @return
557 */
558 public Set<String> getNamesOfSerializedFiles(String documentID)
559 throws IndexException {
560 String location = null;
561 try {
562 location = new File(((URL)parameters
563 .get(Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
564 } catch(URISyntaxException use) {
565 location = new File(((URL)parameters
566 .get(Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
567 }
568
569 Set<String> toReturn = new HashSet<String>();
570 try {
571 Term term = new Term(Constants.DOCUMENT_ID, documentID);
572 TermQuery tq = new TermQuery(term);
573 gate.creole.annic.apache.lucene.search.Searcher searcher = new IndexSearcher(location);
574 try {
575 // and now execute the query
576 // result of which will be stored in hits
577 Hits luceneHits = searcher.search(tq);
578 for(int i = 0; i < luceneHits.length(); i++) {
579 Document luceneDoc = luceneHits.doc(i);
580 String documentIdOfSerializedFile = luceneDoc
581 .get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE);
582 toReturn.add(documentIdOfSerializedFile);
583 }
584 return toReturn;
585 }
586 finally {
587 searcher.close();
588 }
589 }
590 catch(IOException ioe) {
591 throw new IndexException(ioe);
592 }
593 }
594 }
|