001 /*
002 * LuceneIndexManager.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at
011 * http://gate.ac.uk/gate/licence.html).
012 *
013 * Rosen Marinov, 19/Apr/2002
014 *
015 */
016
017 package gate.creole.ir.lucene;
018
019 import gate.Corpus;
020 import gate.creole.ir.IndexDefinition;
021 import gate.creole.ir.IndexException;
022 import gate.creole.ir.IndexField;
023 import gate.creole.ir.IndexManager;
024 import gate.util.GateRuntimeException;
025
026 import java.io.File;
027 import java.util.Iterator;
028 import java.util.List;
029
030 import org.apache.lucene.analysis.SimpleAnalyzer;
031 import org.apache.lucene.document.Field;
032 import org.apache.lucene.index.IndexReader;
033 import org.apache.lucene.index.IndexWriter;
034 import org.apache.lucene.store.FSDirectory;
035
036 /** This class represents Lucene implementation of IndexManeager interface.*/
037 public class LuceneIndexManager implements IndexManager{
038
039 /** used in Lucene Documents as a key for gate document ID value. */
040 public final static String DOCUMENT_ID = "DOCUMENT_ID";
041
042 /** IndexDefinition - location, type, fields, etc.*/
043 private IndexDefinition indexDefinition;
044
045 /** An corpus for indexing*/
046 private Corpus corpus;
047
048 /* Niraj */
049 /** constant that ensures that corpus is indexed with IR plugin */
050 public final static String CORPUS_INDEX_FEATURE = "CorpusIndexFeature";
051 public final static String CORPUS_INDEX_FEATURE_VALUE = "IR";
052 /* End */
053
054 /** Constructor of the class. */
055 public LuceneIndexManager(){
056 }
057
058 /** Creates index directory and indexing all
059 * documents in the corpus. */
060 public void createIndex() throws IndexException{
061 if(indexDefinition == null)
062 throw new GateRuntimeException("Index definition is null!");
063 if(corpus == null)
064 throw new GateRuntimeException("Corpus is null!");
065
066 String location = indexDefinition.getIndexLocation();
067 try {
068 File file = new File(location);
069 if (file.exists()){
070 if (file.isDirectory() && file.listFiles().length>0) {
071 throw new IndexException(location+ " is not empty directory");
072 }
073 if (!file.isDirectory()){
074 throw new IndexException("Only empty directory can be index path");
075 }
076 }
077
078 /* Niraj */
079 // ok so lets put the corpus index feature
080 corpus.getFeatures().put(CORPUS_INDEX_FEATURE, CORPUS_INDEX_FEATURE_VALUE);
081 /* End */
082
083 IndexWriter writer = new IndexWriter(
084 FSDirectory.open(new File(location)),
085 new SimpleAnalyzer(),
086 true,
087 new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)
088 );
089
090 for(int i = 0; i<corpus.size(); i++) {
091 boolean isLoaded = corpus.isDocumentLoaded(i);
092 gate.Document gateDoc = (gate.Document) corpus.get(i);
093 writer.addDocument(getLuceneDoc(gateDoc));
094 if (!isLoaded) {
095 corpus.unloadDocument(gateDoc);
096 gate.Factory.deleteResource(gateDoc);
097 }
098 }//for (all documents)
099
100 writer.commit();
101 writer.close();
102 corpus.sync();
103 } catch (java.io.IOException ioe){
104 throw new IndexException(ioe.getMessage());
105 } catch (gate.persist.PersistenceException pe){
106 pe.printStackTrace();
107 } catch (gate.security.SecurityException se){
108 se.printStackTrace();
109 }
110 }
111
112 /** Optimize existing index. */
113 public void optimizeIndex() throws IndexException{
114 if(indexDefinition == null)
115 throw new GateRuntimeException("Index definition is null!");
116 try {
117 IndexWriter writer = new IndexWriter(
118 FSDirectory.open(new File(indexDefinition.getIndexLocation())),
119 new SimpleAnalyzer(),
120 false,
121 new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
122 writer.optimize();
123 writer.commit();
124 writer.close();
125 } catch (java.io.IOException ioe){
126 throw new IndexException(ioe.getMessage());
127 }
128 }
129
130 /** Delete index. */
131 public void deleteIndex() throws IndexException{
132 if(indexDefinition == null)
133 throw new GateRuntimeException("Index definition is null!");
134 boolean isDeleted = true;
135 File dir = new File(indexDefinition.getIndexLocation());
136 if (dir.exists() && dir.isDirectory()) {
137 File[] files = dir.listFiles();
138 for (int i =0; i<files.length; i++){
139 File f = files[i];
140 isDeleted = f.delete();
141 }
142 }
143 dir.delete();
144 if (!isDeleted) {
145 throw new IndexException("Can't delete directory"
146 + indexDefinition.getIndexLocation());
147 }
148 }
149
150 /** Reindexing changed documents, removing removed documents and
151 * add to the index new corpus documents. */
152 public void sync(List added, List removedIDs, List changed) throws IndexException{
153 String location = indexDefinition.getIndexLocation();
154 try {
155
156 IndexReader reader = IndexReader.open(FSDirectory.open(new File(location)),false);
157
158 for (int i = 0; i<removedIDs.size(); i++) {
159 String id = removedIDs.get(i).toString();
160 org.apache.lucene.index.Term term =
161 new org.apache.lucene.index.Term(DOCUMENT_ID,id);
162 reader.deleteDocuments(term);
163 }//for (remove all removed documents)
164
165 for (int i = 0; i<changed.size(); i++) {
166 gate.Document gateDoc = (gate.Document) changed.get(i);
167 String id = gateDoc.getLRPersistenceId().toString();
168 org.apache.lucene.index.Term term =
169 new org.apache.lucene.index.Term(DOCUMENT_ID,id);
170 reader.deleteDocuments(term);
171 }//for (remove all changed documents)
172
173 reader.close();
174
175 IndexWriter writer = new IndexWriter(
176 FSDirectory.open(new File(location)),
177 new SimpleAnalyzer(),
178 false,
179 new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)
180 );
181
182 for(int i = 0; i<added.size(); i++) {
183 gate.Document gateDoc = (gate.Document) added.get(i);
184 writer.addDocument(getLuceneDoc(gateDoc));
185 }//for (add all added documents)
186
187 for(int i = 0; i<changed.size(); i++) {
188 gate.Document gateDoc = (gate.Document) changed.get(i);
189 writer.addDocument(getLuceneDoc(gateDoc));
190 }//for (add all changed documents)
191
192 writer.close();
193 } catch (java.io.IOException ioe) {
194 throw new IndexException(ioe.getMessage());
195 }
196 }
197
198 private org.apache.lucene.document.Document getLuceneDoc(gate.Document gateDoc){
199 org.apache.lucene.document.Document luceneDoc =
200 new org.apache.lucene.document.Document();
201 Iterator fields = indexDefinition.getIndexFields();
202
203 // luceneDoc.add(Field.Keyword(DOCUMENT_ID,
204 // gateDoc.getLRPersistenceId().toString()));
205
206 // update version of Lucene
207 luceneDoc.add(new Field(DOCUMENT_ID,gateDoc.getLRPersistenceId().toString(),Field.Store.YES,Field.Index.NOT_ANALYZED));
208
209 while (fields.hasNext()) {
210 IndexField field = (IndexField) fields.next();
211 String valueForIndexing;
212
213 if (field.getReader() == null){
214 valueForIndexing = gateDoc.getFeatures().get(field.getName()).toString();
215 } else {
216 valueForIndexing = field.getReader().getPropertyValue(gateDoc);
217 } //if-else reader or feature
218
219 if (field.isPreseved()) {
220 luceneDoc.add(new Field(field.getName(),valueForIndexing,Field.Store.YES,Field.Index.NOT_ANALYZED));
221 // luceneDoc.add(Field.Keyword(field.getName(),valueForIndexing));
222 } else {
223 luceneDoc.add(new Field(field.getName(),valueForIndexing,Field.Store.NO,Field.Index.ANALYZED));
224 // luceneDoc.add(Field.UnStored(field.getName(),valueForIndexing));
225 } // if-else keyword or text
226
227 }// while (add all fields)
228
229 return luceneDoc;
230 }
231
232 public Corpus getCorpus() {
233 return corpus;
234 }
235 public void setCorpus(Corpus corpus) {
236 this.corpus = corpus;
237 }
238 public IndexDefinition getIndexDefinition() {
239 return indexDefinition;
240 }
241 public void setIndexDefinition(IndexDefinition indexDefinition) {
242 this.indexDefinition = indexDefinition;
243 }
244
245 }
|