001 /*
002 * LuceneSearcher.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: LuceneSearcher.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic.lucene;
009
010 import java.io.File;
011 import java.io.IOException;
012 import java.net.URISyntaxException;
013 import java.net.URL;
014 import java.util.ArrayList;
015 import java.util.HashMap;
016 import java.util.HashSet;
017 import java.util.List;
018 import java.util.Map;
019 import java.util.Set;
020
021 import gate.creole.annic.Hit;
022 import gate.creole.annic.Pattern;
023 import gate.creole.annic.Constants;
024 import gate.creole.annic.SearchException;
025 import gate.creole.annic.Searcher;
026 import gate.creole.annic.apache.lucene.document.Document;
027 import gate.creole.annic.apache.lucene.index.IndexReader;
028 import gate.creole.annic.apache.lucene.index.Term;
029 import gate.creole.annic.apache.lucene.index.TermEnum;
030 import gate.creole.annic.apache.lucene.search.BooleanQuery;
031 import gate.creole.annic.apache.lucene.search.Hits;
032 import gate.creole.annic.apache.lucene.search.IndexSearcher;
033 import gate.creole.annic.apache.lucene.search.TermQuery;
034 import gate.persist.LuceneDataStoreImpl;
035
036 /**
037 * This class provides the Searching functionality for annic.
038 *
039 * @author niraj
040 *
041 */
042 public class LuceneSearcher implements Searcher {
043
044 /**
045 * A List of index locations. It allows searching at multiple locations.
046 */
047 private List<String> indexLocations = null;
048
049 /**
050 * The submitted query.
051 */
052 private String query = null;
053
054 /**
055 * The number of base token annotations to show in left and right context of
056 * the pattern. By default 5.
057 */
058 private int contextWindow = 5;
059
060 /**
061 * Found patterns.
062 */
063 private List<Pattern> annicPatterns = new ArrayList<Pattern>();
064
065 /**
066 * Found annotation types in the annic patterns. The maps keeps record of
067 * found annotation types and features for each of them.
068 */
069 public Map<String, List<String>> annotationTypesMap =
070 new HashMap<String, List<String>>();
071
072 /**
073 * Search parameters.
074 */
075 private Map<String, Object> parameters = null;
076
077 /**
078 * Corpus to search in.
079 */
080 private String corpusToSearchIn = null;
081
082 /**
083 * Annotation set to search in.
084 */
085 private String annotationSetToSearchIn = null;
086
087 /**
088 * Hits returned by the lucene.
089 */
090 private Hits luceneHits = null;
091
092 /**
093 * Indicates if the query was to delete certain documents.
094 */
095 private boolean wasDeleteQuery = false;
096
097 /**
098 * A query can result into multiple queries. For example: (A|B)C is converted
099 * into two queries: AC and AD. For each query a separate thread is started.
100 */
101 private List<LuceneSearchThread> luceneSearchThreads = null;
102
103 /**
104 * Indicates if the search was successful.
105 */
106 private boolean success = false;
107
108 /**
109 * Tells which thread to use to retrieve results from.
110 */
111 private int luceneSearchThreadIndex = 0;
112
113 /**
114 * Tells if we have reached at the end of of found results.
115 */
116 private boolean fwdIterationEnded = false;
117
118 /**
119 * Used with freq method for statistics.
120 */
121 private LuceneDataStoreImpl datastore;
122
123 /**
124 * Return the next numberOfHits -1 indicates all
125 *
126 * @return
127 */
128 public Hit[] next(int numberOfHits) throws SearchException {
129
130 annicPatterns = new ArrayList<Pattern>();
131
132 if(!success) {
133 this.annicPatterns = new ArrayList<Pattern>();
134 return getHits();
135 }
136
137 if(fwdIterationEnded) {
138 this.annicPatterns = new ArrayList<Pattern>();
139 return getHits();
140 }
141
142 try {
143 if(wasDeleteQuery) {
144 List<String> docIDs = new ArrayList<String>();
145 List<String> setNames = new ArrayList<String>();
146 for(int i = 0; i < luceneHits.length(); i++) {
147 Document luceneDoc = luceneHits.doc(i);
148 String documentID = luceneDoc.get(Constants.DOCUMENT_ID);
149 String annotationSetID = luceneDoc.get(Constants.ANNOTATION_SET_ID);
150 int index = docIDs.indexOf(documentID);
151 if(index == -1) {
152 docIDs.add(documentID);
153 setNames.add(annotationSetID);
154 }
155 else {
156 if(!setNames.get(index).equals(annotationSetID)) {
157 docIDs.add(documentID);
158 setNames.add(annotationSetID);
159 }
160 }
161 }
162
163 Hit[] toReturn = new Hit[docIDs.size()];
164 for(int i = 0; i < toReturn.length; i++) {
165 toReturn[i] = new Hit(docIDs.get(i), setNames.get(i), 0, 0, "");
166 }
167 return toReturn;
168 }
169
170 for(; luceneSearchThreadIndex < luceneSearchThreads.size(); luceneSearchThreadIndex++) {
171 LuceneSearchThread lst =
172 luceneSearchThreads.get(luceneSearchThreadIndex);
173 List<Pattern> results = lst.next(numberOfHits);
174 if(results != null) {
175 if(numberOfHits != -1) {
176 numberOfHits -= results.size();
177 }
178
179 this.annicPatterns.addAll(results);
180 if(numberOfHits == 0) { return getHits(); }
181 }
182 }
183
184 // if we are here, there wer no sufficient patterns available
185 // so what we do is make success to false so that this method
186 // return null on next call
187 fwdIterationEnded = true;
188 return getHits();
189 }
190 catch(Exception e) {
191 throw new SearchException(e);
192 }
193 }
194
195 /**
196 * Method retunrs true/false indicating whether results were found or not.
197 */
198 public boolean search(String query, Map<String, Object> parameters)
199 throws SearchException {
200 luceneHits = null;
201 annicPatterns = new ArrayList<Pattern>();
202 annotationTypesMap = new HashMap<String, List<String>>();
203 luceneSearchThreads = new ArrayList<LuceneSearchThread>();
204 luceneSearchThreadIndex = 0;
205 success = false;
206 fwdIterationEnded = false;
207 wasDeleteQuery = false;
208
209 if(parameters == null)
210 throw new SearchException("Parameters cannot be null");
211
212 this.parameters = parameters;
213
214 /*
215 * lets first check if the query is to search the document names This is
216 * used when we only wants to search for documents stored under the specific
217 * corpus
218 */
219 if(parameters.size() == 2
220 && parameters.get(Constants.INDEX_LOCATION_URL) != null) {
221 String corpusID = (String)parameters.get(Constants.CORPUS_ID);
222 String indexLocation = null;
223 try {
224 indexLocation =
225 new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL)).toURI())
226 .getAbsolutePath();
227 }
228 catch(URISyntaxException use) {
229 indexLocation =
230 new File(((URL)parameters.get(Constants.INDEX_LOCATION_URL))
231 .getFile()).getAbsolutePath();
232 }
233
234 if(corpusID != null && indexLocation != null) {
235 wasDeleteQuery = true;
236 Term term = new Term(Constants.CORPUS_ID, corpusID);
237 TermQuery tq = new TermQuery(term);
238 try {
239 gate.creole.annic.apache.lucene.search.Searcher searcher =
240 new IndexSearcher(indexLocation);
241 // and now execute the query
242 // result of which will be stored in hits
243 luceneHits = searcher.search(tq);
244 success = luceneHits.length() > 0 ? true : false;
245 return success;
246 }
247 catch(IOException ioe) {
248 ioe.printStackTrace();
249 throw new SearchException(ioe);
250 }
251 }
252 }
253
254 // check for index locations
255 if(parameters.get(Constants.INDEX_LOCATIONS) == null) {
256 String indexLocation;
257 try {
258 indexLocation =
259 new File(((URL)datastore.getIndexer().getParameters().get(
260 Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
261
262 }
263 catch(URISyntaxException use) {
264 indexLocation =
265 new File(((URL)datastore.getIndexer().getParameters().get(
266 Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
267 }
268 ArrayList<String> indexLocations = new ArrayList<String>();
269 indexLocations.add(indexLocation);
270 parameters.put(Constants.INDEX_LOCATIONS, indexLocations);
271 }
272
273 indexLocations =
274 new ArrayList<String>((List<? extends String>)parameters
275 .get(Constants.INDEX_LOCATIONS));
276
277 if(indexLocations.size() == 0)
278 throw new SearchException("Corpus is not initialized");
279
280 // check for valid context window
281 if(parameters.get(Constants.CONTEXT_WINDOW) == null)
282 throw new SearchException("Parameter " + Constants.CONTEXT_WINDOW
283 + " is not provided!");
284
285 contextWindow =
286 ((Integer)parameters.get(Constants.CONTEXT_WINDOW)).intValue();
287
288 if(getContextWindow().intValue() <= 0)
289 throw new SearchException("Context Window must be atleast 1 or > 1");
290
291 if(query == null) throw new SearchException("Query is not initialized");
292
293 this.query = query;
294 this.corpusToSearchIn = (String)parameters.get(Constants.CORPUS_ID);
295 this.annotationSetToSearchIn =
296 (String)parameters.get(Constants.ANNOTATION_SET_ID);
297
298 annicPatterns = new ArrayList<Pattern>();
299 annotationTypesMap = new HashMap<String, List<String>>();
300
301 luceneSearchThreads = new ArrayList<LuceneSearchThread>();
302
303 // for different indexes, we create a different instance of
304 // indexSearcher
305 // TODO: is this really useful or used to have several indexLocations ?
306 for(int indexCounter = 0; indexCounter < indexLocations.size(); indexCounter++) {
307 String location = indexLocations.get(indexCounter);
308 // we create a separate Thread for each index
309 LuceneSearchThread lst = new LuceneSearchThread();
310 if(lst.search(query, contextWindow, location, corpusToSearchIn,
311 annotationSetToSearchIn, this)) {
312 luceneSearchThreads.add(lst);
313 }
314 }
315
316 success = luceneSearchThreads.size() > 0 ? true : false;
317 return success;
318 }
319
320
321 /**
322 * Gets the submitted query.
323 */
324 public String getQuery() {
325 return this.query;
326 }
327
328 /**
329 * Gets the number of base token annotations to show in the context.
330 *
331 * @return
332 */
333 public Integer getContextWindow() {
334 return new Integer(this.contextWindow);
335 }
336
337 /**
338 * Gets the found hits (annic patterns).
339 */
340 public Hit[] getHits() {
341 if(annicPatterns == null) annicPatterns = new ArrayList<Pattern>();
342 Hit[] hits = new Hit[annicPatterns.size()];
343 for(int i = 0; i < annicPatterns.size(); i++) {
344 hits[i] = (Pattern)annicPatterns.get(i);
345 }
346 return hits;
347 }
348
349 /**
350 * Gets the map of found annotation types and annotation features. This call
351 * must be invoked only after a call to the
352 * getIndexedAnnotationSetNames(String indexLocation) method. Otherwise this
353 * method doesn't guranttee the correct results. The results obtained has the
354 * following format. Key: CorpusName;AnnotationSetName;AnnotationType Value:
355 * respective features
356 */
357 public Map<String, List<String>> getAnnotationTypesMap() {
358 return annotationTypesMap;
359 }
360
361 /**
362 * This method returns a set of annotation set names that are indexed. Each
363 * entry has the following format:
364 * <p>
365 * corpusName;annotationSetName
366 * </p>
367 * where, the corpusName is the name of the corpus the annotationSetName
368 * belongs to.
369 *
370 * @return
371 */
372 public String[] getIndexedAnnotationSetNames() throws SearchException {
373 String indexLocation;
374 try {
375 indexLocation =
376 new File(((URL)datastore.getIndexer().getParameters().get(
377 Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
378
379 }
380 catch(URISyntaxException use) {
381 indexLocation =
382 new File(((URL)datastore.getIndexer().getParameters().get(
383 Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
384 }
385 annotationTypesMap = new HashMap<String, List<String>>();
386 Set<String> toReturn = new HashSet<String>();
387 try {
388 IndexReader reader = IndexReader.open(indexLocation);
389
390 try {
391 // lets first obtain stored corpora
392 TermEnum corpusTerms = reader.terms(new Term(Constants.CORPUS_ID, ""));
393 if(corpusTerms == null || corpusTerms.term() == null)
394 return new String[0];
395
396 Set<String> corpora = new HashSet<String>();
397 while(Constants.CORPUS_ID.equals(corpusTerms.term().field())) {
398 corpora.add(corpusTerms.term().text());
399 if(!corpusTerms.next()) break;
400 }
401
402 // for each corpus we obtain its annotation set ids
403 for(String corpus : corpora) {
404 Term term = new Term(Constants.CORPUS_ID, corpus);
405 TermQuery tq = new TermQuery(term);
406 try {
407 gate.creole.annic.apache.lucene.search.Searcher searcher =
408 new IndexSearcher(indexLocation);
409 try {
410 Hits corpusHits = searcher.search(tq);
411 for(int i = 0; i < corpusHits.length(); i++) {
412 Document luceneDoc = corpusHits.doc(i);
413 String annotationSetID =
414 luceneDoc.get(Constants.ANNOTATION_SET_ID);
415 if(toReturn.contains(corpus + ";" + annotationSetID)) continue;
416 toReturn.add(corpus + ";" + annotationSetID);
417
418 // lets create a boolean query
419 Term annotSetTerm =
420 new Term(Constants.ANNOTATION_SET_ID, annotationSetID);
421 TermQuery atq = new TermQuery(annotSetTerm);
422
423 BooleanQuery bq = new BooleanQuery();
424 bq.add(tq, true, false);
425 bq.add(atq, true, false);
426 gate.creole.annic.apache.lucene.search.Searcher indexFeatureSearcher =
427 new IndexSearcher(indexLocation);
428 try {
429 Hits indexFeaturesHits = searcher.search(bq);
430 for(int j = 0; j < indexFeaturesHits.length(); j++) {
431 Document aDoc = indexFeaturesHits.doc(j);
432 String indexedFeatures =
433 aDoc.get(Constants.INDEXED_FEATURES);
434 if(indexedFeatures != null) {
435 String[] features = indexedFeatures.split(";");
436 for(String aFeature : features) {
437 // AnnotationType.FeatureName
438 int index = aFeature.indexOf(".");
439 if(index == -1) {
440 continue;
441 }
442 String type = aFeature.substring(0, index);
443 String featureName = aFeature.substring(index + 1);
444 String key =
445 corpus + ";" + annotationSetID + ";" + type;
446 List<String> listOfFeatures =
447 annotationTypesMap.get(key);
448 if(listOfFeatures == null) {
449 listOfFeatures = new ArrayList<String>();
450 annotationTypesMap.put(key, listOfFeatures);
451 }
452 if(!listOfFeatures.contains(featureName)) {
453 listOfFeatures.add(featureName);
454 }
455 }
456 }
457 }
458 }
459 finally {
460 indexFeatureSearcher.close();
461 }
462 }
463 }
464 finally {
465 searcher.close();
466 }
467 }
468 catch(IOException ioe) {
469 ioe.printStackTrace();
470 throw new SearchException(ioe);
471 }
472 }
473 }
474 finally {
475 reader.close();
476 }
477 }
478 catch(IOException ioe) {
479 throw new SearchException(ioe);
480 }
481 return toReturn.toArray(new String[0]);
482 }
483
484 /**
485 * Gets the search parameters set by user.
486 */
487 public Map<String, Object> getParameters() {
488 return parameters;
489 }
490
491 /**
492 * A Map used for caching query tokens created for a query.
493 */
494 private Map<String, List<String>> queryTokens =
495 new HashMap<String, List<String>>();
496
497 /**
498 * Gets the query tokens for the given query.
499 *
500 * @param query
501 * @return
502 */
503 public synchronized List<String> getQueryTokens(String query) {
504 return queryTokens.get(query);
505 }
506
507 /**
508 * Adds the query tokens for the given query.
509 *
510 * @param query
511 * @param queryTokens
512 */
513 public synchronized void addQueryTokens(String query, List<String> queryTokens) {
514 this.queryTokens.put(query, queryTokens);
515 }
516
517 /**
518 * This method allow exporting results in to the provided file. This method
519 * has not been implemented yet.
520 */
521 public void exportResults(File outputFile) {
522 throw new RuntimeException("ExportResults method is not implemented yet!");
523 }
524
525 public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
526 String annotationType, String featureName, String value)
527 throws SearchException {
528
529 String indexLocation;
530 try {
531 indexLocation =
532 new File(((URL)datastore.getIndexer().getParameters().get(
533 Constants.INDEX_LOCATION_URL)).toURI()).getAbsolutePath();
534
535 }
536 catch(URISyntaxException use) {
537 indexLocation =
538 new File(((URL)datastore.getIndexer().getParameters().get(
539 Constants.INDEX_LOCATION_URL)).getFile()).getAbsolutePath();
540 }
541 IndexSearcher indexSearcher;
542 try { // open the IndexSearcher
543 indexSearcher = new IndexSearcher(indexLocation);
544 }
545 catch(IOException e) {
546 e.printStackTrace();
547 return -1;
548 }
549 int result =
550 StatsCalculator.freq(indexSearcher, corpusToSearchIn,
551 annotationSetToSearchIn, annotationType, featureName, value);
552 try { // close the IndexSearcher
553 indexSearcher.close();
554 }
555 catch(IOException ioe) {
556 ioe.printStackTrace();
557 return -1;
558 }
559 return result;
560 }
561
562 public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
563 String annotationType) throws SearchException {
564 return this.freq(corpusToSearchIn, annotationSetToSearchIn, annotationType,
565 null, null);
566 }
567
568 public int freq(String corpusToSearchIn, String annotationSetToSearchIn,
569 String annotationType, String featureName) throws SearchException {
570 return this.freq(corpusToSearchIn, annotationSetToSearchIn, annotationType,
571 featureName, null);
572 }
573
574 public int freq(List<Hit> patternsToSearchIn, String annotationType,
575 String feature, String value, boolean inMatchedSpan, boolean inContext)
576 throws SearchException {
577 return StatsCalculator.freq(patternsToSearchIn, annotationType, feature,
578 value, inMatchedSpan, inContext);
579 }
580
581 public int freq(List<Hit> patternsToSearchIn, String annotationType,
582 boolean inMatchedSpan, boolean inContext) throws SearchException {
583 return StatsCalculator.freq(patternsToSearchIn, annotationType,
584 inMatchedSpan, inContext);
585 }
586
587 public Map<String, Integer> freqForAllValues(List<Hit> patternsToSearchIn,
588 String annotationType, String feature, boolean inMatchedSpan,
589 boolean inContext) throws SearchException {
590 return StatsCalculator.freqForAllValues(patternsToSearchIn, annotationType,
591 feature, inMatchedSpan, inContext);
592 }
593
594 public void setLuceneDatastore(gate.persist.LuceneDataStoreImpl datastore) {
595 this.datastore = datastore;
596 }
597
598 }
|