001 package gate.persist;
002
003 import gate.Corpus;
004 import gate.DataStore;
005 import gate.Document;
006 import gate.Factory;
007 import gate.FeatureMap;
008 import gate.Gate;
009 import gate.LanguageResource;
010 import gate.Resource;
011 import gate.corpora.SerialCorpusImpl;
012 import gate.creole.ResourceInstantiationException;
013 import gate.creole.annic.Constants;
014 import gate.creole.annic.Hit;
015 import gate.creole.annic.IndexException;
016 import gate.creole.annic.Indexer;
017 import gate.creole.annic.SearchException;
018 import gate.creole.annic.SearchableDataStore;
019 import gate.creole.annic.Searcher;
020 import gate.creole.annic.lucene.LuceneIndexer;
021 import gate.creole.annic.lucene.LuceneSearcher;
022 import gate.event.CorpusEvent;
023 import gate.event.CorpusListener;
024 import gate.event.CreoleEvent;
025 import gate.event.CreoleListener;
026 import gate.security.SecurityException;
027 import gate.util.Files;
028 import gate.util.GateRuntimeException;
029 import gate.util.Strings;
030 import gate.util.persistence.PersistenceManager;
031
032 import java.io.BufferedReader;
033 import java.io.File;
034 import java.io.FileOutputStream;
035 import java.io.FileReader;
036 import java.io.IOException;
037 import java.io.OutputStreamWriter;
038 import java.lang.ref.ReferenceQueue;
039 import java.lang.ref.SoftReference;
040 import java.net.URL;
041 import java.util.ArrayList;
042 import java.util.Collection;
043 import java.util.HashMap;
044 import java.util.Iterator;
045 import java.util.List;
046 import java.util.Map;
047 import java.util.concurrent.ConcurrentHashMap;
048 import java.util.concurrent.ConcurrentMap;
049 import java.util.concurrent.Executors;
050 import java.util.concurrent.ScheduledThreadPoolExecutor;
051 import java.util.concurrent.TimeUnit;
052 import java.util.concurrent.atomic.AtomicBoolean;
053
/**
 * A {@link SerialDataStore} extended with an ANNIC (Lucene) index over
 * its documents so that they can be searched through the
 * {@link Searcher} interface. The datastore listens to corpus and
 * CREOLE events to keep the index in sync with the stored resources;
 * documents are re-indexed in the background, after a configurable
 * delay, by a single-threaded executor created in {@link #open()}.
 */
public class LuceneDataStoreImpl extends SerialDataStore implements
                                                         SearchableDataStore,
                                                         CorpusListener,
                                                         CreoleListener {

  /**
   * serial version UID
   */
  private static final long serialVersionUID = 3618696392336421680L;

  /**
   * To store canonical lock objects for each LR ID. Guarded by
   * synchronizing on the map itself (see lockObjectForID).
   */
  protected Map<Object, LabelledSoftReference> lockObjects = new HashMap<Object, LabelledSoftReference>();

  /**
   * Reference queue with which the soft references in the lockObjects
   * map will be registered.
   */
  protected ReferenceQueue<Object> refQueue = new ReferenceQueue<Object>();

  /**
   * Indicates if the datastore is being closed.
   * NOTE(review): never read or written anywhere in this file —
   * possibly vestigial; confirm against subclasses before removing.
   */
  protected boolean dataStoreClosing = false;

  /**
   * Executor to run the indexing tasks. Created as a single-threaded
   * scheduled executor in open() and shut down in close().
   */
  protected ScheduledThreadPoolExecutor executor;

  /**
   * Map keeping track of the most recent indexing task for each LR ID.
   */
  protected ConcurrentMap<Object, IndexingTask> currentTasks = new ConcurrentHashMap<Object, IndexingTask>();

  /**
   * Number of milliseconds we should wait after a sync before
   * attempting to re-index a document. If sync is called again for the
   * same document within this time then the timer for the re-indexing
   * task is reset. Thus if several changes to the same document are
   * made in quick succession it will only be re-indexed once. On the
   * other hand, if the delay is set too long the document may never be
   * indexed until the data store is closed. The default delay is 1000
   * (one second).
   */
  protected long indexDelay = 1000L;

  /**
   * Indexer to be used for indexing documents
   */
  protected Indexer indexer;

  /**
   * Index Parameters. Raw Map for historical reasons; keys are
   * presumably the Constants.* parameter names (e.g.
   * Constants.INDEX_LOCATION_URL, read in setIndexer) — verify against
   * the Indexer implementation in use.
   */
  protected Map indexParameters;

  /**
   * URL of the index directory.
   */
  protected URL indexURL;

  /**
   * Searcher to be used for searching the indexed documents
   */
  protected Searcher searcher;

  /**
   * This is where we store the search parameters.
   * NOTE(review): never referenced in this file — confirm whether
   * subclasses use it before removing.
   */
  protected Map searchParameters;
126
  /**
   * Close the data store. Stops listening for CREOLE events, shuts
   * down the background indexing executor (waiting up to two minutes
   * for running tasks), then runs any still-queued indexing tasks on
   * the current thread before closing the underlying serial datastore.
   */
  public void close() throws PersistenceException {
    // stop listening to Creole events
    Gate.getCreoleRegister().removeCreoleListener(this);
    // shut down the executor. We submit the shutdown request
    // as a zero-delay task rather than calling shutdown directly,
    // in order to interrupt any timed wait currently in progress.
    executor.execute(new Runnable() {
      public void run() {
        executor.shutdown();
      }
    });
    try {
      // allow up to two minutes for indexing to finish
      // NOTE(review): the boolean result of awaitTermination is
      // ignored, so a timeout is indistinguishable from a clean
      // shutdown at this point
      executor.awaitTermination(120, TimeUnit.SECONDS);
    }
    catch(InterruptedException e) {
      // propagate the interruption
      Thread.currentThread().interrupt();
    }

    // At this point, any in-progress indexing tasks have
    // finished. We now process any tasks that were queued
    // but not run, running them in the current thread.
    Collection<IndexingTask> queuedTasks = currentTasks.values();
    // copy the tasks into an array to avoid concurrent
    // modification issues, as IndexingTask.run modifies
    // the currentTasks map
    IndexingTask[] queuedTasksArray = queuedTasks
            .toArray(new IndexingTask[queuedTasks.size()]);
    for(IndexingTask task : queuedTasksArray) {
      task.run();
    }

    super.close();
  } // close()
163
164 /** Open a connection to the data store. */
165 public void open() throws PersistenceException {
166 super.open();
167
168 /*
169 * check if the storage directory is a valid serial datastore if we
170 * want to support old style: String versionInVersionFile = "1.0";
171 * (but this means it will open *any* directory)
172 */
173 try {
174 FileReader fis = new FileReader(getVersionFile());
175 BufferedReader isr = new BufferedReader(fis);
176 currentProtocolVersion = isr.readLine();
177 String indexDirRelativePath = isr.readLine();
178
179 if(indexDirRelativePath != null
180 && indexDirRelativePath.trim().length() > 1) {
181 URL storageDirURL = storageDir.toURI().toURL();
182 URL theIndexURL = new URL(storageDirURL, indexDirRelativePath);
183 // check if index directory exists
184 File indexDir = Files.fileFromURL(theIndexURL);
185 if(!indexDir.exists()) {
186 throw new PersistenceException("Index directory "
187 + indexDirRelativePath
188 + " could not be found for datastore at "
189 + storageDirURL);
190 }
191
192 indexURL = theIndexURL;
193 this.indexer = new LuceneIndexer(indexURL);
194 this.searcher = new LuceneSearcher();
195 ((LuceneSearcher)this.searcher).setLuceneDatastore(this);
196 }
197 isr.close();
198 }
199 catch(IOException e) {
200 throw new PersistenceException("Invalid storage directory: " + e);
201 }
202 if(!isValidProtocolVersion(currentProtocolVersion))
203 throw new PersistenceException("Invalid protocol version number: "
204 + currentProtocolVersion);
205
206 // Lets create a separate indexer thread which keeps running in the
207 // background
208 executor = new ScheduledThreadPoolExecutor(1, Executors
209 .defaultThreadFactory());
210 // set up the executor so it does not execute delayed indexing tasks
211 // that are still waiting when it is shut down. We run these tasks
212 // immediately at shutdown time rather than waiting.
213 executor.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
214 executor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
215 // start listening to Creole events
216 Gate.getCreoleRegister().addCreoleListener(this);
217 }
218
219 /**
220 * Obtain the lock object on which we must synchronize when loading or
221 * saving the LR with the given ID.
222 *
223 * @param id
224 * @return
225 */
226 private Object lockObjectForID(Object id) {
227 synchronized(lockObjects) {
228 processRefQueue();
229 Object lock = null;
230 if(lockObjects.containsKey(id)) {
231 lock = lockObjects.get(id).get();
232 }
233 if(lock == null) {
234 lockObjects.remove(id);
235 lock = new Object();
236 LabelledSoftReference ref = new LabelledSoftReference(lock);
237 ref.label = id;
238 lockObjects.put(id, ref);
239 }
240
241 return lock;
242 }
243 }
244
245 /**
246 * Cleans up the lockObjects map by removing any entries whose
247 * SoftReference values have been cleared by the garbage collector.
248 */
249 private void processRefQueue() {
250 LabelledSoftReference ref = null;
251 while((ref = LabelledSoftReference.class.cast(refQueue.poll())) != null) {
252 // check that the queued ref hasn't already been replaced in the
253 // map
254 if(lockObjects.get(ref.label) == ref) {
255 lockObjects.remove(ref.label);
256 }
257 }
258 }
259
260 /**
261 * Submits the given LR ID for indexing. The task is delayed by 5
262 * seconds, so multiple updates to the same LR in close succession do
263 * not un-necessarily trigger multiple re-indexing passes.
264 */
265 protected void queueForIndexing(Object lrID) {
266 IndexingTask existingTask = currentTasks.get(lrID);
267 if(existingTask != null) {
268 existingTask.disable();
269 }
270
271 IndexingTask newTask = new IndexingTask(lrID);
272 currentTasks.put(lrID, newTask);
273 // set the LR to be indexed after the configured delay
274 executor.schedule(newTask, indexDelay, TimeUnit.MILLISECONDS);
275 }
276
  /**
   * Delete a resource from the data store. Any pending indexing task
   * for the resource is disabled, the resource file is removed under
   * the per-ID lock (so we cannot clash with an indexer thread reading
   * it), and the index is then updated: for a corpus, its member
   * documents are queued for re-indexing so they stop referring to it;
   * for any other resource, the resource itself is removed from the
   * index.
   *
   * @param lrClassName class name of the resource being deleted
   * @param lrPersistenceId persistence ID of the resource
   * @throws PersistenceException if deletion or the index update fails
   */
  public void delete(String lrClassName, Object lrPersistenceId)
          throws PersistenceException {

    // cancel any not-yet-run re-indexing of the resource we are about
    // to delete
    IndexingTask task = currentTasks.get(lrPersistenceId);
    if(task != null) {
      task.disable();
    }

    // and we delete it from the datastore
    // we obtained the lock on this - in order to avoid clashing between
    // the object being loaded by the indexer thread and the thread that
    // deletes it
    Object lock = lockObjectForID(lrPersistenceId);
    synchronized(lock) {
      super.delete(lrClassName, lrPersistenceId);
    }
    lock = null;

    /*
     * lets first find out if the deleted resource is a corpus. Deleting
     * a corpus does not require deleting all its member documents but
     * we need to remove the reference of corpus from all its underlying
     * documents in index
     */
    try {
      if(Corpus.class.isAssignableFrom(Class.forName(lrClassName, true, Gate
              .getClassLoader()))) {
        /*
         * we would issue a search query to obtain all documents which
         * belong to his corpus and set them as referring to null
         * instead of refering to the given corpus
         */
        Map<String, Object> parameters = new HashMap<String, Object>();
        parameters.put(Constants.INDEX_LOCATION_URL, indexURL);
        parameters.put(Constants.CORPUS_ID, lrPersistenceId.toString());
        try {
          // the query string "nothing" looks like a placeholder —
          // presumably the searcher matches on the CORPUS_ID parameter
          // rather than the query text; TODO confirm against the
          // Searcher implementation
          boolean success = getSearcher().search("nothing", parameters);
          if(!success) return;

          // fetch all hits in one go (-1)
          Hit[] hits = getSearcher().next(-1);
          if(hits == null || hits.length == 0) {
            // do nothing
            return;
          }

          // re-index every member document so its stored corpus
          // reference is refreshed
          for(int i = 0; i < hits.length; i++) {
            String docID = hits[i].getDocumentID();
            queueForIndexing(docID);
          }
        }
        catch(SearchException se) {
          throw new PersistenceException(se);
        }
        // corpus handled; the corpus itself has no entry of its own to
        // remove from the index
        return;
      }
    }
    catch(ClassNotFoundException cnfe) {
      // don't do anything: if the class cannot be resolved we fall
      // through and treat the resource as a plain document below
    }

    // we want to delete this document from the Index as well
    ArrayList<Object> removed = new ArrayList<Object>();
    removed.add(lrPersistenceId);
    try {
      synchronized(indexer) {
        this.indexer.remove(removed);
      }
    }
    catch(IndexException ie) {
      throw new PersistenceException(ie);
    }
  }
352
353 /**
354 * Get a resource from the persistent store. <B>Don't use this method
355 * - use Factory.createResource with DataStore and DataStoreInstanceId
356 * parameters set instead.</B> (Sometimes I wish Java had "friend"
357 * declarations...)
358 */
359 public LanguageResource getLr(String lrClassName, Object lrPersistenceId)
360 throws PersistenceException, SecurityException {
361 LanguageResource lr = super.getLr(lrClassName, lrPersistenceId);
362 if(lr instanceof Corpus) {
363 ((Corpus)lr).addCorpusListener(this);
364 }
365 return lr;
366 }
367
368 /**
369 * Save: synchonise the in-memory image of the LR with the persistent
370 * image.
371 */
372 public void sync(LanguageResource lr) throws PersistenceException {
373 if(lr.getLRPersistenceId() != null) {
374 // lock the LR ID so we don't write to the file while an
375 // indexer task is reading it
376 Object lock = lockObjectForID(lr.getLRPersistenceId());
377 synchronized(lock) {
378 super.sync(lr);
379 }
380 lock = null;
381 }
382 else {
383 super.sync(lr);
384 }
385
386 if(lr instanceof Document) {
387 queueForIndexing(lr.getLRPersistenceId());
388 }
389 }
390
391 /**
392 * Sets the Indexer to be used for indexing Datastore
393 */
394 public void setIndexer(Indexer indexer, Map indexParameters)
395 throws IndexException {
396
397 this.indexer = indexer;
398 this.indexParameters = indexParameters;
399 this.indexURL = (URL)this.indexParameters.get(Constants.INDEX_LOCATION_URL);
400 this.indexer.createIndex(this.indexParameters);
401
402 // dump the version file
403 try {
404 File versionFile = getVersionFile();
405 OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(
406 versionFile));
407 osw.write(versionNumber + Strings.getNl());
408 String indexDirRelativePath = PersistenceManager.getRelativePath(
409 storageDir.toURI().toURL(), indexURL);
410 osw.write(indexDirRelativePath);
411 osw.close();
412 }
413 catch(IOException e) {
414 throw new IndexException("couldn't write version file: " + e);
415 }
416 }
417
418 public Indexer getIndexer() {
419 return this.indexer;
420 }
421
422 public void setSearcher(Searcher searcher) throws SearchException {
423 this.searcher = searcher;
424 if(this.searcher instanceof LuceneSearcher) {
425 ((LuceneSearcher)this.searcher).setLuceneDatastore(this);
426 }
427 }
428
429 public Searcher getSearcher() {
430 return this.searcher;
431 }
432
433 /**
434 * Sets the delay in milliseconds that we should wait after a sync
435 * before attempting to re-index a document. If sync is called again
436 * for the same document within this time then the timer for the
437 * re-indexing task is reset. Thus if several changes to the same
438 * document are made in quick succession it will only be re-indexed
439 * once. On the other hand, if the delay is set too long the document
440 * may never be indexed until the data store is closed. The default
441 * delay is 1000ms (one second), which should be appropriate for usage
442 * in the GATE GUI.
443 */
444 public void setIndexDelay(long indexDelay) {
445 this.indexDelay = indexDelay;
446 }
447
448 public long getIndexDelay() {
449 return indexDelay;
450 }
451
452 /**
453 * Search the datastore
454 */
455 public boolean search(String query, Map searchParameters)
456 throws SearchException {
457 return this.searcher.search(query, searchParameters);
458 }
459
460 /**
461 * Returns the next numberOfPatterns
462 *
463 * @param numberOfPatterns
464 * @return null if no patterns found
465 */
466 public Hit[] next(int numberOfPatterns) throws SearchException {
467 return this.searcher.next(numberOfPatterns);
468 }
469
470 // Corpus Events
471 /**
472 * This method is invoked whenever a document is removed from a corpus
473 */
474 public void documentRemoved(CorpusEvent ce) {
475 Object docLRID = ce.getDocumentLRID();
476
477 /*
478 * we need to remove this document from the index
479 */
480 if(docLRID != null) {
481 ArrayList<Object> removed = new ArrayList<Object>();
482 removed.add(docLRID);
483 try {
484 synchronized(indexer) {
485 indexer.remove(removed);
486 }
487 }
488 catch(IndexException ie) {
489 throw new GateRuntimeException(ie);
490 }
491 // queueForIndexing(docLRID);
492 }
493 }
494
  /**
   * This method is invoked whenever a document is added to a particular
   * corpus.
   */
  public void documentAdded(CorpusEvent ce) {
    /*
     * Deliberately a no-op: sync is automatically called when a
     * document is added to a corpus which is part of the datastore, and
     * sync() already queues documents for indexing.
     */
  }
506
  /*
   * (non-Javadoc)
   *
   * @see
   * gate.event.CreoleListener#datastoreClosed(gate.event.CreoleEvent)
   */
  // no-op: our own clean-up is performed in close()
  public void datastoreClosed(CreoleEvent e) {
  }

  /*
   * (non-Javadoc)
   *
   * @see
   * gate.event.CreoleListener#datastoreCreated(gate.event.CreoleEvent)
   */
  // no-op: nothing to do when some datastore is created
  public void datastoreCreated(CreoleEvent e) {
  }

  /*
   * (non-Javadoc)
   *
   * @see
   * gate.event.CreoleListener#datastoreOpened(gate.event.CreoleEvent)
   */
  // no-op: nothing to do when some datastore is opened
  public void datastoreOpened(CreoleEvent e) {
  }

  /*
   * (non-Javadoc)
   *
   * @see
   * gate.event.CreoleListener#resourceLoaded(gate.event.CreoleEvent)
   */
  // no-op: corpus listeners are attached in getLr() instead
  public void resourceLoaded(CreoleEvent e) {
  }

  /*
   * (non-Javadoc)
   *
   * @see gate.event.CreoleListener#resourceRenamed(gate.Resource,
   * java.lang.String, java.lang.String)
   */
  // no-op: renaming a resource does not affect the index
  public void resourceRenamed(Resource resource, String oldName, String newName) {
  }
551
552 /*
553 * (non-Javadoc)
554 *
555 * @see
556 * gate.event.CreoleListener#resourceUnloaded(gate.event.CreoleEvent)
557 */
558 public void resourceUnloaded(CreoleEvent e) {
559 // if the resource being close is one of our corpora. we need to
560 // remove
561 // the corpus listener associated with it
562 Resource res = e.getResource();
563 if(res instanceof Corpus) {
564 ((Corpus)res).removeCorpusListener(this);
565 }
566 }
567
  /**
   * Runnable that (re-)indexes one LR, identified by its persistence
   * ID, when executed. Instances are scheduled with a delay by
   * queueForIndexing() and can be disabled if superseded by a newer
   * task for the same LR, or by delete().
   */
  protected class IndexingTask implements Runnable {
    // flipped to true once the task runs or is superseded; guarantees
    // the indexing work executes at most once
    private AtomicBoolean disabled = new AtomicBoolean(false);

    // persistence ID of the LR this task will index
    private Object lrID;

    /** @param lrID persistence ID of the LR to (re-)index */
    public IndexingTask(Object lrID) {
      this.lrID = lrID;
    }

    /** Marks this task as superseded, making run() a no-op. */
    public void disable() {
      disabled.set(true);
    }

    public void run() {
      // remove this task from the currentTasks map if it has not been
      // superseded by a later task
      currentTasks.remove(lrID, this);
      // only run the rest of the process if this task has not been
      // disabled (because a newer task for the same LR was scheduled).
      // We set the disabled flag at this point so the same task cannot
      // be run twice.
      if(disabled.compareAndSet(false, true)) {
        Document doc = null;
        // read the document from datastore, hidden from the GUI
        FeatureMap features = Factory.newFeatureMap();
        features.put(DataStore.LR_ID_FEATURE_NAME, lrID);
        features
                .put(DataStore.DATASTORE_FEATURE_NAME, LuceneDataStoreImpl.this);
        FeatureMap hidefeatures = Factory.newFeatureMap();
        Gate.setHiddenAttribute(hidefeatures, true);
        try {
          // lock the LR ID so we don't try and read a file
          // which is in the process of being written
          Object lock = lockObjectForID(lrID);
          synchronized(lock) {
            doc = (Document)Factory.createResource("gate.corpora.DocumentImpl",
                    features, hidefeatures);
          }
          lock = null;
        }
        catch(ResourceInstantiationException rie) {
          // this means the LR ID was null
          doc = null;
        }

        // if the document is not null,
        // proceed to indexing it
        if(doc != null) {

          /*
           * we need to reindex this document in order to synchronize it
           * lets first remove it from the index
           */
          ArrayList<Object> removed = new ArrayList<Object>();
          removed.add(lrID);
          try {
            synchronized(indexer) {
              indexer.remove(removed);
            }
          }
          catch(IndexException ie) {
            throw new GateRuntimeException(ie);
          }

          // and add it back
          ArrayList<Document> added = new ArrayList<Document>();
          added.add(doc);

          try {
            // persistence ID of the corpus the document belongs to, if
            // any; stays null for a corpus-less document
            String corpusPID = null;

            /*
             * we need to find out the corpus which this document
             * belongs to one easy way is to check all instances of
             * serial corpus loaded in memory
             */
            List scs = Gate.getCreoleRegister().getLrInstances(
                    SerialCorpusImpl.class.getName());
            if(scs != null) {
              /*
               * we need to check which corpus the deleted class
               * belonged to
               */
              Iterator iter = scs.iterator();
              while(iter.hasNext()) {
                SerialCorpusImpl sci = (SerialCorpusImpl)iter.next();
                if(sci != null) {
                  if(sci.contains(doc)) {
                    corpusPID = sci.getLRPersistenceId().toString();
                    break;
                  }
                }
              }
            }

            /*
             * it is also possible that the document is loaded from
             * datastore without being loaded from the corpus (e.g.
             * using getLR(...) method of datastore) in this case the
             * relevant corpus won't exist in memory, so each stored
             * corpus is loaded temporarily and checked for membership
             */
            if(corpusPID == null) {
              List corpusPIDs = getLrIds(SerialCorpusImpl.class.getName());
              if(corpusPIDs != null) {
                for(int i = 0; i < corpusPIDs.size(); i++) {
                  Object corpusID = corpusPIDs.get(i);

                  SerialCorpusImpl corpusLR = null;
                  // we will have to load this corpus (hidden from the
                  // GUI, like the document above)
                  FeatureMap params = Factory.newFeatureMap();
                  params.put(DataStore.DATASTORE_FEATURE_NAME,
                          LuceneDataStoreImpl.this);
                  params.put(DataStore.LR_ID_FEATURE_NAME, corpusID);
                  hidefeatures = Factory.newFeatureMap();
                  Gate.setHiddenAttribute(hidefeatures, true);
                  // lock the corpus ID while reading it from disk
                  Object lock = lockObjectForID(corpusID);
                  synchronized(lock) {
                    corpusLR = (SerialCorpusImpl)Factory.createResource(
                            SerialCorpusImpl.class.getCanonicalName(), params,
                            hidefeatures);
                  }
                  lock = null;

                  if(corpusLR != null) {
                    if(corpusLR.contains(doc)) {
                      corpusPID = corpusLR.getLRPersistenceId().toString();
                    }
                    // unload the temporarily-loaded corpus again
                    Factory.deleteResource(corpusLR);
                    if(corpusPID != null) break;
                  }
                }
              }
            }

            synchronized(indexer) {
              // corpusPID may legitimately still be null here
              indexer.add(corpusPID, added);
            }

            Factory.deleteResource(doc);
          }
          catch(Exception ie) {
            // NOTE(review): this swallows every failure after printing
            // it, leaving the document un-indexed and (on some paths)
            // not deleted from memory — consider logging properly or
            // rethrowing
            ie.printStackTrace();
          }
        }
      }
    }

  }
716
717 /**
718 * Soft reference with an associated label.
719 */
720 private class LabelledSoftReference extends SoftReference<Object> {
721 Object label;
722
723 public LabelledSoftReference(Object referent) {
724 super(referent);
725 }
726 }
727 }
|