0001 /*
0002 * SerialCorpusImpl.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Kalina Bontcheva, 19/Oct/2001
0013 *
0014 * $Id: SerialCorpusImpl.java 13518 2011-03-09 18:35:34Z nirajaswani $
0015 */
0016
0017 package gate.corpora;
0018
0019 import gate.Corpus;
0020 import gate.DataStore;
0021 import gate.Document;
0022 import gate.Factory;
0023 import gate.FeatureMap;
0024 import gate.Gate;
0025 import gate.GateConstants;
0026 import gate.Resource;
0027 import gate.creole.AbstractLanguageResource;
0028 import gate.creole.CustomDuplication;
0029 import gate.creole.ResourceInstantiationException;
0030 import gate.creole.ir.IREngine;
0031 import gate.creole.ir.IndexDefinition;
0032 import gate.creole.ir.IndexException;
0033 import gate.creole.ir.IndexManager;
0034 import gate.creole.ir.IndexStatistics;
0035 import gate.creole.ir.IndexedCorpus;
0036 import gate.creole.metadata.CreoleResource;
0037 import gate.event.CorpusEvent;
0038 import gate.event.CorpusListener;
0039 import gate.event.CreoleEvent;
0040 import gate.event.CreoleListener;
0041 import gate.event.DatastoreEvent;
0042 import gate.event.DatastoreListener;
0043 import gate.persist.PersistenceException;
0044 import gate.security.SecurityException;
0045 import gate.util.Err;
0046 import gate.util.GateRuntimeException;
0047 import gate.util.MethodNotImplementedException;
0048 import gate.util.Out;
0049
0050 import java.io.FileFilter;
0051 import java.io.IOException;
0052 import java.io.ObjectInputStream;
0053 import java.net.URL;
0054 import java.util.ArrayList;
0055 import java.util.Collection;
0056 import java.util.Iterator;
0057 import java.util.List;
0058 import java.util.ListIterator;
0059 import java.util.Vector;
0060
0061 // The initial design was to implement this on the basis of a WeakValueHashMap.
0062 // However this creates problems, because the user might e.g., add a transient
0063 // document to the corpus and then if the Document variable goes out of scope
0064 // before sync() is called, nothing will be saved of the new document. Bad!
0065 // Instead, to cope with the unloading for memory saving use, I implemented
0066 // a documentUnload() method, which sets the in-memory copy to null but can
0067 // always restore the doc, because it has its persistence ID.
0068
0069 @CreoleResource(name = "GATE Serial Corpus", isPrivate = true, comment = "GATE persistent corpus (serialisation)", icon = "corpus", helpURL = "http://gate.ac.uk/userguide/sec:developer:datastores")
0070 public class SerialCorpusImpl extends AbstractLanguageResource
0071 implements
0072 Corpus,
0073 CreoleListener,
0074 DatastoreListener,
0075 IndexedCorpus,
0076 CustomDuplication {
0077
0078 /** Debug flag */
0079 private static final boolean DEBUG = false;
0080
0081 static final long serialVersionUID = 3632609241787241616L;
0082
0083 protected transient Vector corpusListeners;
0084
0085 protected java.util.List docDataList = null;
0086
// the in-memory documents, stored at the same position as their entry
// in docDataList (which defines the document order); a null element
// means the document at that position is not currently loaded
0089 protected transient List documents = null;
0090
0091 protected transient IndexManager indexManager = null;
0092
0093 protected transient List addedDocs = null;
0094
0095 protected transient List removedDocIDs = null;
0096
0097 protected transient List changedDocs = null;
0098
0099 public SerialCorpusImpl() {
0100 }
0101
0102 /**
0103 * Constructor to create a SerialCorpus from a transient one. This is
0104 * called by adopt() to store the transient corpus and re-route the
0105 * methods calls to it, until the corpus is sync-ed on disk. After
0106 * that, the transientCorpus will always be null, so the new
0107 * functionality will be used instead.
0108 */
0109 protected SerialCorpusImpl(Corpus tCorpus) {
0110 // copy the corpus name and features from the one in memory
0111 this.setName(tCorpus.getName());
0112 this.setFeatures(tCorpus.getFeatures());
0113
0114 docDataList = new ArrayList();
0115 // now cache the names of all docs for future use
0116 List docNames = tCorpus.getDocumentNames();
0117 for(int i = 0; i < docNames.size(); i++) {
0118 Document doc = (Document)tCorpus.get(i);
0119 docDataList.add(new DocumentData((String)docNames.get(i), null, doc
0120 .getClass().getName()));
0121 }
0122
0123 // copy all the documents from the transient corpus
0124 documents = new ArrayList();
0125 documents.addAll(tCorpus);
0126
0127 // make sure we fire events when docs are added/removed/etc
0128 Gate.getCreoleRegister().addCreoleListener(this);
0129 }
0130
0131 /**
0132 * Gets the names of the documents in this corpus.
0133 *
0134 * @return a {@link List} of Strings representing the names of the
0135 * documents in this corpus.
0136 */
0137 public List<String> getDocumentNames() {
0138 List<String> docsNames = new ArrayList<String>();
0139 if(docDataList == null) return docsNames;
0140 for(Object aDocDataList : docDataList) {
0141 DocumentData data = (DocumentData)aDocDataList;
0142 docsNames.add(data.getDocumentName());
0143 }
0144 return docsNames;
0145 }
0146
0147 /**
0148 * Gets the persistent IDs of the documents in this corpus.
0149 *
0150 * @return a {@link List} of Objects representing the persistent IDs
0151 * of the documents in this corpus.
0152 */
0153 public List getDocumentPersistentIDs() {
0154 List docsIDs = new ArrayList();
0155 if(docDataList == null) return docsIDs;
0156 Iterator iter = docDataList.iterator();
0157 while(iter.hasNext()) {
0158 DocumentData data = (DocumentData)iter.next();
0159 docsIDs.add(data.getPersistentID());
0160 }
0161 return docsIDs;
0162 }
0163
0164 /**
0165 * Gets the persistent IDs of the documents in this corpus.
0166 *
0167 * @return a {@link List} of Objects representing the persistent IDs
0168 * of the documents in this corpus.
0169 */
0170 public List getDocumentClassTypes() {
0171 List docsIDs = new ArrayList();
0172 if(docDataList == null) return docsIDs;
0173 Iterator iter = docDataList.iterator();
0174 while(iter.hasNext()) {
0175 DocumentData data = (DocumentData)iter.next();
0176 docsIDs.add(data.getClassType());
0177 }
0178 return docsIDs;
0179 }
0180
0181 /**
0182 * This method should only be used by the Serial Datastore to set
0183 */
0184 public void setDocumentPersistentID(int index, Object persID) {
0185 if(index >= docDataList.size()) return;
0186 ((DocumentData)docDataList.get(index)).setPersistentID(persID);
0187 if(DEBUG) Out.prln("IDs are now: " + docDataList);
0188 }
0189
0190 /**
0191 * Gets the name of a document in this corpus.
0192 *
0193 * @param index the index of the document
0194 * @return a String value representing the name of the document at
0195 * <tt>index</tt> in this corpus.
0196 * <P>
0197 */
0198 public String getDocumentName(int index) {
0199 if(index >= docDataList.size()) return "No such document";
0200
0201 return ((DocumentData)docDataList.get(index)).getDocumentName();
0202 }
0203
0204 /**
0205 * Gets the persistent ID of a document in this corpus.
0206 *
0207 * @param index the index of the document
0208 * @return a value representing the persistent ID of the document at
0209 * <tt>index</tt> in this corpus.
0210 * <P>
0211 */
0212 public Object getDocumentPersistentID(int index) {
0213 if(index >= docDataList.size()) return null;
0214 return ((DocumentData)docDataList.get(index)).getPersistentID();
0215 }
0216
0217 public String getDocumentClassType(int index) {
0218 if(index >= docDataList.size()) return null;
0219 return ((DocumentData)docDataList.get(index)).getClassType();
0220 }
0221
0222 /**
0223 * Unloads a document from memory.
0224 *
0225 * @param index the index of the document to be unloaded.
0226 * @param sync should the document be sync'ed (i.e. saved) before
0227 * unloading.
0228 */
0229 public void unloadDocument(int index, boolean sync) {
0230 // 1. check whether its been loaded and is a persistent one
0231 // if a persistent doc is not loaded, there's nothing we need to do
0232 if((!isDocumentLoaded(index)) && isPersistentDocument(index)) return;
0233 // 2. If requested, sync the document before releasing it from
0234 // memory,
0235 // because the creole register garbage collects all LRs which are
0236 // not used
0237 // any more
0238 if(sync) {
0239 Document doc = (Document)documents.get(index);
0240 try {
0241 // if the document is not already adopted, we need to do that
0242 // first
0243 if(doc.getLRPersistenceId() == null) {
0244 doc = (Document)this.getDataStore().adopt(doc, null);
0245 this.getDataStore().sync(doc);
0246 this.setDocumentPersistentID(index, doc.getLRPersistenceId());
0247 }
0248 else // if it is adopted, just sync it
0249 this.getDataStore().sync(doc);
0250 }
0251 catch(PersistenceException ex) {
0252 throw new GateRuntimeException("Error unloading document from corpus"
0253 + "because document sync failed: " + ex.getMessage());
0254 }
0255 catch(gate.security.SecurityException ex1) {
0256 throw new GateRuntimeException("Error unloading document from corpus"
0257 + "because of document access error: " + ex1.getMessage());
0258 }
0259 }
0260 // 3. remove the document from the memory
0261 // do this, only if the saving has succeeded
0262 documents.set(index, null);
0263 }
0264
0265 /**
0266 * Unloads a document from memory
0267 *
0268 * @param doc the document to be unloaded
0269 * @param sync should the document be sync'ed (i.e. saved) before
0270 * unloading.
0271 */
0272 public void unloadDocument(Document doc, boolean sync) {
0273 if(DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
0274 // 1. determine the index of the document; if not there, do nothing
0275 int index = findDocument(doc);
0276 if(index == -1) return;
0277 if(DEBUG) Out.prln("Index of doc: " + index);
0278 if(DEBUG) Out.prln("Size of corpus: " + documents.size());
0279 unloadDocument(index, sync);
0280 // documents.remove(new Integer(index));
0281 }
0282
0283 /**
0284 * Unloads a document from memory, calling sync() first, to store the
0285 * changes.
0286 *
0287 * @param doc the document to be unloaded.
0288 */
0289 public void unloadDocument(Document doc) {
0290 unloadDocument(doc, true);
0291 }
0292
0293 /**
0294 * Unloads the document from memory, calling sync() first, to store
0295 * the changes.
0296 *
0297 * @param index the index of the document to be unloaded.
0298 */
0299 public void unloadDocument(int index) {
0300 unloadDocument(index, true);
0301 }
0302
0303 /**
0304 * This method returns true when the document is already loaded in
0305 * memory
0306 */
0307 public boolean isDocumentLoaded(int index) {
0308 if(documents == null || documents.isEmpty()) return false;
0309 return documents.get(index) != null;
0310 }
0311
0312 /**
0313 * This method returns true when the document is already stored on
0314 * disk i.e., is not transient
0315 */
0316 public boolean isPersistentDocument(int index) {
0317 if(documents == null || documents.isEmpty()) return false;
0318 return (((DocumentData)docDataList.get(index)).getPersistentID() != null);
0319 }
0320
0321 /**
0322 * Every LR that is a CreoleListener (and other Listeners too) must
0323 * override this method and make sure it removes itself from the
0324 * objects which it has been listening to. Otherwise, the object will
0325 * not be released from memory (memory leak!).
0326 */
0327 public void cleanup() {
0328 if(DEBUG) Out.prln("serial corpus cleanup called");
0329 if(corpusListeners != null) corpusListeners = null;
0330 if(documents != null) documents.clear();
0331 docDataList.clear();
0332 Gate.getCreoleRegister().removeCreoleListener(this);
0333 if(this.dataStore != null) {
0334 this.dataStore.removeDatastoreListener(this);
0335 }
0336 }
0337
0338 /**
0339 * Fills this corpus with documents created from files in a directory.
0340 *
0341 * @param filter the file filter used to select files from the target
0342 * directory. If the filter is <tt>null</tt> all the files
0343 * will be accepted.
0344 * @param directory the directory from which the files will be picked.
0345 * This parameter is an URL for uniformity. It needs to be a
0346 * URL of type file otherwise an InvalidArgumentException
0347 * will be thrown. An implementation for this method is
0348 * provided as a static method at
0349 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
0350 * .
0351 * @param encoding the encoding to be used for reading the documents
0352 * @param recurseDirectories should the directory be parsed
0353 * recursively?. If <tt>true</tt> all the files from the
0354 * provided directory and all its children directories (on as
0355 * many levels as necessary) will be picked if accepted by
0356 * the filter otherwise the children directories will be
0357 * ignored.
0358 */
0359 public void populate(URL directory, FileFilter filter, String encoding,
0360 boolean recurseDirectories) throws IOException,
0361 ResourceInstantiationException {
0362 CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
0363 }
0364
0365 /**
0366 * Fills this corpus with documents created from files in a directory.
0367 *
0368 * @param filter the file filter used to select files from the target
0369 * directory. If the filter is <tt>null</tt> all the files
0370 * will be accepted.
0371 * @param directory the directory from which the files will be picked.
0372 * This parameter is an URL for uniformity. It needs to be a
0373 * URL of type file otherwise an InvalidArgumentException
0374 * will be thrown. An implementation for this method is
0375 * provided as a static method at
0376 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
0377 * .
0378 * @param encoding the encoding to be used for reading the documents
0379 * @param recurseDirectories should the directory be parsed
0380 * recursively?. If <tt>true</tt> all the files from the
0381 * provided directory and all its children directories (on as
0382 * many levels as necessary) will be picked if accepted by
0383 * the filter otherwise the children directories will be
0384 * ignored.
0385 */
0386 public void populate(URL directory, FileFilter filter, String encoding,
0387 String mimeType, boolean recurseDirectories) throws IOException,
0388 ResourceInstantiationException {
0389 CorpusImpl.populate(this, directory, filter, encoding, mimeType,
0390 recurseDirectories);
0391 }
0392
0393 /**
0394 * Fills the provided corpus with documents extracted from the
0395 * provided single concatenated file.
0396 *
0397 * @param trecFile the trec file.
0398 * @param documentRootElement content between the start and end of
0399 * this element is considered for documents.
0400 * @param encoding the encoding of the trec file.
0401 * @param numberOfFilesToExtract indicates the number of files to
0402 * extract from the trecweb file.
0403 * @param documentNamePrefix the prefix to use for document names when
0404 * creating from
0405 * @param documentType type of the document it is (i.e. xml or html
0406 * etc.)
0407 * @return total length of populated documents in the corpus in number
0408 * of bytes
0409 */
0410 public long populate(URL singleConcatenatedFile, String documentRootElement,
0411 String encoding, int numberOfFilesToExtract,
0412 String documentNamePrefix, DocType documentType) throws IOException,
0413 ResourceInstantiationException {
0414 return CorpusImpl.populate(this, singleConcatenatedFile,
0415 documentRootElement, encoding, numberOfFilesToExtract,
0416 documentNamePrefix, documentType);
0417 }
0418
0419 public synchronized void removeCorpusListener(CorpusListener l) {
0420 if(corpusListeners != null && corpusListeners.contains(l)) {
0421 Vector v = (Vector)corpusListeners.clone();
0422 v.removeElement(l);
0423 corpusListeners = v;
0424 }
0425 }
0426
0427 public synchronized void addCorpusListener(CorpusListener l) {
0428 Vector v = corpusListeners == null
0429 ? new Vector(2)
0430 : (Vector)corpusListeners.clone();
0431 if(!v.contains(l)) {
0432 v.addElement(l);
0433 corpusListeners = v;
0434 }
0435 }
0436
0437 protected void fireDocumentAdded(CorpusEvent e) {
0438 if(corpusListeners != null) {
0439 Vector listeners = corpusListeners;
0440 int count = listeners.size();
0441 for(int i = 0; i < count; i++) {
0442 ((CorpusListener)listeners.elementAt(i)).documentAdded(e);
0443 }
0444 }
0445 }
0446
0447 protected void fireDocumentRemoved(CorpusEvent e) {
0448 if(corpusListeners != null) {
0449 Vector listeners = corpusListeners;
0450 int count = listeners.size();
0451 for(int i = 0; i < count; i++) {
0452 ((CorpusListener)listeners.elementAt(i)).documentRemoved(e);
0453 }
0454 }
0455 }
0456
0457 public void resourceLoaded(CreoleEvent e) {
0458 }
0459
0460 public void resourceRenamed(Resource resource, String oldName, String newName) {
0461 }
0462
0463 public void resourceUnloaded(CreoleEvent e) {
0464 Resource res = e.getResource();
0465 if(res instanceof Document) {
0466 Document doc = (Document)res;
0467 if(DEBUG) Out.prln("resource Unloaded called ");
0468 // remove from the corpus too, if a transient one
0469 if(doc.getDataStore() != this.getDataStore()) {
0470 this.remove(doc);
0471 }
0472 else {
0473 // unload all occurences
0474 int index = indexOf(res);
0475 if(index < 0) return;
0476 documents.set(index, null);
0477 if(DEBUG)
0478 Out.prln("corpus: document " + index + " unloaded and set to null");
0479 } // if
0480 }
0481 }
0482
0483 public void datastoreOpened(CreoleEvent e) {
0484 }
0485
0486 public void datastoreCreated(CreoleEvent e) {
0487 }
0488
0489 public void datastoreClosed(CreoleEvent e) {
0490 if(!e.getDatastore().equals(this.getDataStore())) return;
0491 if(this.getDataStore() != null)
0492 this.getDataStore().removeDatastoreListener(this);
0493 // close this corpus, since it cannot stay open when the DS it comes
0494 // from
0495 // is closed
0496 Factory.deleteResource(this);
0497 }
0498
0499 /**
0500 * Called by a datastore when a new resource has been adopted
0501 */
0502 public void resourceAdopted(DatastoreEvent evt) {
0503 }
0504
0505 /**
0506 * Called by a datastore when a resource has been deleted
0507 */
0508 public void resourceDeleted(DatastoreEvent evt) {
0509 DataStore ds = (DataStore)evt.getSource();
0510 // 1. check whether this datastore fired the event. If not, return.
0511 if(!ds.equals(this.dataStore)) return;
0512
0513 Object docID = evt.getResourceID();
0514 if(docID == null) return;
0515
0516 if(DEBUG) Out.prln("Resource deleted called for: " + docID);
0517 // first check if it is this corpus that's been deleted, it must be
0518 // unloaded immediately
0519 if(docID.equals(this.getLRPersistenceId())) {
0520 Factory.deleteResource(this);
0521 return;
0522 }// if
0523
0524 boolean isDirty = false;
0525 // the problem here is that I only have the doc persistent ID
0526 // and nothing else, so I need to determine the index of the doc
0527 // first
0528 for(int i = 0; i < docDataList.size(); i++) {
0529 DocumentData docData = (DocumentData)docDataList.get(i);
0530 // we've found the correct document
0531 // don't break the loop, because it might appear more than once
0532 if(docID.equals(docData.getPersistentID())) {
0533 if(evt.getResource() == null) {
0534 // instead of calling remove() which tries to load the
0535 // document
0536 // remove it from the documents and docDataList
0537 documentRemoved(((DocumentData)docDataList.get(i)).persistentID
0538 .toString());
0539 docDataList.remove(i);
0540 documents.remove(i);
0541 isDirty = true;
0542 i--;
0543 continue;
0544 }
0545
0546 remove(i);
0547 isDirty = true;
0548 }// if
0549 }// for loop through the doc data
0550
0551 if(isDirty) try {
0552 this.dataStore.sync(this);
0553 }
0554 catch(PersistenceException ex) {
0555 throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
0556 }
0557 catch(SecurityException sex) {
0558 throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
0559 }
0560 }// resourceDeleted
0561
0562 /**
0563 * Called by a datastore when a resource has been wrote into the
0564 * datastore
0565 */
0566 public void resourceWritten(DatastoreEvent evt) {
0567 if(evt.getResourceID().equals(this.getLRPersistenceId())) {
0568 thisResourceWritten();
0569 }
0570 }
0571
0572 // List methods
0573 // java docs will be automatically copied from the List interface.
0574
0575 public int size() {
0576 return docDataList.size();
0577 }
0578
0579 public boolean isEmpty() {
0580 return docDataList.isEmpty();
0581 }
0582
0583 public boolean contains(Object o) {
0584 // return true if:
0585 // - the document data list contains a document with such a name
0586 // and persistent id
0587
0588 if(!(o instanceof Document)) return false;
0589
0590 int index = findDocument((Document)o);
0591 if(index < 0)
0592 return false;
0593 else return true;
0594 }
0595
0596 public Iterator iterator() {
0597 return new Iterator() {
0598 Iterator docDataIter = docDataList.iterator();
0599
0600 public boolean hasNext() {
0601 return docDataIter.hasNext();
0602 }
0603
0604 public Object next() {
0605
0606 // try finding a document with the same name and persistent ID
0607 DocumentData docData = (DocumentData)docDataIter.next();
0608 int index = docDataList.indexOf(docData);
0609 return SerialCorpusImpl.this.get(index);
0610 }
0611
0612 public void remove() {
0613 throw new UnsupportedOperationException("SerialCorpusImpl does not "
0614 + "support remove in the iterators");
0615 }
0616 }; // return
0617
0618 }// iterator
0619
0620 public String toString() {
0621 return "document data " + docDataList.toString() + " documents "
0622 + documents;
0623 }
0624
0625 public Object[] toArray() {
0626 // there is a problem here, because some docs might not be
0627 // instantiated
0628 throw new MethodNotImplementedException(
0629 "toArray() is not implemented for SerialCorpusImpl");
0630 }
0631
0632 public Object[] toArray(Object[] a) {
0633 // there is a problem here, because some docs might not be
0634 // instantiated
0635 throw new MethodNotImplementedException(
0636 "toArray(Object[] a) is not implemented for SerialCorpusImpl");
0637 }
0638
0639 public boolean add(Object o) {
0640 if(!(o instanceof Document) || o == null) return false;
0641 Document doc = (Document)o;
0642
0643 // make it accept only docs from its own datastore
0644 if(doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) {
0645 Err.prln("Error: Persistent corpus can only accept documents "
0646 + "from its own datastore!");
0647 return false;
0648 }// if
0649
0650 // add the document with its index in the docDataList
0651 // in this case, since it's going to be added to the end
0652 // the index will be the size of the docDataList before
0653 // the addition
0654 DocumentData docData = new DocumentData(doc.getName(), doc
0655 .getLRPersistenceId(), doc.getClass().getName());
0656 boolean result = docDataList.add(docData);
0657 documents.add(doc);
0658 documentAdded(doc);
0659 fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this, doc, docDataList
0660 .size() - 1, doc.getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED));
0661
0662 return result;
0663 }
0664
0665 public boolean remove(Object o) {
0666 if(DEBUG) Out.prln("SerialCorpus:Remove object called");
0667 if(!(o instanceof Document)) return false;
0668 Document doc = (Document)o;
0669
0670 // see if we can find it first. If not, then judt return
0671 int index = findDocument(doc);
0672 if(index == -1) return false;
0673
0674 if(index < docDataList.size()) { // we found it, so remove it
0675 // by Andrey Shafirin: this part of code can produce an exception
0676 // if
0677 // document wasn't loaded
0678 String docName = ((DocumentData)docDataList.get(index)).getDocumentName();
0679 Object docPersistentID = getDocumentPersistentID(index);
0680 docDataList.remove(index);
0681 // Document oldDoc = (Document) documents.remove(index);
0682 documents.remove(index);
0683 // if (DEBUG) Out.prln("documents after remove of " +
0684 // oldDoc.getName()
0685 // + " are " + documents);
0686 if(DEBUG)
0687 Out.prln("documents after remove of " + docName + " are " + documents);
0688 // documentRemoved(oldDoc.getLRPersistenceId().toString());
0689 if(docPersistentID != null) documentRemoved(docPersistentID.toString());
0690 // fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
0691 // oldDoc,
0692 // index,
0693 // CorpusEvent.DOCUMENT_REMOVED));
0694 fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this, (Document)o,
0695 index, docPersistentID, CorpusEvent.DOCUMENT_REMOVED));
0696 }
0697
0698 return true;
0699 }
0700
0701 public int findDocument(Document doc) {
0702 boolean found = false;
0703 DocumentData docData = null;
0704
0705 // first try finding the document in memory
0706 int index = documents.indexOf(doc);
0707 if(index > -1 && index < docDataList.size()) return index;
0708
0709 // else try finding a document with the same name and persistent ID
0710 Iterator iter = docDataList.iterator();
0711 for(index = 0; iter.hasNext(); index++) {
0712 docData = (DocumentData)iter.next();
0713 if(docData.getDocumentName().equals(doc.getName())
0714 && docData.getPersistentID().equals(doc.getLRPersistenceId())
0715 && docData.getClassType().equals(doc.getClass().getName())) {
0716 found = true;
0717 break;
0718 }
0719 }
0720 if(found && index < docDataList.size())
0721 return index;
0722 else return -1;
0723 }// findDocument
0724
0725 public boolean containsAll(Collection c) {
0726 Iterator iter = c.iterator();
0727 while(iter.hasNext()) {
0728 if(!contains(iter.next())) return false;
0729 }
0730 return true;
0731 }
0732
0733 public boolean addAll(Collection c) {
0734 boolean allAdded = true;
0735 Iterator iter = c.iterator();
0736 while(iter.hasNext()) {
0737 if(!add(iter.next())) allAdded = false;
0738 }
0739 return allAdded;
0740 }
0741
0742 public boolean addAll(int index, Collection c) {
0743 throw new UnsupportedOperationException();
0744 }
0745
0746 public boolean removeAll(Collection c) {
0747 boolean allRemoved = true;
0748 Iterator iter = c.iterator();
0749 while(iter.hasNext()) {
0750 if(!remove(iter.next())) allRemoved = false;
0751 }
0752 return allRemoved;
0753
0754 }
0755
0756 public boolean retainAll(Collection c) {
0757 throw new UnsupportedOperationException();
0758 }
0759
0760 public void clear() {
0761 documents.clear();
0762 docDataList.clear();
0763 }
0764
0765 public boolean equals(Object o) {
0766 if(!(o instanceof SerialCorpusImpl)) return false;
0767 SerialCorpusImpl oCorpus = (SerialCorpusImpl)o;
0768 if(oCorpus == null) return false;
0769 if(oCorpus == this) return true;
0770 if((oCorpus.lrPersistentId == this.lrPersistentId || (this.lrPersistentId != null && this.lrPersistentId
0771 .equals(oCorpus.lrPersistentId)))
0772 && oCorpus.name.equals(this.name)
0773 && (oCorpus.dataStore == this.dataStore || oCorpus.dataStore
0774 .equals(this.dataStore))
0775 && oCorpus.docDataList.equals(docDataList)) return true;
0776 return false;
0777 }
0778
0779 public int hashCode() {
0780 return docDataList.hashCode();
0781 }
0782
0783 public Object get(int index) {
0784 if(index >= docDataList.size()) return null;
0785
0786 Object res = documents.get(index);
0787
0788 if(DEBUG)
0789 Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
0790
0791 // if the document is null, then I must get it from the DS
0792 if(res == null) {
0793 FeatureMap parameters = Factory.newFeatureMap();
0794 parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
0795 try {
0796 parameters.put(DataStore.LR_ID_FEATURE_NAME, ((DocumentData)docDataList
0797 .get(index)).getPersistentID());
0798 Resource lr = Factory.createResource(((DocumentData)docDataList
0799 .get(index)).getClassType(), parameters);
0800 if(DEBUG) Out.prln("Loaded document :" + lr.getName());
0801 // change the result to the newly loaded doc
0802 res = lr;
0803
0804 // finally replace the doc with the instantiated version
0805 documents.set(index, lr);
0806 }
0807 catch(ResourceInstantiationException ex) {
0808 Err.prln("Error reading document inside a serialised corpus.");
0809 throw new GateRuntimeException(ex);
0810 }
0811 }
0812
0813 return res;
0814 }
0815
0816 public Object set(int index, Object element) {
0817 throw new gate.util.MethodNotImplementedException();
0818 // fire the 2 events
0819 /*
0820 * fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
0821 * oldDoc, ((Integer) key).intValue(),
0822 * CorpusEvent.DOCUMENT_REMOVED)); fireDocumentAdded(new
0823 * CorpusEvent(SerialCorpusImpl.this, newDoc, ((Integer)
0824 * key).intValue(), CorpusEvent.DOCUMENT_ADDED));
0825 */
0826 }
0827
0828 public void add(int index, Object o) {
0829 if(!(o instanceof Document) || o == null) return;
0830 Document doc = (Document)o;
0831
0832 DocumentData docData = new DocumentData(doc.getName(), doc
0833 .getLRPersistenceId(), doc.getClass().getName());
0834 docDataList.add(index, docData);
0835
0836 documents.add(index, doc);
0837 documentAdded(doc);
0838 fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this, doc, index, doc
0839 .getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED));
0840
0841 }
0842
0843 public Object remove(int index) {
0844 if(DEBUG) Out.prln("Remove index called");
0845 // try to get the actual document if it was loaded
0846 Document res = isDocumentLoaded(index) ? (Document)get(index) : null;
0847 Object docLRID = ((DocumentData)docDataList.get(index)).persistentID;
0848 if(docLRID != null) documentRemoved(docLRID.toString());
0849 docDataList.remove(index);
0850 documents.remove(index);
0851 fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this, res, index,
0852 docLRID, CorpusEvent.DOCUMENT_REMOVED));
0853 return res;
0854 }
0855
0856 public int indexOf(Object o) {
0857 if(o instanceof Document) return findDocument((Document)o);
0858
0859 return -1;
0860 }
0861
0862 public int lastIndexOf(Object o) {
0863 throw new gate.util.MethodNotImplementedException();
0864 }
0865
0866 public ListIterator listIterator() {
0867 throw new gate.util.MethodNotImplementedException();
0868 }
0869
0870 public ListIterator listIterator(int index) {
0871 throw new gate.util.MethodNotImplementedException();
0872 }
0873
0874 /**
0875 * persistent Corpus does not support this method as all the documents
0876 * might no be in memory
0877 */
0878 public List subList(int fromIndex, int toIndex) {
0879 throw new gate.util.MethodNotImplementedException();
0880 }
0881
0882 public void setDataStore(DataStore dataStore)
0883 throws gate.persist.PersistenceException {
0884 super.setDataStore(dataStore);
0885 if(this.dataStore != null) this.dataStore.addDatastoreListener(this);
0886 }
0887
0888 public void setTransientSource(Object source) {
0889 if(!(source instanceof Corpus)) return;
0890
0891 // the following initialisation is only valid when we're
0892 // constructing
0893 // this object from a transient one. If it has already been stored
0894 // in
0895 // a datastore, then the initialisation is done in readObject()
0896 // since
0897 // this method is the one called by serialisation, when objects
0898 // are restored.
0899 if(this.dataStore != null && this.lrPersistentId != null) return;
0900
0901 Corpus tCorpus = (Corpus)source;
0902
0903 // copy the corpus name and features from the one in memory
0904 this.setName(tCorpus.getName());
0905 this.setFeatures(tCorpus.getFeatures());
0906
0907 docDataList = new ArrayList();
0908 // now cache the names of all docs for future use
0909 List docNames = tCorpus.getDocumentNames();
0910 for(int i = 0; i < docNames.size(); i++) {
0911 Document aDoc = (Document)tCorpus.get(i);
0912 docDataList.add(new DocumentData((String)docNames.get(i), null, aDoc
0913 .getClass().getName()));
0914 }
0915
0916 // copy all the documents from the transient corpus
0917 documents = new ArrayList();
0918 documents.addAll(tCorpus);
0919
0920 this.addedDocs = new Vector();
0921 this.removedDocIDs = new Vector();
0922 this.changedDocs = new Vector();
0923
0924 // make sure we fire events when docs are added/removed/etc
0925 Gate.getCreoleRegister().addCreoleListener(this);
0926
0927 }
0928
0929 // we don't keep the transient source, so always return null
0930 // Sill this must be implemented, coz of the GUI and Factory
0931 public Object getTransientSource() {
0932 return null;
0933 }
0934
0935 public Resource init() throws gate.creole.ResourceInstantiationException {
0936 super.init();
0937
0938 return this;
0939
0940 }
0941
0942 /**
0943 * readObject - calls the default readObject() and then initialises
0944 * the transient data
0945 *
0946 * @serialData Read serializable fields. No optional data read.
0947 */
0948 private void readObject(ObjectInputStream s) throws IOException,
0949 ClassNotFoundException {
0950 s.defaultReadObject();
0951 documents = new ArrayList(docDataList.size());
0952 for(int i = 0; i < docDataList.size(); i++)
0953 documents.add(null);
0954 corpusListeners = new Vector();
0955 // finally set the creole listeners if the LR is like that
0956 Gate.getCreoleRegister().addCreoleListener(this);
0957 if(this.dataStore != null) this.dataStore.addDatastoreListener(this);
0958
0959 // if indexed construct the manager.
0960 IndexDefinition definition = (IndexDefinition)this.getFeatures().get(
0961 GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
0962 if(definition != null) {
0963 String className = definition.getIrEngineClassName();
0964 try {
0965 // Class aClass = Class.forName(className);
0966 Class aClass = Class.forName(className, true, Gate.getClassLoader());
0967 IREngine engine = (IREngine)aClass.newInstance();
0968 this.indexManager = engine.getIndexmanager();
0969 this.indexManager.setIndexDefinition(definition);
0970 this.indexManager.setCorpus(this);
0971 }
0972 catch(Exception e) {
0973 e.printStackTrace(Err.getPrintWriter());
0974 }
0975 // switch (definition.getIndexType()) {
0976 // case GateConstants.IR_LUCENE_INVFILE:
0977 // this.indexManager = new LuceneIndexManager();
0978 // this.indexManager.setIndexDefinition(definition);
0979 // this.indexManager.setCorpus(this);
0980 // break;
0981 // }
0982 this.addedDocs = new Vector();
0983 this.removedDocIDs = new Vector();
0984 this.changedDocs = new Vector();
0985 }
0986 }// readObject
0987
0988 public void setIndexDefinition(IndexDefinition definition) {
0989 if(definition != null) {
0990 this.getFeatures().put(GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY,
0991 definition);
0992
0993 String className = definition.getIrEngineClassName();
0994 try {
0995 // Class aClass = Class.forName(className);
0996 Class aClass = Class.forName(className, true, Gate.getClassLoader());
0997 IREngine engine = (IREngine)aClass.newInstance();
0998 this.indexManager = engine.getIndexmanager();
0999 this.indexManager.setIndexDefinition(definition);
1000 this.indexManager.setCorpus(this);
1001 }
1002 catch(Exception e) {
1003 e.printStackTrace(Err.getPrintWriter());
1004 }
1005 // switch (definition.getIndexType()) {
1006 // case GateConstants.IR_LUCENE_INVFILE:
1007 // this.indexManager = new LuceneIndexManager();
1008 // this.indexManager.setIndexDefinition(definition);
1009 // this.indexManager.setCorpus(this);
1010 // break;
1011 // }
1012 this.addedDocs = new Vector();
1013 this.removedDocIDs = new Vector();
1014 this.changedDocs = new Vector();
1015 }
1016 }
1017
1018 public IndexDefinition getIndexDefinition() {
1019 return (IndexDefinition)this.getFeatures().get(
1020 GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
1021 }
1022
1023 public IndexManager getIndexManager() {
1024 return this.indexManager;
1025 }
1026
1027 public IndexStatistics getIndexStatistics() {
1028 return (IndexStatistics)this.getFeatures().get(
1029 GateConstants.CORPUS_INDEX_STATISTICS_FEATURE_KEY);
1030 }
1031
1032 private void documentAdded(Document doc) {
1033 if(indexManager != null) {
1034 addedDocs.add(doc);
1035 }
1036 }
1037
1038 private void documentRemoved(String lrID) {
1039 if(indexManager != null) {
1040 removedDocIDs.add(lrID);
1041 }
1042 }
1043
1044 private void thisResourceWritten() {
1045 if(indexManager != null) {
1046 try {
1047 for(int i = 0; i < documents.size(); i++) {
1048 if(documents.get(i) != null) {
1049 Document doc = (Document)documents.get(i);
1050 if(!addedDocs.contains(doc) && doc.isModified()) {
1051 changedDocs.add(doc);
1052 }
1053 }
1054 }
1055 indexManager.sync(addedDocs, removedDocIDs, changedDocs);
1056 }
1057 catch(IndexException ie) {
1058 ie.printStackTrace();
1059 }
1060 }
1061 }
1062
1063 /**
1064 * SerialCorpusImpl does not support duplication.
1065 */
1066 public Resource duplicate(Factory.DuplicationContext ctx)
1067 throws ResourceInstantiationException {
1068 throw new ResourceInstantiationException("Duplication of "
1069 + this.getClass().getName() + " not permitted");
1070 }
1071
1072 }
|