001 /*
002 * CorpusImpl.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Hamish Cunningham, 11/Feb/2000
013 *
014 * $Id: CorpusImpl.java 13525 2011-03-11 14:37:49Z nirajaswani $
015 */
016
017 package gate.corpora;
018
019 import gate.Corpus;
020 import gate.Document;
021 import gate.Factory;
022 import gate.FeatureMap;
023 import gate.Gate;
024 import gate.Resource;
025 import gate.creole.AbstractLanguageResource;
026 import gate.creole.CustomDuplication;
027 import gate.creole.ResourceInstantiationException;
028 import gate.creole.metadata.CreoleParameter;
029 import gate.creole.metadata.CreoleResource;
030 import gate.creole.metadata.Optional;
031 import gate.event.CorpusEvent;
032 import gate.event.CorpusListener;
033 import gate.event.CreoleEvent;
034 import gate.event.CreoleListener;
035 import gate.event.StatusListener;
036 import gate.util.BomStrippingInputStreamReader;
037 import gate.util.Err;
038 import gate.util.Files;
039 import gate.util.Strings;
040
041 import java.io.BufferedReader;
042 import java.io.BufferedWriter;
043 import java.io.File;
044 import java.io.FileFilter;
045 import java.io.FileInputStream;
046 import java.io.FileNotFoundException;
047 import java.io.FileOutputStream;
048 import java.io.FileWriter;
049 import java.io.IOException;
050 import java.io.OutputStreamWriter;
051 import java.io.Serializable;
052 import java.net.URISyntaxException;
053 import java.net.URL;
054 import java.util.AbstractList;
055 import java.util.ArrayList;
056 import java.util.Arrays;
057 import java.util.Collection;
058 import java.util.Collections;
059 import java.util.Comparator;
060 import java.util.Iterator;
061 import java.util.List;
062 import java.util.ListIterator;
063 import java.util.Vector;
064
065 /**
066 * Corpora are sets of Document. They are ordered by lexicographic
067 * collation on Url.
068 */
069 @CreoleResource(name = "GATE Corpus", comment = "GATE transient corpus.", interfaceName = "gate.Corpus", icon = "corpus-trans", helpURL = "http://gate.ac.uk/userguide/sec:developer:loadlr")
070 public class CorpusImpl extends AbstractLanguageResource implements Corpus,
071 CreoleListener,
072 CustomDuplication {
073
074 /** Debug flag */
075 private static final boolean DEBUG = false;
076
/**
 * Creates an empty transient corpus. The documents are kept in a
 * synchronized wrapper around an event-firing {@link VerboseList}, and
 * the new corpus registers itself with the CREOLE register as a
 * listener.
 */
public CorpusImpl() {
  List eventFiringList = new VerboseList();
  supportList = Collections.synchronizedList(eventFiringList);
  Gate.getCreoleRegister().addCreoleListener(this);
}
081
082 /**
083 * Gets the names of the documents in this corpus.
084 *
085 * @return a {@link List} of Strings representing the names of the
086 * documents in this corpus.
087 */
088 public List<String> getDocumentNames() {
089 ArrayList<String> res = new ArrayList<String>(supportList.size());
090 for(Object document : supportList) {
091 res.add(((Document)document).getName());
092 }
093 return res;
094 }
095
096 /**
097 * Gets the name of a document in this corpus.
098 *
099 * @param index the index of the document
100 * @return a String value representing the name of the document at
101 * <tt>index</tt> in this corpus.
102 */
103 public String getDocumentName(int index) {
104 return ((Document)supportList.get(index)).getName();
105 }
106
107 /**
108 * This method does not make sense for transient corpora, so it does
109 * nothing.
110 */
111 public void unloadDocument(Document doc) {
112 return;
113 }
114
115 /**
116 * The underlying list that holds the documents in this corpus.
117 */
118 protected List supportList = null;
119
120 /**
121 * A proxy list that stores the actual data in an internal list and
122 * forwards all operations to that one but it also fires the
123 * appropiate corpus events when necessary. It also does some type
124 * checking so only Documents are accepted as corpus members.
125 */
126 protected class VerboseList extends AbstractList implements Serializable {
127
128 VerboseList() {
129 data = new ArrayList();
130 }
131
132 public Object get(int index) {
133 return data.get(index);
134 }
135
136 public int size() {
137 return data.size();
138 }
139
140 public Object set(int index, Object element) {
141 if(element instanceof Document) {
142 Document oldDoc = (Document)data.set(index, element);
143 Document newDoc = (Document)element;
144
145 // fire the 2 events
146 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
147 CorpusEvent.DOCUMENT_REMOVED));
148 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, newDoc, index,
149 CorpusEvent.DOCUMENT_ADDED));
150 return oldDoc;
151 }
152 else {
153 throw new UnsupportedOperationException(getClass().getName()
154 + " only accepts gate.Document values as members!\n"
155 + element.getClass().getName() + " is not a gate.Document");
156 }
157 }
158
159 public void add(int index, Object element) {
160 if(element instanceof Document) {
161 data.add(index, element);
162
163 // fire the event
164 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, (Document)element,
165 index, CorpusEvent.DOCUMENT_ADDED));
166 }
167 else {
168 throw new UnsupportedOperationException(getClass().getName()
169 + " only accepts gate.Document values as members!\n"
170 + element.getClass().getName() + " is not a gate.Document");
171 }
172 }
173
174 public Object remove(int index) {
175 Document oldDoc = (Document)data.remove(index);
176
177 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, oldDoc, index,
178 CorpusEvent.DOCUMENT_REMOVED));
179 return oldDoc;
180 }
181
182 /**
183 * The List containing the actual data.
184 */
185 ArrayList data;
186 }
187
188 /**
189 * This method returns true when the document is already loaded in
190 * memory
191 */
192 public boolean isDocumentLoaded(int index) {
193 return true;
194 }
195
196 protected void clearDocList() {
197 if(supportList == null) return;
198 supportList.clear();
199 }
200
201 // List methods
202 // java docs will be automatically copied from the List interface.
203
204 public int size() {
205 return supportList.size();
206 }
207
208 public boolean isEmpty() {
209 return supportList.isEmpty();
210 }
211
212 public boolean contains(Object o) {
213 return supportList.contains(o);
214 }
215
216 public Iterator iterator() {
217 return supportList.iterator();
218 }
219
220 public Object[] toArray() {
221 return supportList.toArray();
222 }
223
224 public Object[] toArray(Object[] a) {
225 return supportList.toArray(a);
226 }
227
228 public boolean add(Object o) {
229 return supportList.add(o);
230 }
231
232 public boolean remove(Object o) {
233 return supportList.remove(o);
234 }
235
236 public boolean containsAll(Collection c) {
237 return supportList.containsAll(c);
238 }
239
240 public boolean addAll(Collection c) {
241 return supportList.addAll(c);
242 }
243
244 public boolean addAll(int index, Collection c) {
245 return supportList.addAll(index, c);
246 }
247
248 public boolean removeAll(Collection c) {
249 return supportList.removeAll(c);
250 }
251
252 public boolean retainAll(Collection c) {
253 return supportList.retainAll(c);
254 }
255
256 public void clear() {
257 supportList.clear();
258 }
259
260 public boolean equals(Object o) {
261 if(!(o instanceof CorpusImpl)) return false;
262
263 return supportList.equals(o);
264 }
265
266 public int hashCode() {
267 return supportList.hashCode();
268 }
269
270 public Object get(int index) {
271 return supportList.get(index);
272 }
273
274 public Object set(int index, Object element) {
275 return supportList.set(index, element);
276 }
277
278 public void add(int index, Object element) {
279 supportList.add(index, element);
280 }
281
282 public Object remove(int index) {
283 return supportList.remove(index);
284 }
285
286 public int indexOf(Object o) {
287 return supportList.indexOf(o);
288 }
289
290 public int lastIndexOf(Object o) {
291 return supportList.lastIndexOf(o);
292 }
293
294 public ListIterator listIterator() {
295 return supportList.listIterator();
296 }
297
298 public ListIterator listIterator(int index) {
299 return supportList.listIterator(index);
300 }
301
302 public List subList(int fromIndex, int toIndex) {
303 return supportList.subList(fromIndex, toIndex);
304 }
305
306 /** Construction */
307
308 public void cleanup() {
309 Gate.getCreoleRegister().removeCreoleListener(this);
310 }
311
312 /** Initialise this resource, and return it. */
313 public Resource init() {
314 if(documentsList != null && !documentsList.isEmpty()) {
315 addAll(documentsList);
316 }
317 return this;
318 } // init()
319
320 /**
321 * Fills the provided corpus with documents created on the fly from
322 * selected files in a directory. Uses a {@link FileFilter} to select
323 * which files will be used and which will be ignored. A simple file
324 * filter based on extensions is provided in the Gate distribution (
325 * {@link gate.util.ExtensionFileFilter}).
326 *
327 * @param corpus the corpus to be populated
328 * @param directory the directory from which the files will be picked.
329 * This parameter is an URL for uniformity. It needs to be a
330 * URL of type file otherwise an InvalidArgumentException
331 * will be thrown.
332 * @param filter the file filter used to select files from the target
333 * directory. If the filter is <tt>null</tt> all the files
334 * will be accepted.
335 * @param encoding the encoding to be used for reading the documents
336 * @param recurseDirectories should the directory be parsed
337 * recursively?. If <tt>true</tt> all the files from the
338 * provided directory and all its children directories (on as
339 * many levels as necessary) will be picked if accepted by
340 * the filter otherwise the children directories will be
341 * ignored.
342 * @throws java.io.IOException if a file doesn't exist
343 */
344 public static void populate(Corpus corpus, URL directory, FileFilter filter,
345 String encoding, boolean recurseDirectories) throws IOException {
346 populate(corpus, directory, filter, encoding, null, recurseDirectories);
347 }
348
349 /**
350 * Fills the provided corpus with documents created on the fly from
351 * selected files in a directory. Uses a {@link FileFilter} to select
352 * which files will be used and which will be ignored. A simple file
353 * filter based on extensions is provided in the Gate distribution (
354 * {@link gate.util.ExtensionFileFilter}).
355 *
356 * @param corpus the corpus to be populated
357 * @param directory the directory from which the files will be picked.
358 * This parameter is an URL for uniformity. It needs to be a
359 * URL of type file otherwise an InvalidArgumentException
360 * will be thrown.
361 * @param filter the file filter used to select files from the target
362 * directory. If the filter is <tt>null</tt> all the files
363 * will be accepted.
364 * @param encoding the encoding to be used for reading the documents
365 * @param recurseDirectories should the directory be parsed
366 * recursively?. If <tt>true</tt> all the files from the
367 * provided directory and all its children directories (on as
368 * many levels as necessary) will be picked if accepted by
369 * the filter otherwise the children directories will be
370 * ignored.
371 * @throws java.io.IOException if a file doesn't exist
372 */
373 public static void populate(Corpus corpus, URL directory, FileFilter filter,
374 String encoding, String mimeType, boolean recurseDirectories)
375 throws IOException {
376
377 // check input
378 if(!directory.getProtocol().equalsIgnoreCase("file"))
379 throw new IllegalArgumentException(
380 "The URL provided is not of type \"file:\"!");
381
382 File dir = Files.fileFromURL(directory);
383 if(!dir.exists()) throw new FileNotFoundException(dir.toString());
384
385 if(!dir.isDirectory())
386 throw new IllegalArgumentException(dir.getAbsolutePath()
387 + " is not a directory!");
388
389 File[] files;
390 // populate the corpus
391 if(recurseDirectories) {
392 files = Files.listFilesRecursively(dir, filter);
393 }
394 else {
395 files = dir.listFiles(filter);
396 }
397
398 if(files == null) {
399 return;
400 }
401
402 // sort the files alphabetically regardless of their paths
403 Arrays.sort(files, new Comparator<File>() {
404 public int compare(File f1, File f2) {
405 return f1.getName().compareTo(f2.getName());
406 }
407 });
408
409 // create the GATE documents
410 for(File file : files) {
411 if(file.isDirectory()) {
412 continue;
413 }
414 StatusListener sListener = (StatusListener)Gate.getListeners().get(
415 "gate.event.StatusListener");
416 if(sListener != null)
417 sListener.statusChanged("Reading: " + file.getName());
418 String docName = file.getName() + "_" + Gate.genSym();
419 FeatureMap params = Factory.newFeatureMap();
420 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, file.toURI().toURL());
421 if(encoding != null)
422 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
423 if(mimeType != null)
424 params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
425
426 try {
427 Document doc = (Document)Factory.createResource(DocumentImpl.class
428 .getName(), params, null, docName);
429 corpus.add(doc);
430 if(corpus.getLRPersistenceId() != null) {
431 // persistent corpus -> unload the document
432 corpus.unloadDocument(doc);
433 Factory.deleteResource(doc);
434 }
435 }
436 catch(Throwable t) {
437 String nl = Strings.getNl();
438 Err.prln("WARNING: Corpus.populate could not instantiate document" + nl
439 + " Document name was: " + docName + nl + " Exception was: "
440 + t + nl + nl);
441 t.printStackTrace();
442 }
443 if(sListener != null) sListener.statusChanged(file.getName() + " read");
444 }
445
446 }// public static void populate
447
448 /**
449 * Fills this corpus with documents created from files in a directory.
450 *
451 * @param filter the file filter used to select files from the target
452 * directory. If the filter is <tt>null</tt> all the files
453 * will be accepted.
454 * @param directory the directory from which the files will be picked.
455 * This parameter is an URL for uniformity. It needs to be a
456 * URL of type file otherwise an InvalidArgumentException
457 * will be thrown. An implementation for this method is
458 * provided as a static method at
459 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
460 * .
461 * @param encoding the encoding to be used for reading the documents
462 * @param recurseDirectories should the directory be parsed
463 * recursively?. If <tt>true</tt> all the files from the
464 * provided directory and all its children directories (on as
465 * many levels as necessary) will be picked if accepted by
466 * the filter otherwise the children directories will be
467 * ignored.
468 */
469 public void populate(URL directory, FileFilter filter, String encoding,
470 boolean recurseDirectories) throws IOException,
471 ResourceInstantiationException {
472 populate(this, directory, filter, encoding, null, recurseDirectories);
473 }
474
475 /**
476 * Fills this corpus with documents created from files in a directory.
477 *
478 * @param filter the file filter used to select files from the target
479 * directory. If the filter is <tt>null</tt> all the files
480 * will be accepted.
481 * @param directory the directory from which the files will be picked.
482 * This parameter is an URL for uniformity. It needs to be a
483 * URL of type file otherwise an InvalidArgumentException
484 * will be thrown. An implementation for this method is
485 * provided as a static method at
486 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
487 * .
488 * @param encoding the encoding to be used for reading the documents
489 *@param mimeType the mime type to be used when loading documents. If
490 * null, then the mime type will be detected automatically.
491 *
492 * @param recurseDirectories should the directory be parsed
493 * recursively?. If <tt>true</tt> all the files from the
494 * provided directory and all its children directories (on as
495 * many levels as necessary) will be picked if accepted by
496 * the filter otherwise the children directories will be
497 * ignored.
498 */
499 public void populate(URL directory, FileFilter filter, String encoding,
500 String mimeType, boolean recurseDirectories) throws IOException,
501 ResourceInstantiationException {
502 populate(this, directory, filter, encoding, mimeType, recurseDirectories);
503 }
504
505 private static String replaceAmpChars(String s) {
506 s = s.replaceAll("&", "&");
507 // s = s.replaceAll("<","<");
508 // s = s.replaceAll(">",">");
509 // s = s.replaceAll("\"",""");
510 // s = s.replaceAll("'","'");
511 return s;
512 }
513
514 /**
515 * Fills the provided corpus with documents extracted from the
516 * provided trec file.
517 *
518 * @param corpus the corpus to be populated.
519 * @param singleConcatenatedFile the trec file.
520 * @param documentRootElement text between this element (start and
521 * end) is considered for creating a new document.
522 * @param encoding the encoding of the trec file.
523 * @param numberOfDocumentsToExtract extracts the specified number of
524 * documents from the trecweb file; -1 to indicate all files.
525 * @param documentType type of the document it is (i.e. xml, html etc)
526 * @return total length of populated documents in the corpus in number
527 * of bytes
528 * @throws java.io.IOException
529 */
530 public static long populate(Corpus corpus, URL singleConcatenatedFile,
531 String documentRootElement, String encoding,
532 int numberOfDocumentsToExtract, String documentNamePrefix,
533 DocType documentType) throws IOException {
534
535 // obtain the root element that user has provided
536 // content between the start and end of root element is considered
537 // for creating documents
538 documentRootElement = documentRootElement.toLowerCase();
539
540 // document name prefix could be an empty string
541 documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix
542 .trim()
543 + "_";
544
545 // starting to read the file
546 File dir = null;
547 try {
548 dir = new File(singleConcatenatedFile.toURI());
549 }
550 catch(URISyntaxException use) {
551 throw new IOException(use.getMessage());
552 }
553
554 // it must exist
555 if(!dir.exists()) throw new FileNotFoundException(dir.toString());
556
557 // we are expecting a file
558 if(dir.isDirectory())
559 throw new IllegalArgumentException(dir.getAbsolutePath()
560 + " is a directory!");
561
562 // we start a new document when we find <documentRootElement> and
563 // close it
564 // when we find </documentRootElement>
565 BufferedReader br = null;
566 try {
567 String encodingLine = "";
568 if(encoding != null && encoding.trim().length() != 0) {
569 br = new BomStrippingInputStreamReader(new FileInputStream(dir),
570 encoding, 10485760);
571
572 // if xml add the xml line at the top
573 if(documentType == DocType.XML)
574 encodingLine = "<?xml version=\"1.0\" encoding=\"" + encoding
575 + "\" ?>";
576 }
577 else {
578 br = new BomStrippingInputStreamReader(new FileInputStream(dir),
579 10485760);
580
581 // if xml add the xml line at the top
582 if(documentType == DocType.XML)
583 encodingLine = "<?xml version=\"1.0\" ?>";
584 }
585
586 // reading line by line
587 String line = br.readLine();
588
589 // this is where we store document content
590 StringBuilder documentString = new StringBuilder();
591
592 // toggle switch to indicate search for start element
593 boolean searchingForStartElement = true;
594
595 // keeping count of number of documents extracted
596 int count = 1;
597
598 // length in bytes read so far (to return)
599 long lengthInBytes = 0;
600
601 // continue until reached the end of file
602 while(line != null) {
603
604 // already extracted requested num of documents?
605 if(numberOfDocumentsToExtract != -1
606 && (count - 1) == numberOfDocumentsToExtract) break;
607
608 // lowercase the line in order to match documentRootElement in
609 // any case
610 String lowerCasedLine = line.toLowerCase();
611
612 // if searching for startElement?
613 if(searchingForStartElement) {
614
615 // may be its with attributes
616 int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
617
618 // may be no attributes?
619 if(index < 0) {
620 index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
621 }
622
623 // if index <0, we are out of the content boundaries, so
624 // simply
625 // skip the current line and start reading from the next line
626 if(index < 0) {
627 line = br.readLine();
628 continue;
629 }
630 else {
631
632 // if found, that's the first line
633 documentString.append(encodingLine + "\n" + line.substring(index)
634 + "\n");
635 searchingForStartElement = false;
636 line = br.readLine();
637 continue;
638 }
639 }
640 else {
641
642 // now searching for last element
643 int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
644
645 // if not found.. this is the content of a new document
646 if(index < 0) {
647 documentString.append(line + "\n");
648 line = br.readLine();
649 continue;
650 }
651 else {
652
653 // found.. then end the document
654 documentString.append(line.substring(0, index
655 + documentRootElement.length() + 3));
656
657 // getting ready for the next document
658 searchingForStartElement = true;
659
660 // here lets create a new document
661 // create the doc
662 StatusListener sListener = (StatusListener)gate.Gate.getListeners()
663 .get("gate.event.StatusListener");
664 if(sListener != null)
665 sListener.statusChanged("Reading File Number :" + count);
666 String docName = documentNamePrefix + count + "_" + Gate.genSym();
667 FeatureMap params = Factory.newFeatureMap();
668
669 // lets store this on a disc
670 File tempOutputFile = null;
671 String suffix = "";
672 if(documentType == DocType.XML) {
673 suffix = ".xml";
674 }
675 else if(documentType == DocType.HTML) {
676 suffix = ".html";
677 }
678
679 tempOutputFile = File.createTempFile(docName, suffix);
680 if(sListener != null)
681 sListener.statusChanged("Writing it on disk :"
682 + tempOutputFile.getAbsolutePath());
683
684 BufferedWriter writer = null;
685
686 // proper handing of io calls
687 try {
688 if(encoding != null && encoding.trim().length() > 0) {
689 writer = new BufferedWriter(new OutputStreamWriter(
690 new FileOutputStream(tempOutputFile), encoding));
691 }
692 else {
693 writer = new BufferedWriter(new FileWriter(tempOutputFile));
694 }
695
696 if(documentType == DocType.XML)
697 writer.write(replaceAmpChars(documentString.toString()));
698 else writer.write(documentString.toString());
699 }
700 catch(IOException ioe) {
701 String nl = Strings.getNl();
702 Err
703 .prln("WARNING: Corpus.populate could not instantiate document"
704 + nl
705 + " Document name was: "
706 + docName
707 + nl
708 + " Exception was: " + ioe + nl + nl);
709 ioe.printStackTrace();
710 }
711 finally {
712 if(writer != null) writer.close();
713 }
714
715 // lets create the gate document
716 if(sListener != null)
717 sListener.statusChanged("Creating GATE document for :"
718 + tempOutputFile.getAbsolutePath());
719
720 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, tempOutputFile
721 .toURI().toURL());
722
723 // calculate the length
724 lengthInBytes += documentString.toString().getBytes().length;
725 if(encoding != null && encoding.trim().length() > 0)
726 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
727
728 try {
729 Document doc = (Document)Factory.createResource(
730 DocumentImpl.class.getName(), params, null, docName);
731 count++;
732 corpus.add(doc);
733 if(corpus.getLRPersistenceId() != null) {
734 // persistent corpus -> unload the document
735 corpus.unloadDocument(doc);
736 Factory.deleteResource(doc);
737 }
738 }
739 catch(Throwable t) {
740 String nl = Strings.getNl();
741 Err
742 .prln("WARNING: Corpus.populate could not instantiate document"
743 + nl
744 + " Document name was: "
745 + docName
746 + nl
747 + " Exception was: " + t + nl + nl);
748 t.printStackTrace();
749 }
750 finally {
751 // delete the temporary file created for this document
752 writer = null;
753 tempOutputFile.delete();
754 }
755
756 documentString = new StringBuilder();
757 if(sListener != null)
758 sListener.statusChanged(docName + " created!");
759
760 if(line.length() > index + 7)
761 line = line.substring(index + 6);
762 else line = br.readLine();
763
764 continue;
765 }
766 }
767 }
768 return lengthInBytes;
769 }
770 finally {
771 if(br != null) br.close();
772 }
773 }// public static void populate
774
775 /**
776 * Fills the provided corpus with documents extracted from the
777 * provided single concatenated file.
778 *
779 * @param trecFile the trec file.
780 * @param documentRootElement content between the start and end of
781 * this element is considered for documents.
782 * @param encoding the encoding of the trec file.
783 * @param numberOfFilesToExtract indicates the number of files to
784 * extract from the trecweb file.
785 * @param documentNamePrefix the prefix to use for document names when
786 * creating from
787 * @param documentType type of the document it is (i.e. html, xml)
788 * @return total length of populated documents in the corpus in number
789 * of bytes
790 */
791 public long populate(URL singleConcatenatedFile, String documentRootElement,
792 String encoding, int numberOfFilesToExtract,
793 String documentNamePrefix, DocType documentType) throws IOException,
794 ResourceInstantiationException {
795 return populate(this, singleConcatenatedFile, documentRootElement,
796 encoding, numberOfFilesToExtract, documentNamePrefix, documentType);
797 }
798
799 public synchronized void removeCorpusListener(CorpusListener l) {
800 if(corpusListeners != null && corpusListeners.contains(l)) {
801 Vector v = (Vector)corpusListeners.clone();
802 v.removeElement(l);
803 corpusListeners = v;
804 }
805 }
806
807 public synchronized void addCorpusListener(CorpusListener l) {
808 Vector v = corpusListeners == null
809 ? new Vector(2)
810 : (Vector)corpusListeners.clone();
811 if(!v.contains(l)) {
812 v.addElement(l);
813 corpusListeners = v;
814 }
815 }
816
817 /**
818 * Custom duplication for a corpus - duplicate this corpus in the
819 * usual way, then duplicate the documents in this corpus and add them
820 * to the duplicate.
821 */
822 public Resource duplicate(Factory.DuplicationContext ctx)
823 throws ResourceInstantiationException {
824 Corpus newCorpus = (Corpus)Factory.defaultDuplicate(this, ctx);
825 for(Document d : (List<Document>)this) {
826 newCorpus.add((Document)Factory.duplicate(d, ctx));
827 }
828 return newCorpus;
829 }
830
831 /** Freeze the serialization UID. */
832 static final long serialVersionUID = -1113142759053898456L;
833
834 private transient Vector corpusListeners;
835
836 protected transient java.util.List documentsList;
837
838 protected void fireDocumentAdded(CorpusEvent e) {
839 if(corpusListeners != null) {
840 Vector listeners = corpusListeners;
841 int count = listeners.size();
842 for(int i = 0; i < count; i++) {
843 ((CorpusListener)listeners.elementAt(i)).documentAdded(e);
844 }
845 }
846 }
847
848 protected void fireDocumentRemoved(CorpusEvent e) {
849 if(corpusListeners != null) {
850 Vector listeners = corpusListeners;
851 int count = listeners.size();
852 for(int i = 0; i < count; i++) {
853 ((CorpusListener)listeners.elementAt(i)).documentRemoved(e);
854 }
855 }
856 }
857
858 @Optional
859 @CreoleParameter(collectionElementType = Document.class, comment = "A list of GATE documents")
860 public void setDocumentsList(java.util.List documentsList) {
861 this.documentsList = documentsList;
862 }
863
864 public java.util.List getDocumentsList() {
865 return documentsList;
866 }
867
868 public void resourceLoaded(CreoleEvent e) {
869 }
870
871 public void resourceUnloaded(CreoleEvent e) {
872 Resource res = e.getResource();
873 // remove all occurences
874 if(res instanceof Document) while(contains(res))
875 remove(res);
876 }
877
878 public void resourceRenamed(Resource resource, String oldName, String newName) {
879 }
880
881 public void datastoreOpened(CreoleEvent e) {
882 }
883
884 public void datastoreCreated(CreoleEvent e) {
885 }
886
887 public void datastoreClosed(CreoleEvent e) {
888 }
889 } // class CorpusImpl
|