001 /*
002 * SimpleCorpus.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Kalina Bontcheva, 23/Jul/2004
013 *
014 * $Id: SimpleCorpus.java 13519 2011-03-09 18:46:22Z nirajaswani $
015 */
016
017 package gate;
018
019 import gate.corpora.DocType;
020 import gate.creole.ResourceInstantiationException;
021 import gate.util.NameBearer;
022
023 import java.io.FileFilter;
024 import java.io.IOException;
025 import java.net.URL;
026 import java.util.List;
027
028 /**
029 * Corpora are lists of Document. TIPSTER equivalent: Collection.
030 */
031 public interface SimpleCorpus extends LanguageResource, List, NameBearer {
032
033 public static final String CORPUS_NAME_PARAMETER_NAME = "name";
034
035 public static final String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";
036
037 /**
038 * Gets the names of the documents in this corpus.
039 *
040 * @return a {@link List} of Strings representing the names of the
041 * documents in this corpus.
042 */
043 public List<String> getDocumentNames();
044
045 /**
046 * Gets the name of a document in this corpus.
047 *
048 * @param index the index of the document
049 * @return a String value representing the name of the document at
050 * <tt>index</tt> in this corpus.
051 */
052 public String getDocumentName(int index);
053
054 /**
055 * Fills this corpus with documents created on the fly from selected
056 * files in a directory. Uses a {@link FileFilter} to select which
057 * files will be used and which will be ignored. A simple file filter
058 * based on extensions is provided in the Gate distribution (
059 * {@link gate.util.ExtensionFileFilter}).
060 *
061 * @param directory the directory from which the files will be picked.
062 * This parameter is an URL for uniformity. It needs to be a
063 * URL of type file otherwise an InvalidArgumentException
064 * will be thrown. An implementation for this method is
065 * provided as a static method at
066 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
067 * .
068 * @param filter the file filter used to select files from the target
069 * directory. If the filter is <tt>null</tt> all the files
070 * will be accepted.
071 * @param encoding the encoding to be used for reading the documents
072 * @param recurseDirectories should the directory be parsed
073 * recursively?. If <tt>true</tt> all the files from the
074 * provided directory and all its children directories (on as
075 * many levels as necessary) will be picked if accepted by
076 * the filter otherwise the children directories will be
077 * ignored.
078 */
079 public void populate(URL directory, FileFilter filter, String encoding,
080 boolean recurseDirectories) throws IOException,
081 ResourceInstantiationException;
082
083 /**
084 * Fills this corpus with documents created on the fly from selected
085 * files in a directory. Uses a {@link FileFilter} to select which
086 * files will be used and which will be ignored. A simple file filter
087 * based on extensions is provided in the Gate distribution (
088 * {@link gate.util.ExtensionFileFilter}).
089 *
090 * @param directory the directory from which the files will be picked.
091 * This parameter is an URL for uniformity. It needs to be a
092 * URL of type file otherwise an InvalidArgumentException
093 * will be thrown. An implementation for this method is
094 * provided as a static method at
095 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}
096 * .
097 * @param filter the file filter used to select files from the target
098 * directory. If the filter is <tt>null</tt> all the files
099 * will be accepted.
100 * @param encoding the encoding to be used for reading the documents
101 *@param mimeType the mime type to be used when loading documents. If
102 * null, then the mime type will be automatically determined.
103 * @param recurseDirectories should the directory be parsed
104 * recursively?. If <tt>true</tt> all the files from the
105 * provided directory and all its children directories (on as
106 * many levels as necessary) will be picked if accepted by
107 * the filter otherwise the children directories will be
108 * ignored.
109 */
110 public void populate(URL directory, FileFilter filter, String encoding,
111 String mimeType, boolean recurseDirectories) throws IOException,
112 ResourceInstantiationException;
113
114 /**
115 * Fills the provided corpus with documents extracted from the
116 * provided trec file.
117 *
118 * @param singleConcatenatedFile the file with multiple documents in it.
119 * @param documentRootElement content between the start and end of
120 * this element is considered for documents.
121 * @param encoding the encoding of the trec file.
122 * @param numberOfFilesToExtract indicates the number of files to
123 * extract from the concatenated file. -1 to indicate all
124 * files.
125 * @param documentNamePrefix the prefix to use for document names when
126 * creating from
127 * @param documentType type of the document (i.e. xml, html etc.)
128 * @return total length of populated documents in the corpus in number
129 * of bytes
130 */
131 public long populate(URL singleConcatenatedFile, String documentRootElement,
132 String encoding, int numberOfDocumentsToExtract,
133 String documentNamePrefix, DocType documentType) throws IOException,
134 ResourceInstantiationException;
135
136 } // interface SimpleCorpus
|