001 /*
002 * CookBook.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Hamish Cunningham, 16/Feb/2000
013 *
014 * $Id: CookBook.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate;
018
019 import java.io.*;
020 import java.util.*;
021
022 import junit.framework.*;
023
024 import gate.creole.*;
025 import gate.creole.gazetteer.DefaultGazetteer;
026 import gate.creole.orthomatcher.OrthoMatcher;
027 import gate.creole.splitter.SentenceSplitter;
028 import gate.creole.tokeniser.DefaultTokeniser;
029 import gate.util.*;
030
031
032 /**
033 * <P><B>NOTE: this class has been REPLACED by the GateExamples package;
034 * see
035 * <A HREF=http://gate.ac.uk/GateExamples/doc/>http://gate.ac.uk/GateExamples/doc/</A>.</B>
036 *
037 * <P>
038 * This class provides examples of using the GATE APIs.
039 * Read this documentation along with a copy of the
040 * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source
041 * code</A>.
042 *
043 * <P>
044 * The CookBook is set up as
045 * part of the GATE test suite (using the
046 * <A HREF="http://www.junit.org/">JUnit testing framework</A>), so there's
047 * an easy way to run the examples (viz.,
048 * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method,
049 * which will invoke the
050 * JUnit test runner). Also, we can use JUnit's assert methods: e.g.
051 * <TT>assertTrue(corpus.isEmpty());</TT>
052 * tests that a corpus object is empty, and creates a test failure report if
053 * this is not the case. (To add a new test class to the suite, see the
054 * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.)
055 *
056 * <P>
057 * Programming to the GATE Java API involves manipulating the classes and
058 * interfaces in the <A HREF=package-summary.html>gate package</A>
059 * (and to a lesser extent other packages). Most of these are
060 * interfaces; the classes there mostly exist to provide
061 * access to objects that implement the interfaces (without exposing those
062 * implementations). In other words, there's a lot of interface-based design
063 * around.
064 *
065 * <P>
066 * For more details and for a conceptual view, see
067 * <A HREF=http://gate.ac.uk/userguide/>Developing Language Processing
068 * Components with GATE</A> (for which this class provides some of the
069 * examples).
070 *
071 * <P>
072 * The rest of this documentation refers to methods in the code that
073 * provide examples of using the GATE API.
074 *
075 * <P>
076 * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives
077 * an example of creating a resource via
078 * <A HREF=../gate/Factory.html>gate.Factory</A>.
079 *
080 * <P>
081 * The <A HREF=Corpus.html>Corpus interface</A> represents collections of
082 * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER
083 * <TT>Collection</TT> class).
084 *
085 * <P>
086 * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method
087 * gives an example of how to create a new transient Corpus object.
088 *
089 * <P>
090 * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives
091 * examples of adding documents to corpora.
092 *
093 * <P>
094 * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives
095 * examples of adding annotations to documents.
096 *
097 *
098 * <P>
099 * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives
100 * examples of using features. <A HREF=FeatureMap.html>The FeatureMap
101 * interface</A> is a mechanism for associating arbitrary data with GATE
102 * entities. Corpora, documents and annotations all share this
103 * mechanism. Simple feature maps use Java's Map interface.
104 *
105 *
106 * <H3>Other sources of examples</H3>
107 *
108 * <P>
109 * See also the other test classes, although note that they also use methods
110 * that are not part of the public API. Test classes include:
111 * <A HREF=corpora/TestCreole.html>TestCreole</A>;
112 * <A HREF=corpora/TestCorpus.html>TestCorpus</A>;
113 * <A HREF=corpora/TestDocument.html>TestDocument</A>;
114 * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything
115 * else starting "Test" - about 30 of them at the last count.
116 */
117 public class CookBook extends TestCase
118 {
119 /** Debug flag */
120 private static final boolean DEBUG = false;
121
122 /** A corpus */
123 Corpus corpus = null;
124
125 /** A document */
126 Document doc1 = null;
127
128 /** Another document */
129 Document doc2 = null;
130
131 /** Constructing a resource */
132 public void testResourceCreation() throws GateException {
133
134 // before creating a resource we need a feature map to store
135 // parameter values
136 FeatureMap params = Factory.newFeatureMap();
137
138 // to create a document we need a sourceUrlName parameter giving
139 // the location of the source for the document content
140 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
141 Gate.getUrl("tests/doc0.html"));
142 params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME,
143 new Boolean(true));
144 Resource res = Factory.createResource("gate.corpora.DocumentImpl", params);
145
146 // now we have a document
147 assertTrue(
148 "should be document but the class is: " + res.getClass().getName(),
149 res instanceof gate.Document
150 );
151 Document doc = (Document) res;
152 AnnotationSet markupAnnotations = doc.getAnnotations(
153 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
154 //this is useless as doc.getAnnotations() will never return null!
155 assertNotNull("no markup annotations on doc " + doc, markupAnnotations);
156 int numMarkupAnnotations = markupAnnotations.size();
157 if(DEBUG)
158 Out.prln("annotations on doc after unpack= " + numMarkupAnnotations);
159 assertTrue(
160 "wrong number annots on doc: " + doc + numMarkupAnnotations,
161 numMarkupAnnotations == 20
162 );
163
164 } // testResourceCreation
165
166 /** Constructing a corpus */
167 public void testCorpusConstruction() throws GateException {
168
169 // corpus constructors require a name
170 corpus = Factory.newCorpus("My example corpus");
171
172 // the corpus interface inherits all the sorted set methods
173 assertTrue(corpus.isEmpty());
174
175 } // testCorpusConstruction
176
177 /** Adding documents to a corpus */
178 public void testAddingDocuments() throws GateException {
179
180 corpus = Factory.newCorpus("My example corpus");
181
182 // add a document or two....
183 corpus.add(doc1);
184 corpus.add(doc2);
185
186 // iterate the corpus members and do some random tests
187 Iterator iter = corpus.iterator();
188 while(iter.hasNext()) {
189 Document doc = (Document) iter.next();
190 assertTrue(
191 "document url not as expected",
192 doc.getSourceUrl().toExternalForm().endsWith("doc0.html") ||
193 doc.getSourceUrl().toExternalForm().endsWith("test1.htm")
194 );
195 } // while
196
197 } // testAddingDocuments
198
199 /** Adding annotations to documents */
200 public void testAddingAnnotations() {
201 AnnotationSet as = doc1.getAnnotations();
202 FeatureMap fm = doc1.getFeatures();
203 Integer id;
204
205 // during creation of annotations offsets are checked and an invalid
206 // offset exception thrown if they are invalid
207 try {
208 id = as.add(new Long(10), new Long(20), "T1", fm);
209 } catch (InvalidOffsetException e) {
210 fail(e.toString());
211 }
212 } // testAddingAnnotations
213
214 /** Using the FeatureMap interface */
215 public void testUsingFeatures() {
216 AnnotationSet as = doc1.getAnnotations();
217 Integer id; // the id of new annotations
218
219 // putting features on documents
220 FeatureMap fm = Factory.newFeatureMap();
221 doc1.setFeatures(fm);
222 assertTrue(fm.size() == 0);
223 fm.put("author", "segovia");
224 assertTrue(fm.get("author").equals("segovia"));
225 fm.put("author", "brendl"); // map puts overwrite existing values
226 assertTrue(fm.get("author").equals("brendl"));
227 assertTrue(fm.size() == 1);
228
229 } // testUsingFeatures
230
231 /** String to print when wrong command-line args */
232 private static String usage =
233 "usage: CookBook [-dir directory-name | file(s)]";
234
235 /**
236 * Main function: an example of embedding GATE-based
237 * batch processing. The method:
238 * <UL>
239 * <LI>
240 * initialises the GATE library, and creates PRs for
241 * tokenisation, sentence splitting and part of speech tagging
242 * <LI>
243 * takes a directory name as argument (-dir option) or just a list
244 * of files
245 * <LI>
246 * creates a directory called "out" and an index.html file there
247 * <LI>
248 * for each .html file in that directory:
249 * <BR> create a GATE document from the file
250 * <BR> run the PRs on the document
251 * <BR> dump some output for the file to "out/gate__[file name].txt",
252 * and add a line to the index
253 * </UL>
254 */
255 public static void main(String[] args) throws Exception {
256 // say "hi"
257 Out.prln("CookBook.main");
258 Out.prln("processing command line arguments");
259
260 // check we have a directory name or list of files
261 List inputFiles = null;
262 if(args.length < 1) throw new GateException(usage);
263
264 // set up a list of all the files to process
265 if(args[0].equals("-dir")) { // list all the files in the dir
266 if(args.length < 2) throw new GateException(usage);
267 File dir = new File(args[1]);
268 File[] filesArray = dir.listFiles();
269 if(filesArray == null)
270 throw new GateException(
271 dir.getPath() + " is not a directory; " + usage
272 );
273 inputFiles = Arrays.asList(filesArray);
274
275 } else { // all args should be file names
276 inputFiles = new ArrayList();
277 for(int i = 0; i < args.length; i++)
278 inputFiles.add(new File(args[i]));
279 }
280
281 // did we get some file names?
282 if(inputFiles.isEmpty()) {
283 throw new GateException("No files to process!");
284 }
285
286 // initialise GATE
287 Out.prln("initialising GATE");
288 Gate.init();
289
290 // create some processing resources
291 Out.prln("creating PRs");
292 //create a tokeniser
293 DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource(
294 "gate.creole.tokeniser.DefaultTokeniser");
295 //create a sentence splitter
296 SentenceSplitter splitter = (SentenceSplitter)Factory.createResource(
297 "gate.creole.splitter.SentenceSplitter");
298 //create a POS tagger
299 POSTagger tagger = (POSTagger)Factory.createResource(
300 "gate.creole.POSTagger");
301
302 //create a gazetteer
303 DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource(
304 "gate.creole.gazetteer.DefaultGazetteer");
305
306 //create a grammar
307 ANNIETransducer transducer = (ANNIETransducer)Factory.createResource(
308 "gate.creole.ANNIETransducer");
309
310 //create an orthomatcher
311 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
312 "gate.creole.orthomatcher.OrthoMatcher");
313
314 // make the "out" directory that will contain the results.
315 String outDirName =
316 ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out";
317 if(! new File(outDirName).mkdir()){
318 throw new GateException("Could not create the output directory");
319 }
320
321 // construct a name for the output index file; open; dump header
322 String nl = Strings.getNl(); // shorthand for platform's newline
323 String fsep =
324 Strings.getFileSep(); // shorthand for platform's file separator
325 String indexName =
326 ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html";
327 FileWriter indexWriter = new FileWriter(new File(indexName));
328 indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>");
329 indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl);
330
331 // main loop:
332 // for each document
333 // create a gate doc
334 // set as the document for the PRs
335 // run the PRs
336 // dump output from the doc to out/gate__.....txt
337 // delete the doc
338
339 // loop on files list
340 Iterator filesIter = inputFiles.iterator();
341 Out.prln("looping on input files list");
342 while(filesIter.hasNext()) {
343 File inFile = (File) filesIter.next(); // the current file
344 Out.prln("processing file " + inFile.getPath());
345 FeatureMap params = Factory.newFeatureMap(); // params list for new doc
346
347 // set the source URL parameter to a "file:..." URL string
348 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
349 inFile.toURI().toURL().toExternalForm());
350
351 // use the platform's default encoding rather than GATE's
352 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
353
354 // create the document
355 Document doc = (Document) Factory.createResource(
356 "gate.corpora.DocumentImpl", params
357 );
358
359 // set the document param on the PRs
360 tokeniser.setDocument(doc);
361 splitter.setDocument(doc);
362 tagger.setDocument(doc);
363 gazetteer.setDocument(doc);
364 transducer.setDocument(doc);
365 orthomatcher.setDocument(doc);
366
367 // run each PR
368 tokeniser.execute();
369 splitter.execute();
370 tagger.execute();
371 gazetteer.execute();
372 transducer.execute();
373 orthomatcher.execute();
374
375 // dump out results
376
377 // construct a name for the output file and open a stream
378 StringBuffer outFileName = new StringBuffer(inFile.getParent());
379 outFileName.append(fsep);
380 outFileName.append("out");
381 outFileName.append(fsep);
382 outFileName.append("gate__");
383 outFileName.append(inFile.getName());
384 outFileName.append(".txt");
385 File outFile = new File(outFileName.toString());
386 FileWriter outFileWriter = new FileWriter(outFile);
387 Out.prln("dumping " + outFile.getPath());
388
389 // iterate round the token annotations writing to the out file
390 // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens));
391 AnnotationSet tokens = doc.getAnnotations("nercAS").
392 get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
393 Iterator<Annotation> iter = tokens.iterator();
394 while(iter.hasNext()) {
395 Annotation token = iter.next();
396 FeatureMap tokFeats = token.getFeatures();
397 String tokStr = (String) tokFeats.
398 get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME);
399 String tokPos = (String) tokFeats.
400 get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
401 outFileWriter.write(tokStr + "\t" + tokPos + nl);
402 }
403 outFileWriter.write(doc.getFeatures().get("entitySet").toString());
404
405 // close the out file stream; add an index line
406 outFileWriter.close();
407 indexWriter.write(
408 "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() +
409 "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() +
410 "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n"
411 );
412
413 // make the doc a candidate for garbage collection
414 Out.prln("deleting gate doc");
415
416 Factory.deleteResource(doc);
417 } // input files loop
418
419 // finish the index file
420 indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl);
421 indexWriter.close();
422
423 Out.prln("The End (roll credits)");
424 } // main
425
426 /** Fixture set up: initialise members before each test method */
427 public void setUp() throws GateException, IOException {
428 corpus = Factory.newCorpus("My example corpus");
429
430 doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html"));
431 doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm"));
432 } // setUp
433
/**
 * Construction: passes the given name on to the superclass constructor.
 *
 * @param name the name handed to the superclass
 */
public CookBook(String name) {
  super(name);
}
436
437 /** Test suite routine for the test runner */
438 public static Test suite() {
439 return new TestSuite(CookBook.class);
440 } // suite
441
442 } // class CookBook
|