001 /*
002 * ProfilePRs.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Kalina Bontcheva, 04/10/2001
013 *
014 * $Id: ProfilePRs.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.creole;
018
019 import java.io.File;
020 import java.util.*;
021
022 import gate.*;
023 import gate.creole.gazetteer.DefaultGazetteer;
024 import gate.creole.orthomatcher.OrthoMatcher;
025 import gate.creole.splitter.SentenceSplitter;
026 import gate.creole.tokeniser.DefaultTokeniser;
027 import gate.util.GateException;
028 import gate.util.Out;
029 import gate.util.profile.Profiler;
030 //import java.text.NumberFormat;
031
032 /**
033 * This class provides a main function that:
034 * <UL>
035 * <LI>
036 * initialises the GATE library, and creates all PRs
037 * <LI>
038 * takes a directory name as argument
039 * <LI>
040 * for each .html file in that directory:
041 * <BR> create a GATE document from the file
042 * <BR> run the PRs on the document
043 * <BR> dump some statistics in the end
044 * </UL>
045 */
046 public class ProfilePRs {
047
048 /** String to print when wrong command-line args */
049 private static String usage =
050 "usage: ProfilePRs [-dir directory-name | file(s)]";
051
052 private static double totalDocLength = 0;
053 private static int docs = 0;
054 private static Profiler prof = new Profiler();
055 private static double maxDocLength = 0;
056
057 /** Main function */
058 public static void main(String[] args) throws Exception {
059 // say "hi"
060 Out.prln("processing command line arguments");
061
062 // check we have a directory name or list of files
063 List inputFiles = null;
064 if(args.length < 1) throw new GateException(usage);
065 if(args[0].equals("-dir")) { // list all the files in the dir
066 if(args.length < 2) throw new GateException(usage);
067 File dir = new File(args[1]);
068 File[] filesArray = dir.listFiles();
069 if(filesArray == null)
070 throw new GateException(
071 dir.getPath() + " is not a directory; " + usage
072 );
073 inputFiles = Arrays.asList(filesArray);
074 } else { // all args should be file names
075 inputFiles = new ArrayList();
076 for(int i = 0; i < args.length; i++)
077 inputFiles.add(new File(args[i]));
078 }
079
080 prof.initRun("Measuring performance on directory " + args[1]);
081 // prof.enable(false);
082 // prof.enableGCCalling(false);
083
084 // initialise GATE
085 prof.checkPoint("Before GATE.init()");
086 Gate.init();
087 //tell GATE we're in batch mode
088 // gate.Main.batchMode = true;
089
090
091 // create some processing resources
092 prof.checkPoint("Before creating the processing resources");
093
094 //create a default tokeniser
095 FeatureMap params = Factory.newFeatureMap();
096 DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource(
097 "gate.creole.tokeniser.DefaultTokeniser", params);
098 prof.checkPoint("Tokeniser initialised");
099
100 //create a default gazetteer
101 params = Factory.newFeatureMap();
102 DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource(
103 "gate.creole.gazetteer.DefaultGazetteer", params);
104 prof.checkPoint("Gazetteer initialised");
105
106 //create a splitter
107 params = Factory.newFeatureMap();
108 SentenceSplitter splitter = (SentenceSplitter) Factory.createResource(
109 "gate.creole.splitter.SentenceSplitter", params);
110 prof.checkPoint("Sentence splitter initialised");
111
112 //create a tagger
113 params = Factory.newFeatureMap();
114 POSTagger tagger = (POSTagger) Factory.createResource(
115 "gate.creole.POSTagger", params);
116 prof.checkPoint("POSTagger initialised");
117
118 //create a grammar
119 params = Factory.newFeatureMap();
120 ANNIETransducer transducer = (ANNIETransducer) Factory.createResource(
121 "gate.creole.ANNIETransducer", params);
122 prof.checkPoint("Grammars initialised");
123
124 //create an orthomatcher
125 params = Factory.newFeatureMap();
126 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
127 "gate.creole.orthomatcher.OrthoMatcher", params);
128 prof.checkPoint("Orthomatcher initialised");
129
130
131 // for each document
132 // create a gate doc
133 // set as the document for hte PRs
134 // run the PRs
135 // dump output from the doc
136 // delete the doc
137 Out.prln("\nLooping on input files list");
138 Iterator filesIter = inputFiles.iterator();
139 docs = inputFiles.size();
140 int fileNo=0;
141 while(filesIter.hasNext()) {
142 File inFile = (File) filesIter.next(); // the current file
143 fileNo++;
144
145 // set the source URL parameter to a "file:..." URL string
146 params.clear();
147 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, inFile.toURI().toURL().toExternalForm());
148 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
149
150 // create the document
151 Document doc = (Document) Factory.createResource(
152 "gate.corpora.DocumentImpl", params
153 );
154 totalDocLength += doc.getContent().size().longValue();
155
156 if (maxDocLength < doc.getContent().size().longValue())
157 maxDocLength = doc.getContent().size().longValue();
158
159 // set the document param on the PRs
160 tokeniser.setDocument(doc);
161 prof.checkPoint("Processing file " + inFile.getPath() +
162 ", #" + fileNo + "/" + docs, new String[0], true, false, false);
163 tokeniser.execute();
164 prof.checkPoint("", new String[] {"Tokenizer", "Processing"}, false, false, false);
165
166 //run gazetteer
167 gaz.setDocument(doc);
168 gaz.execute();
169 prof.checkPoint("", new String[] {"Gazettier", "Processing"}, false, false, false);
170
171 //run splitter
172 splitter.setDocument(doc);
173 splitter.execute();
174 prof.checkPoint("", new String[] {"Splitter", "Processing"}, false, false, false);
175
176 //run the tagger
177 tagger.setDocument(doc);
178 tagger.execute();
179 prof.checkPoint("", new String[] {"Tagger", "Processing"}, false, false, false);
180
181 //run the transducer
182 transducer.setDocument(doc);
183 transducer.execute();
184 prof.checkPoint("", new String[] {"JAPE grammars", "Processing"}, false, false, false);
185
186 // run the orthomatcher
187 orthomatcher.setDocument(doc);
188 orthomatcher.execute();
189 prof.checkPoint("", new String[] {"Orthomatcher", "Processing"}, false, false, false);
190
191 // make the doc a candidate for garbage collection
192 Factory.deleteResource(doc);
193
194 } // input files loop
195
196 prof.checkPoint("Done!");
197
198 totalDocLength = (double) totalDocLength/1024;
199 Out.prln("\nTotal KBytes processed: " + (long)totalDocLength);
200 Out.prln("\nMax document size in bytes: " + (long)maxDocLength +
201 " (" + (long) maxDocLength/1024 + " Kb)");
202
203
204 prof.printCategAvg("Processing", docs, totalDocLength, "kb");
205 prof.printCategAvg("Tokenizer", docs, totalDocLength, "kb");
206 prof.printCategAvg("Gazettier", docs, totalDocLength, "kb");
207 prof.printCategAvg("Splitter", docs, totalDocLength, "kb");
208 prof.printCategAvg("Tagger", docs, totalDocLength, "kb");
209 prof.printCategAvg("JAPE grammars", docs, totalDocLength, "kb");
210 prof.printCategAvg("Orthomatcher", docs, totalDocLength, "kb");
211 } // main
212
213
214 } // class ProfilePRs
|