001 /*
002 * CorpusSaver.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Kalina Bontcheva, 22/Nov/2001
013 *
014 * $Id: CorpusSaver.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.util;
018
019 import java.io.File;
020 import java.text.NumberFormat;
021 import java.util.*;
022
023 import gate.*;
024 import gate.creole.ExecutionException;
025 import gate.creole.ResourceInstantiationException;
026 import gate.gui.MainFrame;
027
028 public class CorpusSaver {
029
030 private static final boolean DEBUG = true;
031
032 public CorpusSaver() {
033 }
034
035 public void init() {
036 if (saveMode) {
037 File path = new File(dsPath);
038 try {
039 ds = Factory.openDataStore("gate.persist.SerialDataStore",
040 path.toURI().toURL().toString());
041 } catch (Exception ex) {
042 throw new gate.util.GateRuntimeException(ex.getMessage());
043 }
044
045 try {
046 Corpus corpus = Factory.newCorpus("bnc");
047 LanguageResource lr = ds.adopt(corpus, null);
048 ds.sync(lr);
049 theCorpus = (Corpus) lr;
050 } catch (Exception ex) {
051 throw new GateRuntimeException(ex.getMessage());
052 }
053 }
054
055 if (processMode)
056 initPRs();
057
058 }
059
060 public void initPRs() {
061 try {
062 if (applicationFile == null)
063 Out.prln("Application not set!");
064 Out.prln("App file is: " + applicationFile.getAbsolutePath());
065 application = (Controller) gate.util.persistence.PersistenceManager
066 .loadObjectFromFile(applicationFile);
067 } catch (Exception ex) {
068 throw new GateRuntimeException("Corpus Saver: "+ex.getMessage());
069 }
070 }//initPRs
071
072 public void execute() {
073 execute(startDir);
074 try {
075 if (saveMode) {
076 ds.sync(theCorpus);
077 Factory.deleteResource(theCorpus);
078 if (ds != null)
079 ds.close();
080 }
081 if (application != null) {
082 Iterator iter = new ArrayList(application.getPRs()).iterator();
083 while (iter.hasNext())
084 Factory.deleteResource((Resource) iter.next());
085 }
086 } catch (Exception ex) {
087 throw new GateRuntimeException(ex.getMessage());
088 }
089 }
090
091 public void execute(File dir) {
092 if (dir == null || (saveMode && ds == null))
093 return;
094 //first set the current directory to be the given one
095 currDir = dir;
096 Out.prln("Processing directory: " + currDir);
097
098 ArrayList files = new ArrayList();
099 ArrayList dirs = new ArrayList();
100 File[] dirArray = currDir.listFiles();
101 for (int i = 0; i < dirArray.length; i++) {
102 if (dirArray[i].isDirectory())
103 dirs.add(dirArray[i]);
104 else if (dirArray[i].isFile())
105 files.add(dirArray[i]);
106 }
107
108 saveFiles(files);
109
110 //if no more subdirs left, return
111 if (dirs.isEmpty())
112 return;
113
114 //there are more subdirectories to traverse, so iterate through
115 for (int j = 0; j < dirs.size(); j++)
116 execute((File) dirs.get(j));
117
118 }//execute(dir)
119
120
121 public static void main(String[] args) throws GateException {
122 Gate.init();
123
124 //MainFrame mFramew = new MainFrame();
125 //mFramew.setSize(800, 600);
126 //mFramew.setVisible(true);
127
128 CorpusSaver corpusSaver1 = new CorpusSaver();
129
130 if(args.length < 2)
131 throw new GateException("usage: [-process|-process_only] source_directory datastore_path application");
132 int i = 0;
133 while (i < args.length && args[i].startsWith("-")) {
134 if(args[i].equals("-process")) {
135 Out.prln("Processing and saving the corpus enabled. <P>");
136 corpusSaver1.setProcessMode(true);
137 } else if (args[i].equals("-process_only")) {
138 Out.prln("Processing only enabled. <P>");
139 corpusSaver1.setSaveMode(false);
140 corpusSaver1.setProcessMode(true);
141 }
142 i++; //just ignore the option, which we do not recognise
143 }//while
144
145 String dirName = args[i];
146 File dir = new File(dirName);
147 if (!dir.isDirectory())
148 throw new GateRuntimeException("Corpus directory should be "
149 + "provided as a parameter");
150 if(corpusSaver1.getSaveMode()){
151 i++;
152 if( i >= args.length)
153 throw new GateRuntimeException("Datastore path not provided");
154
155 if (corpusSaver1.getSaveMode()) {
156 String storagePath = args[i];
157 File storage = new File(storagePath);
158 if (!storage.isDirectory())
159 throw new GateRuntimeException("Please provide path to an existing "
160 + "GATE serial datastore");
161 corpusSaver1.setDSPath(storagePath);
162 }
163 }
164
165 //get the last argument which is the application
166 if (corpusSaver1.getProcessMode()) {
167 i++;
168 String appName = args[i];
169 File appFile = new File(appName);
170 if (!appFile.isFile())
171 throw new GateException("Please provide an existing GATE application");
172 else
173 corpusSaver1.setApplicationFile(appFile);
174 }
175
176 Out.prln("Initialising GATE please wait...");
177 corpusSaver1.init();
178 corpusSaver1.setStartDir(dir);
179 Out.prln("Processing...");
180 double timeBefore = System.currentTimeMillis();
181 corpusSaver1.execute();
182 double timeAfter = System.currentTimeMillis();
183 Out.prln("Done in " +
184 NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
185 + " seconds");
186
187 }
188
189 public void setStartDir(File newDir) {
190 startDir = newDir;
191 }
192
193 public void setProcessMode(boolean mode) {
194 processMode = mode;
195 }
196
197 public boolean getProcessMode() {
198 return processMode;
199 }
200
201 public void setSaveMode(boolean mode) {
202 saveMode = mode;
203 }
204
205 public boolean getSaveMode() {
206 return saveMode;
207 }
208
209 public void setDSPath(String path){
210 dsPath = path;
211 }
212
213 public void setApplicationFile(File newAppFile) {
214 applicationFile = newAppFile;
215 }
216
217
218 protected void saveFiles(List files) {
219 if (files==null || files.isEmpty() ||
220 (saveMode && (theCorpus == null || ds == null)))
221 return;
222
223 for(int i=0; i<files.size(); i++) {
224 try {
225 Document doc = Factory.newDocument(((File)files.get(i)).toURI().toURL());
226 doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURI().toURL().toString()));
227 Out.prln("Storing document: " + doc.getName());
228 //first process it with ANNIE if in process mode
229 if (processMode)
230 processDocument(doc);
231
232 //then store it in the DS and add to corpus
233 if (saveMode) {
234 LanguageResource lr = ds.adopt(doc, null);
235 theCorpus.add(lr);
236 theCorpus.unloadDocument( (Document) lr);
237
238 if (lr != doc)
239 Factory.deleteResource(lr);
240 }
241 Factory.deleteResource(doc);
242 } catch (Exception ex) {
243 throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
244 }
245 }//for
246 }//saveFiles
247
248 protected void processDocument(Document doc) {
249 try {
250 if (application instanceof CorpusController) {
251 Corpus tempCorpus = Factory.newCorpus("temp");
252 tempCorpus.add(doc);
253 ((CorpusController)application).setCorpus(tempCorpus);
254 application.execute();
255 Factory.deleteResource(tempCorpus);
256 tempCorpus = null;
257 } else {
258 Iterator iter = application.getPRs().iterator();
259 while (iter.hasNext())
260 ((ProcessingResource) iter.next()).setParameterValue("document", doc);
261 application.execute();
262 }
263 } catch (ResourceInstantiationException ex) {
264 throw new RuntimeException("Error executing application: "
265 + ex.getMessage());
266 } catch (ExecutionException ex) {
267 throw new RuntimeException("Error executing application: "
268 + ex.getMessage());
269 }
270 }
271
272
273 /**
274 * The directory from which we should generate/evaluate the corpus
275 */
276 private File startDir;
277 private File currDir;
278
279 private DataStore ds;
280 private Corpus theCorpus;
281 private String annotSetName = "NE";
282 private String dsPath = "d:\\bnc";
283 private Controller application = null;
284 private File applicationFile = null;
285
286 private boolean processMode = false;
287 private boolean saveMode = true;
288 }
|