001 /*
002 * DumpingPR.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Kalina Bontcheva, 19/10/2001
013 *
014 * $Id: DumpingPR.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.creole.dumpingPR;
018
019 import java.io.*;
020 import java.net.URL;
021 import java.util.*;
022
023 import gate.*;
024 import gate.corpora.DocumentImpl;
025 import gate.creole.*;
026 import gate.util.*;
027
028 /**
029 * This class implements a DumpingPR which exports a given set of annotation
030 * types + the original markup, back into the document's native format.
031 * The export might also include the GATE features of those annotations or
032 * not (the default). One can also control whether the export files have a
033 * new suffix (useSuffixForDumpFiles) and what this suffix is
034 * (suffixForDumpFiles). By default, a suffix is used and it is .gate.
035 */
036 public class DumpingPR extends AbstractLanguageAnalyser
037 implements ProcessingResource {
038
039 public static final String
040 DPR_DOCUMENT_PARAMETER_NAME = "document";
041
042 public static final String
043 DPR_ANN_SET_PARAMETER_NAME = "annotationSetName";
044
045 public static final String
046 DPR_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
047
048 public static final String
049 DPR_DUMP_TYPES_PARAMETER_NAME = "dumpTypes";
050
051 public static final String
052 DPR_OUTPUT_URL_PARAMETER_NAME = "outputDirectoryUrl";
053
054 public static final String
055 DPR_INCLUDE_FEAT_PARAMETER_NAME = "includeFeatures";
056
057 public static final String
058 DPR_USE_SUFFIX_PARAMETER_NAME = "useSuffixForDumpFiles";
059
060 public static final String
061 DPR_FILE_SUFFIX_PARAMETER_NAME = "suffixForDumpFiles";
062
063 private static final boolean DEBUG = true;
064
065 /**
066 * A list of annotation types, which are to be dumped into the output file
067 */
068 protected List annotationTypes;
069
070 /**
071 * A list of strings specifying new names to be used instead of the original
072 * annotation types given in the annotationTypes parameter. For example, if
073 * annotationTypes was set to [Location, Date], then if dumpTypes is set to
074 * [Place, Date-expr], then the labels <Place> and <Date-expr> will be inserted
075 * instead of <Location> and <Date>.
076 */
077 protected List dumpTypes;
078
079 /**the name of the annotation set
080 * from which to take the annotations for dumping
081 */
082 protected String annotationSetName;
083
084 /**
085 * Whether or not to include the annotation features during export
086 */
087 protected boolean includeFeatures = false;
088
089 /**
090 * Whether or not to include the annotation features during export
091 */
092 protected boolean useStandOffXML = false;
093
094 /**
095 * What suffix to use for the dump files. .gate by default, but can be
096 * changed via the set method.
097 */
098 protected String suffixForDumpFiles = ".gate";
099
100 /**
101 * Whether or not to use the special suffix fo the dump files. True by
102 * default.
103 */
104 protected boolean useSuffixForDumpFiles = true;
105
106 protected java.net.URL outputDirectoryUrl;
107
108 private static final String DUMPING_PR_SET = "DumpingPRTempSet";
109
110 /** Initialise this resource, and return it. */
111 public Resource init() throws ResourceInstantiationException
112 {
113 return super.init();
114 } // init()
115
116 /**
117 * Reinitialises the processing resource. After calling this method the
118 * resource should be in the state it is after calling init.
119 * If the resource depends on external resources (such as rules files) then
120 * the resource will re-read those resources. If the data used to create
121 * the resource has changed since the resource has been created then the
122 * resource will change too after calling reInit().
123 */
124 public void reInit() throws ResourceInstantiationException
125 {
126 init();
127 } // reInit()
128
129 /** Run the resource. */
130 public void execute() throws ExecutionException {
131 if(document == null)
132 throw new GateRuntimeException("No document to process!");
133
134 //if we're saving into standOffXML, then the rest of the settings do
135 //not matter because that toXML method saves everything
136 if (this.useStandOffXML) {
137 write2File();
138 return;
139 }
140
141 AnnotationSet allAnnots;
142 // get the annotations from document
143 if ((annotationSetName == null)|| (annotationSetName.equals("")))
144 allAnnots = document.getAnnotations();
145 else
146 allAnnots = document.getAnnotations(annotationSetName);
147
148 //if none found, print warning and exit
149 if ((allAnnots == null) || allAnnots.isEmpty()) {
150 Out.prln("DumpingPR Warning: No annotations found for export. "
151 + "Including only those from the Original markups set.");
152 write2File(null);
153 return;
154 }
155
156 //if we're saving into standOffXML, then the rest of the settings do
157 //not matter because that toXML method saves everything
158 if (this.useStandOffXML) {
159 write2File();
160 return;
161 }
162
163 //first transfer the annotation types from a list to a set
164 //don't I just hate this!
165 Set types2Export = new HashSet(annotationTypes);
166
167 //then get the annotations for export
168 AnnotationSet annots2Export = allAnnots.get(types2Export);
169
170 //check whether we want the annotations to be renamed before
171 //export (that's what dumpTypes is for)
172 if (dumpTypes != null && !dumpTypes.isEmpty()) {
173 HashMap renameMap = new HashMap();
174 for(int i=0; i<dumpTypes.size() && i<annotationTypes.size(); i++) {
175 //check if we have a corresponding annotationType and if yes,
176 //then add to the hash map for renaming
177 renameMap.put(annotationTypes.get(i), dumpTypes.get(i));
178 }//for
179 //if we have to rename annotations, then do so
180 if(!renameMap.isEmpty() && annots2Export != null)
181 annots2Export = renameAnnotations(annots2Export, renameMap);
182 }//if
183
184 write2File(annots2Export);
185 document.removeAnnotationSet(DumpingPR.DUMPING_PR_SET);
186
187 } // execute()
188
189 protected void write2File(AnnotationSet exportSet) {
190 File outputFile;
191 String fileName = null;
192 if(document.getSourceUrl() == null)
193 fileName = document.getName() + "_" + Gate.genSym();
194 else
195 fileName = getFileName(document.getSourceUrl());
196
197 fileName = getNewFileName(outputDirectoryUrl, fileName);
198 StringBuffer tempBuff = new StringBuffer(fileName);
199 //now append the special suffix if we want to use it
200 if (useSuffixForDumpFiles)
201 tempBuff.append(this.suffixForDumpFiles);
202
203 String outputPath = tempBuff.toString();
204
205 if (DEBUG)
206 Out.prln(outputPath);
207 outputFile = new File(outputPath);
208
209 try {
210 // Prepare to write into the xmlFile using the doc's encoding if there
211 OutputStreamWriter writer;
212 if (document instanceof DocumentImpl) {
213 String encoding = ((DocumentImpl) document).getEncoding();
214 if (encoding == null || "".equals(encoding))
215 writer = new OutputStreamWriter(new FileOutputStream(outputFile));
216 else
217 writer = new OutputStreamWriter(
218 new FileOutputStream(outputFile), encoding);
219 } else
220 writer = new OutputStreamWriter(
221 new FileOutputStream(outputFile));
222
223 // Write (test the toXml() method)
224 // This Action is added only when a gate.Document is created.
225 // So, is for sure that the resource is a gate.Document
226 writer.write(document.toXml(exportSet, includeFeatures));
227 writer.flush();
228 writer.close();
229 } catch (IOException ex) {
230 throw new GateRuntimeException("Dumping PR: Error writing document "
231 + document.getName() + ": "
232 + ex.getMessage());
233 }
234
235
236 }//write2File
237
238 protected void write2File() {
239 File outputFile;
240 String fileName = null;
241 if(document.getSourceUrl() == null)
242 fileName = document.getName() + "_" + Gate.genSym();
243 else
244 fileName = getFileName(document.getSourceUrl());
245
246 fileName = getNewFileName(outputDirectoryUrl, fileName);
247 StringBuffer tempBuff = new StringBuffer(fileName);
248 //now append the special suffix if we want to use it
249 if (useSuffixForDumpFiles)
250 tempBuff.append(this.suffixForDumpFiles);
251 String outputPath = tempBuff.toString();
252 if (DEBUG)
253 Out.prln(outputPath);
254 outputFile = new File(outputPath);
255
256 try {
257 // Prepare to write into the xmlFile using the doc's encoding if there
258 OutputStreamWriter writer;
259 if (document instanceof DocumentImpl) {
260 String encoding = ((DocumentImpl) document).getEncoding();
261 if (encoding == null || "".equals(encoding))
262 writer = new OutputStreamWriter(new FileOutputStream(outputFile));
263 else
264 writer = new OutputStreamWriter(
265 new FileOutputStream(outputFile), encoding);
266 } else
267 writer = new OutputStreamWriter(
268 new FileOutputStream(outputFile));
269
270 // Write (test the toXml() method)
271 // This Action is added only when a gate.Document is created.
272 // So, is for sure that the resource is a gate.Document
273 writer.write(document.toXml());
274 writer.flush();
275 writer.close();
276 } catch (IOException ex) {
277 throw new GateRuntimeException("Dumping PR: Error writing document "
278 + document.getName() + ": "
279 + ex.getMessage());
280 }
281
282
283 }//write2File
284
285
286 protected String getFileName(URL url) {
287 String fileName = url.getFile();
288 int index = fileName.lastIndexOf("/");
289 if(index == -1) index = fileName.lastIndexOf("\\");
290 if(index == -1)
291 return fileName;
292 else {
293 if(index + 1 == fileName.length()) {
294 fileName = fileName.substring(0, fileName.length()-1);
295 index = fileName.lastIndexOf("/");
296 if(index == -1) index = fileName.lastIndexOf("\\");
297 if(index == -1) return fileName;
298 }
299 fileName = fileName.substring(index+1, fileName.length());
300 }
301 return fileName;
302 }
303
304 protected String getNewFileName(URL dir, String file) {
305 return new File((dir == null) ?
306 new File(System.getProperty("java.io.tmpdir")) : Files.fileFromURL(dir),
307 file).getAbsolutePath();
308 }
309
310 protected AnnotationSet renameAnnotations(AnnotationSet annots2Export,
311 HashMap renameMap){
312 Iterator<Annotation> iter = annots2Export.iterator();
313 AnnotationSet as = document.getAnnotations(DUMPING_PR_SET);
314 if (!as.isEmpty())
315 as.clear();
316 while(iter.hasNext()) {
317 Annotation annot = iter.next();
318 //first check whether this type needs to be renamed
319 //if not, continue
320 if (!renameMap.containsKey(annot.getType()))
321 renameMap.put(annot.getType(), annot.getType());
322 try{
323 as.add(annot.getId(),
324 annot.getStartNode().getOffset(),
325 annot.getEndNode().getOffset(),
326 (String) renameMap.get(annot.getType()),
327 annot.getFeatures());
328 } catch (InvalidOffsetException ex) {
329 throw new GateRuntimeException("DumpingPR: " + ex.getMessage());
330 }
331 }//while
332 return as;
333 }//renameAnnotations
334
335
336 /**get the name of the annotation set*/
337 public String getAnnotationSetName() {
338 return annotationSetName;
339 }//getAnnotationSetName
340
341 /** set the annotation set name*/
342 public void setAnnotationSetName(String newAnnotationSetName) {
343 this.annotationSetName = newAnnotationSetName;
344 }//setAnnotationSetName
345
346 public List getAnnotationTypes() {
347 return this.annotationTypes;
348 }
349
350 public void setAnnotationTypes(List newTypes) {
351 this.annotationTypes = newTypes;
352 }
353
354 public List getDumpTypes() {
355 return this.dumpTypes;
356 }
357
358 public void setDumpTypes(List newTypes) {
359 dumpTypes = newTypes;
360 }
361
362 public URL getOutputDirectoryUrl() {
363 return this.outputDirectoryUrl;
364 }
365
366 public void setOutputDirectoryUrl(URL file) {
367 this.outputDirectoryUrl = file;
368 }
369
370 public void setIncludeFeatures(Boolean inclFeatures) {
371 if (inclFeatures != null)
372 includeFeatures = inclFeatures.booleanValue();
373 }
374
375 public Boolean getIncludeFeatures() {
376 return new Boolean(includeFeatures);
377 }
378
379 public void setUseStandOffXML(Boolean newValue) {
380 if (newValue != null)
381 useStandOffXML = newValue.booleanValue();
382 }
383
384 public Boolean getUseStandOffXML() {
385 return new Boolean(useStandOffXML);
386 }
387
388 public String getSuffixForDumpFiles() {
389 return suffixForDumpFiles;
390 }
391
392 public void setSuffixForDumpFiles(String newSuffix) {
393 this.suffixForDumpFiles = newSuffix;
394 }
395
396 public Boolean getUseSuffixForDumpFiles() {
397 return new Boolean(this.useSuffixForDumpFiles);
398 }
399
400 public void setUseSuffixForDumpFiles(Boolean useOrNot) {
401 if (useOrNot != null)
402 this.useSuffixForDumpFiles = useOrNot.booleanValue();
403 }
404
405 } // class DumpingPR
|