001 /*
002 * Files.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * $Id: Files.java 12923 2010-08-04 15:05:49Z murfffi $
013 */
014
015 package gate.util;
016 import gate.Gate;
017 import gate.corpora.DocumentXmlUtils;
018
019 import java.io.*;
020 import java.util.*;
021 import java.util.regex.Matcher;
022 import java.util.regex.Pattern;
023 import java.net.URL;
024 import java.net.URI;
025 import java.net.URISyntaxException;
026 import java.nio.charset.Charset;
027 import java.nio.charset.CharsetDecoder;
028 import java.nio.charset.CodingErrorAction;
029 import java.nio.charset.CharacterCodingException;
030
031 import org.apache.commons.io.IOUtils;
032
033
034 /** Some utilities for use with Files and with resources.
035 * <P>
036 * <B>Note</B> that there is a terminology conflict between the use
037 * of "resources" here and <TT>gate.Resource</TT> and its inheritors.
038 * <P>
039 * Java "resources" are files that live on the CLASSPATH or in a Jar
040 * file that are <I>not</I> <TT>.class</TT> files. For example: a
041 * <TT>.gif</TT> file that is used by a GUI, or one of the XML files
042 * used for testing GATE's document format facilities. This class
043 * allows you to access these files in various ways (as streams, as
044 * byte arrays, etc.).
045 * <P>
046 * GATE resources are components (Java Beans) that provide all of the
047 * natural language processing capabilities of a GATE-based system, and
048 * the language data that such systems analyse and produce. For
049 * example: parsers, lexicons, generators, corpora.
050 * <P>
051 * Where we say "resource" in this class we mean Java resource; elsewhere
052 * in the system we almost always mean GATE resource.
053 */
054 public class Files {
055
056 /** Debug flag */
057 private static final boolean DEBUG = false;
058
059 /** Used to generate temporary resources names*/
060 static long resourceIndex = 0;
061
062 /**Where on the classpath the gate resources are to be found*/
063 protected static String resourcePath = "/gate/resources";
064
065 /**Gets the path for the gate resources within the classpath*/
066 public static String getResourcePath(){
067 return resourcePath;
068 }
069
070 /** It returns the last component in a file path.
071 * It takes E.g: d:/tmp/file.txt and returns file.txt
072 */
073 public static String getLastPathComponent(String path){
074 if(path == null || path.length() == 0) return "";
075 //we should look both for "/" and "\" as on windows the file separator is"\"
076 //but a path coming from an URL will be separated by "/"
077 int index = path.lastIndexOf('/');
078 if(index == -1) index = path.lastIndexOf('\\');
079 if(index == -1) return path;
080 else return path.substring(index + 1);
081 }// getLastPathComponent()
082
083 /** Get a string representing the contents of a text file. */
084 public static String getString(String fileName) throws IOException {
085 return getString(new File(fileName));
086 } // getString(fileName)
087
088 /** Get a string representing the contents of a text file. */
089 public static String getString(File textFile) throws IOException {
090 FileInputStream fis = new FileInputStream(textFile);
091 int len = (int) textFile.length();
092 byte[] textBytes = new byte[len];
093 fis.read(textBytes, 0, len);
094 fis.close();
095 return new String(textBytes);
096 } // getString(File)
097
098 /** Get a byte array representing the contents of a binary file. */
099 public static byte[] getByteArray(File binaryFile) throws IOException {
100 FileInputStream fis = new FileInputStream(binaryFile);
101 int len = (int) binaryFile.length();
102 byte[] bytes = new byte[len];
103 fis.read(bytes, 0, len);
104 fis.close();
105 return bytes;
106 } // getByteArray(File)
107
108 /** Get a resource from the GATE ClassLoader as a String.
109 * @param resourceName The resource to input.
110 */
111 public static String getResourceAsString(String resourceName)
112 throws IOException {
113 return getResourceAsString(resourceName, null);
114 }
115
116 /** Get a resource from the GATE ClassLoader as a String.
117 * @param encoding The encoding of the reader used to input the file
118 * (may be null in which case the default encoding is used).
119 * @param resourceName The resource to input.
120 */
121 public static String
122 getResourceAsString(String resourceName, String encoding)
123 throws IOException
124 {
125 InputStream resourceStream = getResourceAsStream(resourceName);
126 if(resourceStream == null) return null;
127 BufferedReader resourceReader;
128 if(encoding == null) {
129 resourceReader = new BomStrippingInputStreamReader(resourceStream);
130 } else {
131 resourceReader = new BomStrippingInputStreamReader(resourceStream, encoding);
132 }
133 if(resourceReader == null) return null;
134 StringBuffer resourceBuffer = new StringBuffer();
135
136 int i;
137
138 int charsRead = 0;
139 final int size = 1024;
140 char[] charArray = new char[size];
141
142 while( (charsRead = resourceReader.read(charArray,0,size)) != -1 )
143 resourceBuffer.append (charArray,0,charsRead);
144
145 while( (i = resourceReader.read()) != -1 )
146 resourceBuffer.append((char) i);
147
148 resourceReader.close();
149 return resourceBuffer.toString();
150 } // getResourceAsString(String)
151
152 /** Get a resource from the GATE resources directory as a String.
153 * The resource name should be relative to <code>resourcePath</code> which
154 * is equal with <TT>gate/resources</TT>; e.g.
155 * for a resource stored as <TT>gate/resources/jape/Test11.jape</TT>,
156 * this method should be passed the name <TT>jape/Test11.jape</TT>.
157 */
158 public static String getGateResourceAsString(String resourceName)
159 throws IOException {
160 InputStream resourceStream = getGateResourceAsStream(resourceName);
161 if (resourceStream == null)
162 throw new IOException("No such resource on classpath: " + resourceName);
163 try {
164 return IOUtils.toString(resourceStream);
165 }
166 finally {
167 resourceStream.close();
168 }
169 } // getGateResourceAsString(String)
170
171 /**
172 * Writes a temporary file into the default temporary directory,
173 * form an InputStream a unique ID is generated and associated automaticaly
174 * with the file name...
175 */
176 public static File writeTempFile(InputStream contentStream)
177 throws IOException {
178
179 File resourceFile = null;
180 FileOutputStream resourceFileOutputStream = null;
181
182 // create a temporary file name
183 resourceFile = File.createTempFile ("gateResource", ".tmp");
184 resourceFileOutputStream = new FileOutputStream(resourceFile);
185 resourceFile.deleteOnExit ();
186
187 if (contentStream == null)
188 return resourceFile;
189
190 int bytesRead = 0;
191 final int readSize = 1024;
192 byte[] bytes = new byte[readSize];
193 while( (bytesRead = contentStream.read(bytes,0,readSize) ) != -1 )
194 resourceFileOutputStream.write(bytes,0, bytesRead);
195
196 resourceFileOutputStream.close();
197 contentStream.close ();
198 return resourceFile;
199 }// writeTempFile()
200
201 /**
202 * Writes aString into a temporary file located inside
203 * the default temporary directory defined by JVM, using the specific
204 * anEncoding.
205 * An unique ID is generated and associated automaticaly with the file name.
206 * @param aString the String to be written. If is null then the file will be
207 * empty.
208 * @param anEncoding the encoding to be used. If is null then the default
209 * encoding will be used.
210 * @return the tmp file containing the string.
211 */
212 public static File writeTempFile(String aString, String anEncoding) throws
213 UnsupportedEncodingException, IOException{
214 File resourceFile = null;
215 OutputStreamWriter writer = null;
216
217 // Create a temporary file name
218 resourceFile = File.createTempFile ("gateResource", ".tmp");
219 resourceFile.deleteOnExit ();
220
221 if (aString == null) return resourceFile;
222 // Prepare the writer
223 if (anEncoding == null){
224 // Use default encoding
225 writer = new OutputStreamWriter(new FileOutputStream(resourceFile));
226
227 }else {
228 // Use the specified encoding
229 writer = new OutputStreamWriter(
230 new FileOutputStream(resourceFile),anEncoding);
231 }// End if
232
233 // This Action is added only when a gate.Document is created.
234 // So, is for sure that the resource is a gate.Document
235 writer.write(aString);
236 writer.flush();
237 writer.close();
238 return resourceFile;
239 }// writeTempFile()
240
241 /**
242 * Writes aString into a temporary file located inside
243 * the default temporary directory defined by JVM, using the default
244 * encoding.
245 * An unique ID is generated and associated automaticaly with the file name.
246 * @param aString the String to be written. If is null then the file will be
247 * empty.
248 * @return the tmp file containing the string.
249 */
250 public static File writeTempFile(String aString) throws IOException{
251 return writeTempFile(aString,null);
252 }// writeTempFile()
253
254
255 /** Get a resource from the GATE ClassLoader as a byte array.
256 */
257 public static byte[] getResourceAsByteArray(String resourceName)
258 throws IOException, IndexOutOfBoundsException, ArrayStoreException {
259
260 InputStream resourceInputStream = getResourceAsStream(resourceName);
261 BufferedInputStream resourceStream =
262 new BufferedInputStream(resourceInputStream);
263 byte b;
264 final int bufSize = 1024;
265 byte[] buf = new byte[bufSize];
266 int i = 0;
267
268 // get the whole resource into buf (expanding the array as needed)
269 while( (b = (byte) resourceStream.read()) != -1 ) {
270 if(i == buf.length) {
271 byte[] newBuf = new byte[buf.length * 2];
272 System.arraycopy (buf,0,newBuf,0,i);
273 buf = newBuf;
274 }
275 buf[i++] = b;
276 }
277
278 // close the resource stream
279 resourceStream.close();
280
281 // copy the contents of buf to an array of the correct size
282 byte[] bytes = new byte[i];
283 // copy from buf to bytes
284 System.arraycopy (buf,0,bytes,0,i);
285 return bytes;
286 } // getResourceAsByteArray(String)
287
288 /** Get a resource from the GATE resources directory as a byte array.
289 * The resource name should be relative to <code>resourcePath<code> which
290 * is equal with <TT>gate/resources</TT>; e.g.
291 * for a resource stored as <TT>gate/resources/jape/Test11.jape</TT>,
292 * this method should be passed the name <TT>jape/Test11.jape</TT>.
293 */
294 public static byte[] getGateResourceAsByteArray(String resourceName)
295 throws IOException, IndexOutOfBoundsException, ArrayStoreException {
296
297 InputStream resourceInputStream = getGateResourceAsStream(resourceName);
298 BufferedInputStream resourceStream =
299 new BufferedInputStream(resourceInputStream);
300 byte b;
301 final int bufSize = 1024;
302 byte[] buf = new byte[bufSize];
303 int i = 0;
304
305 // get the whole resource into buf (expanding the array as needed)
306 while( (b = (byte) resourceStream.read()) != -1 ) {
307 if(i == buf.length) {
308 byte[] newBuf = new byte[buf.length * 2];
309 System.arraycopy (buf,0,newBuf,0,i);
310 buf = newBuf;
311 }
312 buf[i++] = b;
313 }
314
315 // close the resource stream
316 resourceStream.close();
317
318 // copy the contents of buf to an array of the correct size
319 byte[] bytes = new byte[i];
320
321 // copy from buf to bytes
322 System.arraycopy (buf,0,bytes,0,i);
323 return bytes;
324 } // getResourceGateAsByteArray(String)
325
326
327 /** Get a resource from the GATE ClassLoader as an InputStream.
328 */
329 public static InputStream getResourceAsStream(String resourceName)
330 throws IOException {
331 // Strip any leading '/'
332 if(resourceName.charAt(0) == '/') {
333 resourceName = resourceName.substring(1);
334 }
335
336 ClassLoader gcl = Gate.getClassLoader();
337 if(gcl == null) {
338 // if the GATE ClassLoader has not been initialised yet (i.e. this
339 // method was called before Gate.init) then fall back to the current
340 // classloader
341 return Files.class.getClassLoader().getResourceAsStream(resourceName);
342 }
343 else {
344 // if we can, get the resource through the GATE ClassLoader to allow
345 // loading of resources from plugin JARs as well as gate.jar
346 return gcl.getResourceAsStream(resourceName);
347 }
348 //return ClassLoader.getSystemResourceAsStream(resourceName);
349 } // getResourceAsStream(String)
350
351 /** Get a resource from the GATE resources directory as an InputStream.
352 * The resource name should be relative to <code>resourcePath<code> which
353 * is equal with <TT>gate/resources</TT>; e.g.
354 * for a resource stored as <TT>gate/resources/jape/Test11.jape</TT>,
355 * this method should be passed the name <TT>jape/Test11.jape</TT>.
356 */
357 public static InputStream getGateResourceAsStream(String resourceName)
358 throws IOException {
359
360 if(resourceName.startsWith("/") || resourceName.startsWith("\\") )
361 return getResourceAsStream(resourcePath + resourceName);
362 else return getResourceAsStream(resourcePath + "/" + resourceName);
363 } // getResourceAsStream(String)
364
365 /**
366 * Get a resource from the GATE ClassLoader. The return value is a
367 * {@link java.net.URL} that can be used to retrieve the contents of the
368 * resource.
369 */
370 public static URL getResource(String resourceName) {
371 // Strip any leading '/'
372 if(resourceName.charAt(0) == '/') {
373 resourceName = resourceName.substring(1);
374 }
375
376 ClassLoader gcl = Gate.getClassLoader();
377 if(gcl == null) {
378 // if the GATE ClassLoader has not been initialised yet (i.e. this
379 // method was called before Gate.init) then fall back to the current
380 // classloader
381 return Files.class.getClassLoader().getResource(resourceName);
382 }
383 else {
384 // if we can, get the resource through the GATE ClassLoader to allow
385 // loading of resources from plugin JARs as well as gate.jar
386 return gcl.getResource(resourceName);
387 }
388 }
389
390 /**
391 * Get a resource from the GATE resources directory. The return value is a
392 * {@link java.net.URL} that can be used to retrieve the contents of the
393 * resource.
394 * The resource name should be relative to <code>resourcePath<code> which
395 * is equal with <TT>gate/resources</TT>; e.g.
396 * for a resource stored as <TT>gate/resources/jape/Test11.jape</TT>,
397 * this method should be passed the name <TT>jape/Test11.jape</TT>.
398 */
399 public static URL getGateResource(String resourceName) {
400 if(resourceName.startsWith("/") || resourceName.startsWith("\\") )
401 return getResource(resourcePath + resourceName);
402 else return getResource(resourcePath + "/" + resourceName);
403 }
404
405 /**
406 * This method takes a regular expression and a directory name and returns
407 * the set of Files that match the pattern under that directory.
408 *
409 * @param regex regular expression path that begins with <code>pathFile</code>
410 * @param pathFile directory path where to search for files
411 * @return set of file paths under <code>pathFile</code> that matches
412 * <code>regex</code>
413 */
414 public static Set<String> Find(String regex, String pathFile) {
415 Set<String> regexfinal = new HashSet<String>();
416 String[] tab;
417 File file = null;
418
419 //open a file
420 try {
421 file = new File(pathFile);
422 } catch(NullPointerException npe) {
423 npe.printStackTrace(Err.getPrintWriter());
424 }
425
426 Pattern pattern = Pattern.compile("^"+regex);
427
428 if (file.isDirectory()){
429 tab = file.list();
430 for (int i=0;i<=tab.length-1;i++){
431 String finalPath = pathFile+"/"+tab[i];
432 Matcher matcher = pattern.matcher(finalPath);
433 if (matcher.matches()){
434 regexfinal.add(finalPath);
435 }
436 }
437 }
438 else {
439 if (file.isFile()){
440 Matcher matcher = pattern.matcher(pathFile);
441 if (matcher.matches()){
442 regexfinal.add(pathFile);
443 }
444 }
445 }
446
447 return regexfinal;
448 } //find
449
450 /** Recursively remove a directory <B>even if it contains other files
451 * or directories</B>. Returns true when the directory and all its
452 * contents are successfully removed, else false.
453 */
454 public static boolean rmdir(File dir) {
455 if(dir == null || ! dir.isDirectory()) // only delete directories
456 return false;
457
458 // list all the members of the dir
459 String[] members = dir.list();
460
461 // return value indicating success or failure
462 boolean succeeded = true;
463
464 // for each member, if is dir then recursively delete; if file then delete
465 for(int i = 0; i<members.length; i++) {
466 File member = new File(dir, members[i]);
467
468 if(member.isFile()) {
469 if(! member.delete())
470 succeeded = false;
471 } else {
472 if(! Files.rmdir(member))
473 succeeded = false;
474 }
475 }
476
477 // delete the directory itself
478 dir.delete();
479
480 // return status value
481 return succeeded;
482 } // rmdir(File)
483
484 /**
485 * This method updates an XML element with a new set of attributes.
486 * If the element is not found the XML is unchanged. The attributes
487 * keys and values must all be Strings.
488 *
489 * @param xml A stream of the XML data.
490 * @param elementName The name of the element to update.
491 * @param newAttrs The new attributes to place on the element.
492 * @return A string of the whole XML source, with the element updated.
493 */
494 public static String updateXmlElement(
495 BufferedReader xml, String elementName, Map newAttrs
496 ) throws IOException {
497 String line = null;
498 String nl = Strings.getNl();
499 StringBuffer newXml = new StringBuffer();
500
501 // read the whole source
502 while( ( line = xml.readLine() ) != null ) {
503 newXml.append(line);
504 newXml.append(nl);
505 }
506
507 // find the location of the element
508 int start = newXml.toString().indexOf("<" + elementName);
509 if(start == -1) return newXml.toString();
510 int end = newXml.toString().indexOf(">", start);
511 if(end == -1) return newXml.toString();
512
513 // check if the old element is empty (ends in "/>") or not
514 boolean isEmpty = false;
515 if(newXml.toString().charAt(end - 1) == '/') isEmpty = true;
516
517 // create the new element string with the new attributes
518 StringBuffer newElement = new StringBuffer();
519 newElement.append("<");
520 newElement.append(elementName);
521
522 // add in the new attributes
523 Iterator iter = newAttrs.entrySet().iterator();
524 while(iter.hasNext()) {
525 Map.Entry entry = (Map.Entry) iter.next();
526 String key = (String) entry.getKey();
527 String value = (String) entry.getValue();
528
529 newElement.append(" ");
530 newElement.append(DocumentXmlUtils.combinedNormalisation(key));
531 newElement.append("=\"");
532 newElement.append(DocumentXmlUtils.combinedNormalisation(value));
533 newElement.append("\"" + nl);
534 }
535
536 // terminate the element
537 if(isEmpty) newElement.append("/");
538 newElement.append(">");
539
540 // replace the old string
541 newXml.replace(start, end + 1, newElement.toString());
542
543 return newXml.toString();
544 } // updateXmlElement(Reader...)
545
546 /**
547 * This method updates an XML element in an XML file
548 * with a new set of attributes. If the element is not found the XML
549 * file is unchanged. The attributes keys and values must all be Strings.
550 * We first try to read the file using UTF-8 encoding. If an error occurs we
551 * fall back to the platform default encoding (for backwards-compatibility
552 * reasons) and try again. The file is written back in UTF-8, with an
553 * updated encoding declaration.
554 *
555 * @param xmlFile An XML file.
556 * @param elementName The name of the element to update.
557 * @param newAttrs The new attributes to place on the element.
558 * @return A string of the whole XML file, with the element updated (the
559 * file is also overwritten).
560 */
561 public static String updateXmlElement(
562 File xmlFile, String elementName, Map newAttrs
563 ) throws IOException {
564 String newXml = null;
565 BufferedReader utfFileReader = null;
566 BufferedReader platformFileReader = null;
567 Charset utfCharset = Charset.forName("UTF-8");
568 try {
569 FileInputStream fis = new FileInputStream(xmlFile);
570 // try reading with UTF-8, make sure any errors throw an exception
571 CharsetDecoder decoder = utfCharset.newDecoder()
572 .onUnmappableCharacter(CodingErrorAction.REPORT)
573 .onMalformedInput(CodingErrorAction.REPORT);
574 utfFileReader = new BomStrippingInputStreamReader(fis, decoder);
575 newXml = updateXmlElement(utfFileReader, elementName, newAttrs);
576 }
577 catch(CharacterCodingException cce) {
578 // File not readable as UTF-8, so try the platform default encoding
579 if(utfFileReader != null) {
580 utfFileReader.close();
581 utfFileReader = null;
582 }
583 if(DEBUG) {
584 Err.prln("updateXmlElement: could not read " + xmlFile + " as UTF-8, "
585 + "trying platform default");
586 }
587 platformFileReader = new BufferedReader(new FileReader(xmlFile));
588 newXml = updateXmlElement(platformFileReader, elementName, newAttrs);
589 }
590 finally {
591 if(utfFileReader != null) {
592 utfFileReader.close();
593 }
594 if(platformFileReader != null) {
595 platformFileReader.close();
596 }
597 }
598
599 // write the updated file in UTF-8, fixing the encoding declaration
600 newXml = newXml.replaceFirst(
601 "\\A<\\?xml (.*)encoding=(?:\"[^\"]*\"|'[^']*')",
602 "<?xml $1encoding=\"UTF-8\"");
603 FileOutputStream fos = new FileOutputStream(xmlFile);
604 OutputStreamWriter fileWriter = new OutputStreamWriter(fos, utfCharset);
605 fileWriter.write(newXml);
606 fileWriter.close();
607
608 return newXml;
609 } // updateXmlElement(File...)
610
611
612 /**
613 * Convert a file: URL to a <code>java.io.File</code>. First tries to parse
614 * the URL's toExternalForm as a URI and create the File object from that
615 * URI. If this fails, just uses the path part of the URL. This handles
616 * URLs that contain spaces or other unusual characters, both as literals and
617 * when encoded as (e.g.) %20.
618 *
619 * @exception IllegalArgumentException if the URL is not convertable into a
620 * File.
621 */
622 public static File fileFromURL(URL theURL) throws IllegalArgumentException {
623 try {
624 URI uri = new URI(theURL.toExternalForm());
625 return new File(uri);
626 }
627 catch(URISyntaxException use) {
628 try {
629 URI uri = new URI(theURL.getProtocol(), null, theURL.getPath(), null, null);
630 return new File(uri);
631 }
632 catch(URISyntaxException use2) {
633 throw new IllegalArgumentException("Cannot convert " + theURL + " to a file path");
634 }
635 }
636 }
637
638 /**
639 * Same as {@link java.io.File#listFiles(java.io.FileFilter)}
640 * but recursive on directories.
641 * @param directory file path to start the search, will not be include
642 * in the results
643 * @param filter filter apply to the search
644 * @return an array of files (including directories) contained inside
645 * <code>directory</code>. The array will be empty if the directory is
646 * empty. Returns null if this abstract pathname does not denote a
647 * directory, or if an I/O error occurs.
648 */
649 public static File[] listFilesRecursively(File directory, FileFilter filter) {
650 List<File> filesList = new ArrayList<File>();
651
652 File[] filesRootArray = directory.listFiles(filter);
653 if (filesRootArray == null) { return null; }
654
655 for (File file : filesRootArray) {
656 filesList.add(file);
657 if (file.isDirectory()) {
658 File[] filesDeepArray = listFilesRecursively(file, filter);
659 if (filesDeepArray == null) { return null; }
660 filesList.addAll(Arrays.asList(filesDeepArray));
661 }
662 }
663
664 return filesList.toArray(new File[filesList.size()]);
665 }
666
667 } // class Files
|