001 /*
002 * DocumentContentImpl.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Hamish Cunningham, 11/Feb/2000
013 *
014 * $Id: DocumentContentImpl.java 13078 2010-09-15 10:31:37Z thomas_heitz $
015 */
016
017 package gate.corpora;
018
019 import java.io.*;
020 import java.net.URL;
021
022 import gate.DocumentContent;
023 import gate.util.BomStrippingInputStreamReader;
024 import gate.util.InvalidOffsetException;
025
026 /** Represents the commonalities between all sorts of document contents.
027 */
028 public class DocumentContentImpl implements DocumentContent
029 {
030 /** Debug flag */
031 private static final boolean DEBUG = false;
032
033 /** Buffer size for reading
034 * 16k is 4 times the block size on most filesystems
035 * so it should be efficient for most cases
036 * */
037 private static final int INTERNAL_BUFFER_SIZE = 16*1024;
038
039 /** Default construction */
040 public DocumentContentImpl() {
041 content = new String();
042 } // default construction
043
044 /** Contruction from URL and offsets. */
045 public DocumentContentImpl(URL u, String encoding, Long start, Long end)
046 throws IOException {
047
048 int readLength = 0;
049 char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
050
051 BufferedReader uReader = null;
052 StringBuffer buf = new StringBuffer();
053 char c;
054 long s = 0, e = Long.MAX_VALUE, counter = 0;
055 if(start != null && end != null) {
056 s = start.longValue();
057 e = end.longValue();
058 }
059
060 if(encoding != null && !encoding.equalsIgnoreCase("")) {
061 uReader = new BomStrippingInputStreamReader(u.openStream(), encoding, INTERNAL_BUFFER_SIZE);
062 } else {
063 uReader = new BomStrippingInputStreamReader(u.openStream(), INTERNAL_BUFFER_SIZE);
064 };
065
066 // 1. skip S characters
067 uReader.skip(s);
068
069 // 2. how many character shall I read?
070 long toRead = e - s;
071
072 // 3. read gtom source into buffer
073 while (
074 toRead > 0 &&
075 (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
076 ) {
077 if (toRead < readLength) {
078 //well, if toRead(long) is less than readLenght(int)
079 //then there can be no overflow, so the cast is safe
080 readLength = (int)toRead;
081 }
082
083 buf.append(readBuffer, 0, readLength);
084 toRead -= readLength;
085 }
086
087 // 4.close reader
088 uReader.close();
089
090 content = new String(buf);
091 originalContent = content;
092 } // Contruction from URL and offsets
093
094 /** Propagate changes to the document content. */
095 void edit(Long start, Long end, DocumentContent replacement)
096 {
097 int s = start.intValue(), e = end.intValue();
098 String repl = replacement == null ? "" :
099 ((DocumentContentImpl) replacement).content;
100 StringBuffer newContent = new StringBuffer(content);
101 newContent.replace(s, e, repl);
102 content = newContent.toString();
103 } // edit(start,end,replacement)
104
105 public DocumentContent getContent(Long start, Long end)
106 throws InvalidOffsetException
107 {
108 if(! isValidOffsetRange(start, end))
109 throw new InvalidOffsetException("Invalid offset range "+start+" to "+end+
110 " for document content of length "+this.size());
111
112 return new DocumentContentImpl(
113 content.substring(start.intValue(), end.intValue())
114 );
115 } // getContent(start, end)
116
117 /** Returns the String representing the content in case of a textual document.
118 * NOTE: this is a temporary solution until we have a more generic one.
119 */
120 public String toString(){
121 return content;
122 }
123
124 /** The size of this content (e.g. character length for textual
125 * content).
126 */
127 public Long size() {
128 return new Long(content.length());
129 } // size()
130
131 /** Check that an offset is valid */
132 boolean isValidOffset(Long offset) {
133 if(offset == null)
134 return false;
135
136 long o = offset.longValue();
137 long len = content.length();
138 if(o > len || o < 0)
139 return false;
140
141 return true;
142 } // isValidOffset
143
144 /** Check that both start and end are valid offsets and that
145 * they constitute a valid offset range
146 */
147 boolean isValidOffsetRange(Long start, Long end) {
148 return
149 isValidOffset(start) && isValidOffset(end) &&
150 start.longValue() <= end.longValue();
151 } // isValidOffsetRange(start,end)
152
153 /** Two documents are the same if their contents is the same
154 */
155 public boolean equals(Object other) {
156 if (!(other instanceof DocumentContentImpl)) return false;
157
158 DocumentContentImpl docImpl = (DocumentContentImpl) other;
159 return content.equals(docImpl.toString());
160 } // equals
161
162 /** Calculate the hash value for the object. */
163 public int hashCode(){ return toString().hashCode(); }
164
165 /** Just for now - later we have to cater for different types of
166 * content.
167 */
168 String content;
169
170 /**
171 * For preserving the original content of the document.
172 * The edit command didn't affect on the original content.
173 * If you construct the content by URL the originalContent will keep
174 * whole information retrieved by URL even you set some start and end.
175 */
176 String originalContent;
177
178 /**
179 * Return the original content of the document received during the loading
180 * phase or on construction from string.
181 */
182 public String getOriginalContent() { return originalContent; }
183
184 /** For ranges */
185 public DocumentContentImpl(String s)
186 { content = s; originalContent = content; }
187
188 /** Freeze the serialization UID. */
189 static final long serialVersionUID = -1426940535575467461L;
190 } // class DocumentContentImpl
|