001 /*
002 * RepositioningInfo.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Angel Kirilov, 04/January/2002
013 *
014 * $Id: RepositioningInfo.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.corpora;
018
019 import java.io.Serializable;
020 import java.util.ArrayList;
021
022 import gate.util.Out;
023
/**
 * Keeps the correspondence between positions in the original document
 * content and positions in the extracted document content.
 * <br>
 *
 * The list holds {@link PositionInfo} records.
 * {@link #addPositionInfo(long, long, long, long)} inserts them ordered by
 * their position in the original content, and the lookup methods scan the
 * list linearly in that order. The lookups are "strict": they return -1
 * when a position is not covered by any record. The "flow" lookups
 * ({@link #getExtractedPosFlow(long)} and {@link #getOriginalPosFlow(long)}),
 * meant to return a nearby computable position, are not implemented yet.
 * <br>
 *
 * The class extends the raw <code>ArrayList</code> type; elements are
 * expected to be {@link PositionInfo} instances and every accessor in this
 * class casts them accordingly.
 */
public class RepositioningInfo extends ArrayList {

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -2895662600168468559L;

  /**
   * Information keeper for one piece of text: where it starts and how long
   * it is in the original content and in the extracted content.
   * There are no public setters; only the enclosing class adjusts the
   * positions of existing records.
   */
  public class PositionInfo implements Serializable {

    /** Freeze the serialization UID. */
    static final long serialVersionUID = -7747351720249898499L;

    /** Start and length in the original content. */
    private long m_origPos, m_origLength;

    /** Start and length in the extracted content. */
    private long m_currPos, m_currLength;

    /**
     * The only constructor.
     *
     * @param orig start of the piece of text in the original content.
     * @param origLen length of the piece of text in the original content.
     * @param curr start of the piece of text in the extracted content.
     * @param currLen length of the piece of text in the extracted content.
     */
    public PositionInfo(long orig, long origLen, long curr, long currLen) {
      m_origPos = orig;
      m_origLength = origLen;
      m_currPos = curr;
      m_currLength = currLen;
    }

    /** Position in the extracted (and probably changed) content. */
    public long getCurrentPosition() {
      return m_currPos;
    }

    /** Position in the original content. */
    public long getOriginalPosition() {
      return m_origPos;
    }

    /** Length of the piece of text in the original content. */
    public long getOriginalLength() {
      return m_origLength;
    }

    /** Length of the piece of text in the extracted content. */
    public long getCurrentLength() {
      return m_currLength;
    }

    /**
     * For debug purposes.
     *
     * @return <code>(origPos,origLength,currPos,currLength)</code>.
     */
    public String toString() {
      return "(" + m_origPos + "," + m_origLength + ","
             + m_currPos + "," + m_currLength + ")";
    }
  } // class PositionInfo

  /** Default constructor. Creates an empty list of records. */
  public RepositioningInfo() {
    super();
  }

  /** Returns the record stored at <code>index</code>, cast to its real type. */
  private PositionInfo infoAt(int index) {
    return (PositionInfo) get(index);
  }

  /**
   * Create a new position information record and insert it so that the
   * list stays ordered by original position.
   * <br>
   *
   * The list is scanned from its end; the new record is placed right after
   * the last record whose original position is strictly smaller than
   * <code>origPos</code>, or at the front when there is no such record.
   *
   * @param origPos start of the piece of text in the original content.
   * @param origLength length of the piece of text in the original content.
   * @param currPos start of the piece of text in the extracted content.
   * @param currLength length of the piece of text in the extracted content.
   */
  public void addPositionInfo(long origPos, long origLength,
                              long currPos, long currLength) {
    int insertPos = size();
    while(insertPos > 0
          && infoAt(insertPos - 1).getOriginalPosition() >= origPos) {
      insertPos--;
    }

    add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength));
  }

  /**
   * Compute position in extracted content by position in the original
   * content.
   * <br>
   *
   * The first record whose original range <code>[origPos, origPos +
   * origLength]</code> (end inclusive) is not entirely before
   * <code>absPos</code> decides the result. A position exactly on the end
   * boundary of that record maps to the end boundary of the record in the
   * extracted content; this matters for records whose original and
   * extracted lengths differ. Any other covered position maps to the
   * record's current position plus the offset inside the record.
   *
   * @param absPos position in the original content.
   * @return the corresponding position in the extracted content; -1 if the
   *         position falls before the deciding record or after the last
   *         record; <code>absPos</code> itself if the list is empty.
   */
  public long getExtractedPos(long absPos) {
    int size = size();
    if(size == 0) {
      // no repositioning information at all - positions are left untouched
      return absPos;
    }

    for(int i = 0; i < size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();
      long origEnd = origPos + currPI.getOriginalLength();

      if(absPos > origEnd) {
        continue; // this record ends before the requested position
      }
      if(absPos < origPos) {
        // in the gap before this record - outside the range of information
        return -1;
      }
      if(absPos == origEnd) {
        // end boundary of the record: the offset inside the record can not
        // be used because the extracted length may differ from the
        // original length
        return currPI.getCurrentPosition() + currPI.getCurrentLength();
      }
      // current position + offset in this PositionInfo record
      return currPI.getCurrentPosition() + absPos - origPos;
    }

    // after the last repositioning info
    return -1;
  }

  /**
   * Compute position in original content by position in the extracted
   * content. Same as {@link #getOriginalPos(long, boolean)} with
   * <code>afterChar</code> set to <code>false</code>.
   *
   * @param relPos position in the extracted content.
   * @return the corresponding position in the original content, or -1.
   */
  public long getOriginalPos(long relPos) {
    return getOriginalPos(relPos, false);
  }

  /**
   * Compute position in original content by position in the extracted
   * content.
   * <br>
   *
   * Records are scanned in list order. A record covers the extracted range
   * <code>[currPos, currPos + currLength)</code> (end exclusive). When
   * <code>afterChar</code> is <code>true</code>, a position exactly on the
   * end of a record is resolved to the end of that record in the original
   * content instead of being passed on to the following record.
   *
   * @param relPos position in the extracted content.
   * @param afterChar whether a position on the end boundary of a record
   *        should be mapped to the end of that record.
   * @return the corresponding position in the original content; -1 if the
   *         position falls before the deciding record or after the last
   *         record; <code>relPos</code> itself if the list is empty.
   */
  public long getOriginalPos(long relPos, boolean afterChar) {
    int size = size();
    if(size == 0) {
      // no repositioning information at all - positions are left untouched
      return relPos;
    }

    for(int i = 0; i < size; ++i) {
      PositionInfo currPI = infoAt(i);
      long currPos = currPI.getCurrentPosition();
      long currEnd = currPos + currPI.getCurrentLength();

      if(afterChar && relPos == currEnd) {
        return currPI.getOriginalPosition() + currPI.getOriginalLength();
      }
      if(relPos >= currEnd) {
        continue; // this record ends at or before the requested position
      }
      if(relPos < currPos) {
        // in the gap before this record - outside the range of information
        return -1;
      }
      // original position + offset in this PositionInfo record
      return currPI.getOriginalPosition() + relPos - currPos;
    }

    // after the last repositioning info
    return -1;
  }

  /**
   * Not finished yet.
   *
   * @return always -1.
   */
  public long getExtractedPosFlow(long absPos) {
    return -1;
  }

  /**
   * Not finished yet.
   *
   * @return always -1.
   */
  public long getOriginalPosFlow(long relPos) {
    return -1;
  }

  /**
   * Return the index of the position info containing <code>absPos</code>.
   * The end boundary of a record counts as contained. If there is no such
   * position info return -1.
   *
   * @param absPos position in the original content.
   */
  public int getIndexByOriginalPosition(long absPos) {
    int size = size();

    // Find with the linear algorithm. Could be extended to binary search.
    for(int i = 0; i < size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();

      if(absPos <= origPos + currPI.getOriginalLength()) {
        // first record not entirely before absPos: it either contains the
        // position or absPos lies in the gap before it
        return absPos >= origPos ? i : -1;
      }
    }

    return -1;
  }

  /**
   * Return the index of the position info containing <code>absPos</code>
   * or the index of the record before this position. The end boundary of a
   * record counts as contained.
   * Result is -1 if the position is before the first record.
   * Result is size() if the position is after the last record.
   *
   * @param absPos position in the original content.
   */
  public int getIndexByOriginalPositionFlow(long absPos) {
    int size = size();

    // Find with the linear algorithm. Could be extended to binary search.
    for(int i = 0; i < size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();

      if(absPos <= origPos + currPI.getOriginalLength()) {
        // inside the current record, otherwise return the previous one
        return absPos >= origPos ? i : i - 1;
      }
    }

    return size;
  }

  /**
   * Correct the RepositioningInfo structure for shrink/expand changes.
   * <br>
   *
   * Normally the text pieces have the same size in both the original text
   * and the extracted text. But in some cases there are nonlinear
   * substitutions. For example the sequence <code>&amp;lt;</code> is
   * converted to <code>&lt;</code>.
   * <br>
   *
   * The correction splits the corresponding PositionInfo structure into
   * 3 new records - before correction, correction record and after
   * correction. Front and end records are of the same manner as the
   * original record - m_origLength == m_currLength - while the middle
   * record has different values because of the shrink/expand change. The
   * split is done only when the changed range lies completely inside the
   * located record. The front record always replaces the located record;
   * the middle and end records are inserted only when their original
   * length is greater than zero.
   * <br>
   *
   * Before the split, the m_currPos of the located record and of every
   * record after it is corrected with (origLen - newLen) i.e.
   * <code> m_currPos -= origLen - newLen; </code>
   * <br>
   *
   * Nothing is changed when <code>originalPos</code> lies after all
   * records or when the list is empty.
   *
   * @param originalPos Position of changed text in the original content.
   * @param origLen Length of changed piece of text in the original content.
   * @param newLen Length of new piece of text substituting the original
   *        piece.
   */
  public void correctInformation(long originalPos, long origLen, long newLen) {
    int index = getIndexByOriginalPositionFlow(originalPos);

    // correct the index when the originalPos precedes all records
    if(index == -1) {
      index = 0;
    }

    // position is after all records in list (or the list is empty):
    // there is no record to shift or to split
    if(index == size()) {
      return;
    }

    // NOTE(review): the shift starts at the located record itself, so the
    // records created by the split below inherit the shifted position -
    // confirm against the callers that this is the intended contract.
    long shift = origLen - newLen;
    for(int i = index; i < size(); ++i) {
      infoAt(i).m_currPos -= shift;
    }

    PositionInfo currPI = infoAt(index);
    boolean startsInside = originalPos >= currPI.m_origPos;
    boolean endsInside =
      currPI.m_origPos + currPI.m_origLength >= originalPos + origLen;
    if(!(startsInside && endsInside)) {
      // substitution range is not covered by the record - no split
      return;
    }

    long frontLen = originalPos - currPI.m_origPos;
    long endLen = currPI.m_origLength - frontLen - origLen;

    PositionInfo frontPI = new PositionInfo(currPI.m_origPos,
                                            frontLen,
                                            currPI.m_currPos,
                                            frontLen);
    PositionInfo correctPI = new PositionInfo(originalPos,
                                              origLen,
                                              currPI.m_currPos + frontLen,
                                              newLen);
    PositionInfo endPI = new PositionInfo(originalPos + origLen,
                                          endLen,
                                          currPI.m_currPos + frontLen + newLen,
                                          endLen);

    set(index, frontPI); // substitute old element
    // the end record is inserted first, so that the middle record inserted
    // at the same index afterwards ends up in front of it
    if(endPI.m_origLength > 0) {
      add(index + 1, endPI);
    }
    if(correctPI.m_origLength > 0) {
      add(index + 1, correctPI);
    }
  }

  /**
   * Correct the original position information in the records when some
   * text is shrunk/expanded by the parser. With this method is corrected
   * the substitution of "\r\n" with "\n".
   * <br>
   *
   * Every record after the located one gets its original position moved by
   * <code>moveLen</code>. The located record is moved as a whole when
   * <code>originalPos</code> is not after its start, is split into two
   * records when <code>originalPos</code> is strictly inside it (only the
   * second part is moved), and is left untouched otherwise. Nothing is
   * changed when <code>originalPos</code> lies after all records.
   *
   * @param originalPos Position of the change in the original content.
   * @param moveLen Amount added to the original positions that follow.
   */
  public void correctInformationOriginalMove(long originalPos, long moveLen) {
    int index = getIndexByOriginalPositionFlow(originalPos);

    // correct the index when the originalPos precedes all records
    if(index == -1) {
      index = 0;
    }

    // position is after all records in list
    if(index == size()) {
      return;
    }

    for(int i = index + 1; i < size(); ++i) {
      infoAt(i).m_origPos += moveLen;
    }

    PositionInfo currPI = infoAt(index);

    if(originalPos <= currPI.m_origPos) {
      // the position is not after the start of the record - move all of it
      currPI.m_origPos += moveLen;
      return;
    }

    if(originalPos >= currPI.m_origPos + currPI.m_origLength) {
      // at the end boundary of the record or behind it - nothing to split
      return;
    }

    // inside the record - split it into two new records
    long frontLen = originalPos - currPI.m_origPos;
    long endLen = currPI.m_origLength - frontLen;

    PositionInfo frontPI = new PositionInfo(currPI.m_origPos,
                                            frontLen,
                                            currPI.m_currPos,
                                            frontLen);
    PositionInfo endPI = new PositionInfo(originalPos + moveLen,
                                          endLen,
                                          currPI.m_currPos + frontLen,
                                          endLen);

    set(index, frontPI); // substitute old element
    if(endPI.m_origLength != 0) {
      add(index + 1, endPI); // insert new end element
    }
  }

} // class RepositioningInfo
|