01 /*
02 * XmlPositionCorrectionHandler.java
03 *
04 * Copyright (c) 1995-2010, The University of Sheffield. See the file
05 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
06 *
07 * This file is part of GATE (see http://gate.ac.uk/), and is free
08 * software, licenced under the GNU Library General Public License,
09 * Version 2, June 1991 (in the distribution as file licence.html,
10 * and also available at http://gate.ac.uk/gate/licence.html).
11 *
12 * Angel Kirilov, 4 January 2002
13 *
14 * $Id: XmlPositionCorrectionHandler.java 12006 2009-12-01 17:24:28Z thomas_heitz $
15 */
16
17 package gate.xml;
18
19 import org.xml.sax.helpers.DefaultHandler;
20
21
/**
 * This class corrects a Xerces parser bug in the position reported during
 * the parsing process. The Xerces parser cuts the processed file into 16K
 * pieces. When the parser crosses a 16K border, the position reported in
 * characters() starts again from zero.
 *
 * This bug can be covered by extending this content handler instead of
 * org.xml.sax.helpers.DefaultHandler.
 *
 * The real content handler should call the methods startDocument() and
 * characters() of this class in order to compute the correct position in the
 * file. The corrected position can be read through the protected data member
 * m_realOffset or with getRealOffset().
 */
public class XmlPositionCorrectionHandler extends DefaultHandler {

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Size (16K) of one piece; each detected border adds this to the offset. */
  private static final int CHUNK_SIZE = 0x4000;

  /**
   * A border is assumed when the reported offset drops by more than this
   * amount compared with the previous characters() call.
   */
  private static final int BACKWARD_JUMP_THRESHOLD = 0x2000;

  /**
   * A border is assumed when the reported offset is zero and the previous
   * call ended (offset + length) beyond this value.
   */
  private static final int CHUNK_END_THRESHOLD = 0x3000;

  /**
   * Variables for correction of 16K parser limit for offset
   */
  /** Corrected offset computed by the last accepted characters() call. */
  protected long m_realOffset;
  /** Raw offset reported in the last accepted characters() call. */
  private int m_lastPosition;
  /** Length reported in the last accepted characters() call. */
  private int m_lastSize;
  /** Number of 16K borders detected so far in the current document. */
  private int m_multiplyer;

  /** Constructor for initialization of variables */
  public XmlPositionCorrectionHandler() {
    resetState();
  } // XmlPositionCorrectionHandler

  /**
   * Initialization of variables on start of document parsing.
   *
   * @throws org.xml.sax.SAXException declared for compatibility with the SAX
   *         callback; this implementation does not throw it
   */
  @Override
  public void startDocument() throws org.xml.sax.SAXException {
    resetState();
  } // startDocument

  /**
   * Return corrected offset for last characters() call.
   *
   * @return the corrected offset; zero if no characters() call has been
   *         accepted since construction or the last startDocument()
   */
  public long getRealOffset() {
    return m_realOffset;
  } // getRealOffset

  /**
   * Here is the correction of the Xerces parser bug. Detects a 16K border
   * from the raw offset and length of this and the previous call, and updates
   * the corrected offset accordingly.
   *
   * @param text the character buffer supplied by the parser; only its length
   *        is inspected here
   * @param offset the raw start position reported by the parser
   * @param len the number of characters reported by the parser
   * @throws org.xml.sax.SAXException declared for compatibility with the SAX
   *         callback; this implementation does not throw it
   */
  @Override
  public void characters(char[] text, int offset, int len)
      throws org.xml.sax.SAXException {
    if(offset == 0 && len == 1 && text.length <= 2) {
      // unicode char or &xxx; coding - reported in a tiny separate buffer,
      // so it must not influence the tracked position
      return;
    } // if

    // There is 16K limit for offset. Here is the correction.
    // Will catch the bug in most cases.
    boolean jumpedBackwards =
        m_lastPosition - offset > BACKWARD_JUMP_THRESHOLD;
    boolean restartedNearChunkEnd =
        offset == 0 && m_lastSize + m_lastPosition > CHUNK_END_THRESHOLD;
    if(jumpedBackwards || restartedNearChunkEnd) {
      m_multiplyer++;
    }
    m_lastPosition = offset;
    m_lastSize = len;
    // Widen to long BEFORE multiplying: in int arithmetic the product wraps
    // to a negative value once the corrected offset reaches 2 GiB.
    m_realOffset = (long) m_multiplyer * CHUNK_SIZE + offset;
  } // characters

  /** Resets all position tracking state to the start of a document. */
  private void resetState() {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_lastSize = 0;
    m_multiplyer = 0;
  } // resetState

} // XmlPositionCorrectionHandler
|