001 /*
002 * Parser.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: Parser.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic;
009
010 import java.io.IOException;
011 import java.io.StringReader;
012 import java.util.ArrayList;
013 import java.util.Iterator;
014 import java.util.List;
015 import java.util.Map;
016 import java.util.Set;
017
018 import org.jdom.Element;
019 import org.jdom.JDOMException;
020 import org.jdom.input.SAXBuilder;
021
022 /**
023 * This class provides utility methods to export the Hits to XML and
024 * read them back from XML to HIT objects.
025 *
026 * @author niraj
027 *
028 */
029 public class Parser {
030
031 /**
032 * HITS XML Element
033 */
034 public static final String HITS = "HITS";
035
036 /**
037 * HIT XML Element
038 */
039 public static final String HIT = "HIT";
040
041 /**
042 * DOC_ID XML Element
043 */
044 public static final String DOC_ID = "DOC_ID";
045
046 /**
047 * ANNOTATION_SET_NAME XML Element
048 */
049 public static final String ANNOTATION_SET_NAME = "ANNOTATION_SET_NAME";
050
051 /**
052 * START_OFFSET XML Element
053 */
054 public static final String START_OFFSET = "START_OFFSET";
055
056 /**
057 * END_OFFSET XML Element
058 */
059 public static final String END_OFFSET = "END_OFFSET";
060
061 /**
062 * QUERY XML Element
063 */
064 public static final String QUERY = "QUERY";
065
066 /**
067 * LEFT_CONTEXT_START_OFFSET XML Element
068 */
069 public static final String LEFT_CONTEXT_START_OFFSET = "LEFT_CONTEXT_START_OFFSET";
070
071 /**
072 * RIGHT_CONTEXT_END_OFFSET XML Element
073 */
074 public static final String RIGHT_CONTEXT_END_OFFSET = "RIGHT_CONTEXT_END_OFFSET";
075
076 /**
077 * PATTERN_TEXT XML Element
078 */
079 public static final String PATTERN_TEXT = "PATTERN_TEXT";
080
081 /**
082 * PATTERN_ANNOTATIONS XML Element
083 */
084 public static final String PATTERN_ANNOTATIONS = "PATTERN_ANNOTATIONS";
085
086 /**
087 * PATTERN_ANNOTATION XML Element
088 */
089 public static final String PATTERN_ANNOTATION = "PATTERN_ANNOTATION";
090
091 /**
092 * START XML Element
093 */
094 public static final String START = "START";
095
096 /**
097 * END XML Element
098 */
099 public static final String END = "END";
100
101 /**
102 * TEXT XML Element
103 */
104 public static final String TEXT = "TEXT";
105
106 /**
107 * TYPE XML Element
108 */
109 public static final String TYPE = "TYPE";
110
111 /**
112 * POSITION XML Element
113 */
114 public static final String POSITION = "POSITION";
115
116 /**
117 * FEATURES XML Element
118 */
119 public static final String FEATURES = "FEATURES";
120
121 /**
122 * FEATURE XML Element
123 */
124 public static final String FEATURE = "FEATURE";
125
126 /**
127 * KEY XML Element
128 */
129 public static final String KEY = "KEY";
130
131 /**
132 * VALUE XML Element
133 */
134 public static final String VALUE = "VALUE";
135
136 /**
137 * Given an array of instances of Hit, this method returns an xml
138 * representation of the Hit
139 *
140 * @param hits
141 * @return
142 */
143 public static String toXML(Hit[] hits) {
144 StringBuffer sb = new StringBuffer();
145
146 // first sentence
147 sb.append("<?xml version=\"1.0\"?>\n");
148
149 // root element
150 sb.append(wrap(HITS, true));
151
152 // iterating through each hit
153 for(int i = 0; i < hits.length; i++) {
154
155 // adding a hit element
156 sb.append(wrap(HIT, true));
157 sb.append(wrap(DOC_ID, hits[i].documentID));
158 sb.append(wrap(ANNOTATION_SET_NAME, hits[i].annotationSetName));
159 sb.append(wrap(START_OFFSET, hits[i].startOffset));
160 sb.append(wrap(END_OFFSET, hits[i].endOffset));
161 sb.append(wrap(QUERY, hits[i].queryString));
162
163
164 // it hit is an instance of Pattern, we need to add further
165 // information as well
166 if(hits[i] instanceof Pattern) {
167 Pattern pat = (Pattern)hits[i];
168 sb.append(wrap(LEFT_CONTEXT_START_OFFSET, pat
169 .getLeftContextStartOffset()));
170 sb
171 .append(wrap(RIGHT_CONTEXT_END_OFFSET, pat
172 .getRightContextEndOffset()));
173 sb.append(wrap(PATTERN_TEXT, pat.getPatternText()));
174
175 PatternAnnotation[] annots = pat.getPatternAnnotations();
176
177 // all annotations should be exported
178 sb.append(wrap(PATTERN_ANNOTATIONS, true));
179
180 // one annotation at a time
181 for(int j = 0; j < annots.length; j++) {
182 sb.append(wrap(PATTERN_ANNOTATION, true));
183 sb.append(wrap(START, annots[j].getStartOffset()));
184 sb.append(wrap(END, annots[j].getEndOffset()));
185 sb.append(wrap(TEXT, annots[j].getText()));
186 sb.append(wrap(TYPE, annots[j].getType()));
187 sb.append(wrap(POSITION, annots[j].getPosition()));
188 // exporting features as well
189 Map<String, String> features = annots[j].getFeatures();
190 sb.append(wrap(FEATURES, true));
191 // one feature at a time
192 if(features != null) {
193 Set<String> keySet = features.keySet();
194 if(keySet != null) {
195 Iterator<String> iter = keySet.iterator();
196 while(iter.hasNext()) {
197 sb.append(wrap(FEATURE, true));
198 String key = iter.next();
199 sb.append(wrap(KEY, key));
200 String value = features.get(key);
201 sb.append(wrap(VALUE, value));
202 sb.append(wrap(FEATURE, false));
203 }
204 }
205 }
206 sb.append(wrap(FEATURES, false));
207 sb.append(wrap(PATTERN_ANNOTATION, false));
208 }
209 sb.append(wrap(PATTERN_ANNOTATIONS, false));
210 }
211 sb.append(wrap(HIT, false));
212 }
213 sb.append(wrap(HITS, false));
214 return sb.toString();
215 }
216
217 /**
218 * This method replaces all the special characters (invalid xml characters) with their respective legal sequences.
219 * These includes &, <, >, \ and '.
220 * @param s
221 * @return
222 */
223 public static String replaceAmpChars(String s) {
224 s = s.replaceAll("&", "&");
225 s = s.replaceAll("<", "<");
226 s = s.replaceAll(">", ">");
227 s = s.replaceAll("\"", """);
228 s = s.replaceAll("'", "'");
229 return s;
230 }
231
232 /**
233 * Given xml representation of HIT converts them into an array of hits
234 *
235 * @param xml
236 * @return
237 * @throws IOException
238 */
239 public static Hit[] fromXML(String xml) throws IOException, JDOMException {
240 SAXBuilder saxBuilder = new SAXBuilder(false);
241 org.jdom.Document jdomDoc = saxBuilder.build(new StringReader(xml));
242 Element rootElement = jdomDoc.getRootElement();
243 if(!rootElement.getName().equalsIgnoreCase(HITS)) {
244 throw new IOException("Root element must be " + HITS);
245 }
246
247 // rootElement is HITS
248 // this will internally contains instances of HIT
249 List hitsChildren = rootElement.getChildren(HIT);
250 Hit[] hits = new Hit[hitsChildren.size()];
251
252 for(int i = 0; i < hitsChildren.size(); i++) {
253 Element hitElem = (Element)hitsChildren.get(i);
254 int startOffset = Integer.parseInt(hitElem.getChildText(START_OFFSET));
255 int endOffset = Integer.parseInt(hitElem.getChildText(END_OFFSET));
256 String docID = hitElem.getChildText(DOC_ID);
257 String annotationSetName = hitElem.getChildText(ANNOTATION_SET_NAME);
258 String queryString = hitElem.getChildText(QUERY);
259
260 Element patternAnnotations = hitElem.getChild(PATTERN_ANNOTATIONS);
261 if(patternAnnotations == null) {
262 hits[i] = new Hit(docID, annotationSetName, startOffset, endOffset, queryString);
263 continue;
264 }
265
266 List patAnnots = patternAnnotations.getChildren(PATTERN_ANNOTATION);
267 List<PatternAnnotation> patAnnotsList = new ArrayList<PatternAnnotation>();
268 for(int j = 0; j < patAnnots.size(); j++) {
269 Element patAnnot = (Element)patAnnots.get(j);
270 PatternAnnotation pa = new PatternAnnotation();
271 pa.setStOffset(Integer.parseInt(patAnnot.getChildText(START)));
272 pa.setEnOffset(Integer.parseInt(patAnnot.getChildText(END)));
273 pa.setPosition(Integer.parseInt(patAnnot.getChildText(POSITION)));
274 pa.setText(patAnnot.getChildText(TEXT));
275 pa.setType(patAnnot.getChildText(TYPE));
276
277 // we need to find out its features
278 Element featuresElem = patAnnot.getChild(FEATURES);
279 // more than one features possible
280 List featuresElemsList = featuresElem.getChildren(FEATURE);
281 for(int k = 0; k < featuresElemsList.size(); k++) {
282 Element featureElem = (Element)featuresElemsList.get(k);
283 String key = featureElem.getChildText(KEY);
284 String value = featureElem.getChildText(VALUE);
285 pa.addFeature(key, value);
286 }
287 patAnnotsList.add(pa);
288 }
289
290 String patternText = hitElem.getChildText(PATTERN_TEXT);
291 int leftCSO = Integer.parseInt(hitElem
292 .getChildText(LEFT_CONTEXT_START_OFFSET));
293 int rightCEO = Integer.parseInt(hitElem
294 .getChildText(RIGHT_CONTEXT_END_OFFSET));
295
296 hits[i] = new Pattern(docID, annotationSetName, patternText, startOffset, endOffset,
297 leftCSO, rightCEO, patAnnotsList, queryString);
298 }
299 return hits;
300 }
301
302 /**
303 * wraps the element into the following format
304 *
305 * @param elementText
306 * @param elementValue
307 * @return <elementText>elementValue</elementText>\n
308 */
309 public static String wrap(String elementText, String elementValue) {
310 if(elementValue == null) {
311 return "<" + elementText + "> </" + elementText + ">\n";
312 }
313 return "<" + elementText + ">" + replaceAmpChars(elementValue) + "</"
314 + elementText + ">\n";
315 }
316
317 /**
318 * wraps the element into the following format
319 *
320 * @param elementText
321 * @param elementValue
322 * @return <elementText>elementValue</elementText>\n
323 */
324 public static String wrap(String elementText, int elementValue) {
325 return wrap(elementText, "" + elementValue);
326 }
327
328 /**
329 * wraps the element into the following format
330 *
331 * @param elementText
332 * @param startElement
333 * @return "<" + (startElement ? "" : "/") + elementText + ">\n";
334 */
335 public static String wrap(String elementText, boolean startElement) {
336 return "<" + (startElement ? "" : "/") + elementText + ">\n";
337 }
338 }
|