001 /*
002 * DocumentXmlUtils.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Ian Roberts, 20/Jul/2006
013 *
014 * $Id: DocumentXmlUtils.java 12950 2010-08-11 18:58:56Z bensonmargulies $
015 */
016 package gate.corpora;
017
018 import gate.Document;
019 import gate.TextualDocument;
020 import gate.Annotation;
021 import gate.AnnotationSet;
022 import gate.FeatureMap;
023 import gate.event.StatusListener;
024 import gate.util.Strings;
025 import gate.util.Err;
026
027 import java.util.Collection;
028 import java.util.Map;
029 import java.util.HashMap;
030 import java.util.TreeMap;
031 import java.util.Set;
032 import java.util.TreeSet;
033 import java.util.Iterator;
034
035 /**
036 * This class is contains useful static methods for working with the GATE XML
037 * format. Many of the methods in this class were originally in {@link
038 * DocumentImpl} but as they are not specific to any one implementation of the
039 * <code>Document</code> interface they have been moved here.
040 */
041 public class DocumentXmlUtils {
042
043 /**
044 * This field is used when creating StringBuffers for toXml() methods. The
045 * size of the StringBuffer will be docDonctent.size() multiplied by this
046 * value. It is aimed to improve the performance of StringBuffer
047 */
048 public static final int DOC_SIZE_MULTIPLICATION_FACTOR = 40;
049
050 /**
051 * Returns a GateXml document that is a custom XML format for wich there is a
052 * reader inside GATE called gate.xml.GateFormatXmlHandler. What it does is to
053 * serialize a GATE document in an XML format.
054 *
055 * @param doc the document to serialize.
056 * @return a string representing a Gate Xml document.
057 */
058 public static String toXml(TextualDocument doc) {
059 // Initialize the xmlContent several time the size of the current document.
060 // This is because of the tags size. This measure is made to increase the
061 // performance of StringBuffer.
062 StringBuffer xmlContent = new StringBuffer(
063 DOC_SIZE_MULTIPLICATION_FACTOR
064 * (doc.getContent().size().intValue()));
065 // Add xml header
066 xmlContent.append("<?xml version=\"1.0\" encoding=\"");
067 xmlContent.append(doc.getEncoding());
068 xmlContent.append("\" ?>");
069 xmlContent.append(Strings.getNl());
070 // Add the root element
071 xmlContent.append("<GateDocument>\n");
072 xmlContent.append("<!-- The document's features-->\n\n");
073 xmlContent.append("<GateDocumentFeatures>\n");
074 xmlContent.append(featuresToXml(doc.getFeatures(),null));
075 xmlContent.append("</GateDocumentFeatures>\n");
076 xmlContent.append("<!-- The document content area with serialized"
077 + " nodes -->\n\n");
078 // Add plain text element
079 xmlContent.append("<TextWithNodes>");
080 xmlContent.append(textWithNodes(doc, doc.getContent().toString()));
081 xmlContent.append("</TextWithNodes>\n");
082 // Serialize as XML all document's annotation sets
083 // Serialize the default AnnotationSet
084 StatusListener sListener = (StatusListener)gate.Gate
085 .getListeners().get("gate.event.StatusListener");
086 if(sListener != null)
087 sListener.statusChanged("Saving the default annotation set ");
088 xmlContent.append("<!-- The default annotation set -->\n\n");
089 annotationSetToXml(doc.getAnnotations(), xmlContent);
090 // Serialize all others AnnotationSets
091 // namedAnnotSets is a Map containing all other named Annotation Sets.
092 Map namedAnnotSets = doc.getNamedAnnotationSets();
093 if(namedAnnotSets != null) {
094 Iterator iter = namedAnnotSets.values().iterator();
095 while(iter.hasNext()) {
096 AnnotationSet annotSet = (AnnotationSet)iter.next();
097 xmlContent.append("<!-- Named annotation set -->\n\n");
098 // Serialize it as XML
099 if(sListener != null)
100 sListener.statusChanged("Saving " + annotSet.getName()
101 + " annotation set ");
102 annotationSetToXml(annotSet, xmlContent);
103 }// End while
104 }// End if
105 // Add the end of GateDocument
106 xmlContent.append("</GateDocument>");
107 if(sListener != null) sListener.statusChanged("Done !");
108 // return the XmlGateDocument
109 return xmlContent.toString();
110 }
111
112
113 /**
114 * This method saves a FeatureMap as XML elements.
115 *
116 * @param aFeatureMap
117 * the feature map that has to be saved as XML.
118 * @return a String like this: <Feature><Name>...</Name> <Value>...</Value></Feature><Feature>...</Feature>
119 */
120 public static StringBuffer featuresToXml(FeatureMap aFeatureMap, Map normalizedFeatureNames) {
121 if(aFeatureMap == null) return new StringBuffer();
122 StringBuffer buffer = new StringBuffer(1024);
123 Set keySet = aFeatureMap.keySet();
124 Iterator keyIterator = keySet.iterator();
125 while(keyIterator.hasNext()) {
126 Object key = keyIterator.next();
127 Object value = aFeatureMap.get(key);
128 if((key != null) && (value != null)) {
129 String keyClassName = null;
130 String keyItemClassName = null;
131 String valueClassName = null;
132 String valueItemClassName = null;
133 String key2String = key.toString();
134 String value2String = value.toString();
135 Object item = null;
136 // Test key if it is String, Number or Collection
137 if(key instanceof java.lang.String || key instanceof java.lang.Number
138 || key instanceof java.util.Collection)
139 keyClassName = key.getClass().getName();
140 // Test value if it is String, Number or Collection
141 if(value instanceof java.lang.String
142 || value instanceof java.lang.Number
143 || value instanceof java.util.Collection)
144 valueClassName = value.getClass().getName();
145 // Features and values that are not Strings, Numbers or collections
146 // will be discarded.
147 if(keyClassName == null || valueClassName == null) continue;
148 // If key is collection serialize the collection in a specific format
149 if(key instanceof java.util.Collection) {
150 StringBuffer keyStrBuff = new StringBuffer();
151 Iterator iter = ((Collection)key).iterator();
152 if(iter.hasNext()) {
153 item = iter.next();
154 if(item instanceof java.lang.Number)
155 keyItemClassName = item.getClass().getName();
156 else keyItemClassName = String.class.getName();
157 keyStrBuff.append(item.toString());
158 }// End if
159 while(iter.hasNext()) {
160 item = iter.next();
161 keyStrBuff.append(";").append(item.toString());
162 }// End while
163 key2String = keyStrBuff.toString();
164 }// End if
165 // If key is collection serialize the colection in a specific format
166 if(value instanceof java.util.Collection) {
167 StringBuffer valueStrBuff = new StringBuffer();
168 Iterator iter = ((Collection)value).iterator();
169 if(iter.hasNext()) {
170 item = iter.next();
171 if(item instanceof java.lang.Number)
172 valueItemClassName = item.getClass().getName();
173 else valueItemClassName = String.class.getName();
174 valueStrBuff.append(item.toString());
175 }// End if
176 while(iter.hasNext()) {
177 item = iter.next();
178 valueStrBuff.append(";").append(item.toString());
179 }// End while
180 value2String = valueStrBuff.toString();
181 }// End if
182 buffer.append("<Feature>\n <Name");
183 if(keyClassName != null)
184 buffer.append(" className=\"").append(keyClassName).append("\"");
185 if(keyItemClassName != null)
186 buffer.append(" itemClassName=\"").append(keyItemClassName).append(
187 "\"");
188 buffer.append(">");
189
190 // use a map of keys already checked for XML validity
191 StringBuffer normalizedKey = new StringBuffer(key2String);
192 if (normalizedFeatureNames!=null){
193 // has this key been already converted ?
194 normalizedKey = (StringBuffer)normalizedFeatureNames.get(key2String);
195 if (normalizedKey==null){
196 // never seen so far!
197 normalizedKey= combinedNormalisation(key2String);
198 normalizedFeatureNames.put(key2String,normalizedKey);
199 }
200 }
201 else normalizedKey = combinedNormalisation(key2String);
202
203 buffer.append(normalizedKey);
204 buffer.append("</Name>\n <Value");
205 if(valueClassName != null)
206 buffer.append(" className=\"").append(valueClassName).append("\"");
207 if(valueItemClassName != null)
208 buffer.append(" itemClassName=\"").append(valueItemClassName).append(
209 "\"");
210 buffer.append(">");
211 buffer.append(combinedNormalisation(value2String));
212 buffer.append("</Value>\n</Feature>\n");
213 }// End if
214 }// end While
215 return buffer;
216 }// featuresToXml
217
218
219 /**
220 * Combines replaceCharsWithEntities and filterNonXmlChars in a single method
221 **/
222 public static StringBuffer combinedNormalisation(String inputString){
223 if(inputString == null) return new StringBuffer("");
224 StringBuffer buffer = new StringBuffer(inputString);
225 for (int i=buffer.length()-1; i>=0; i--){
226 char currentchar = buffer.charAt(i);
227 // is the current character an xml char which needs replacing?
228 if(!isXmlChar(currentchar)) buffer.replace(i,i+1," ");
229 // is the current character an xml char which needs replacing?
230 else if(currentchar == '<' || currentchar == '>' || currentchar == '&'|| currentchar == '\''|| currentchar == '\"' || currentchar == 0xA0 || currentchar == 0xA9)
231 buffer.replace(i,i+1,(String) entitiesMap.get(new Character(currentchar)));
232 }
233 return buffer;
234 }
235
236 /**
237 * This method filters any non XML char see:
238 * http://www.w3c.org/TR/2000/REC-xml-20001006#charsets All non XML chars will
239 * be replaced with 0x20 (space char) This assures that the next time the
240 * document is loaded there won't be any problems.
241 *
242 * @param aStrBuffer
243 * represents the input String that is filtred. If the aStrBuffer is
244 * null then an empty string will be returend
245 * @return the "purified" StringBuffer version of the aStrBuffer
246 */
247 public static StringBuffer filterNonXmlChars(StringBuffer aStrBuffer) {
248 if(aStrBuffer == null) return new StringBuffer("");
249 // String space = new String(" ");
250 char space = ' ';
251 for(int i = aStrBuffer.length() - 1; i >= 0; i--) {
252 if(!isXmlChar(aStrBuffer.charAt(i))) aStrBuffer.setCharAt(i, space);
253 }// End for
254 return aStrBuffer;
255 }// filterNonXmlChars()
256
257 /**
258 * This method decide if a char is a valid XML one or not
259 *
260 * @param ch
261 * the char to be tested
262 * @return true if is a valid XML char and fals if is not.
263 */
264 public static boolean isXmlChar(char ch) {
265 if(ch == 0x9 || ch == 0xA || ch == 0xD) return true;
266 if((0x20 <= ch) && (ch <= 0xD7FF)) return true;
267 if((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
268 if((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
269 return false;
270 }// End isXmlChar()
271
272
273 /** This method replace all chars that appears in the anInputString and also
274 * that are in the entitiesMap with their corresponding entity
275 * @param anInputString the string analyzed. If it is null then returns the
276 * empty string
277 * @return a string representing the input string with chars replaced with
278 * entities
279 */
280 public static StringBuffer replaceCharsWithEntities(String anInputString){
281 if (anInputString == null) return new StringBuffer("");
282 StringBuffer strBuff = new StringBuffer(anInputString);
283 for (int i=strBuff.length()-1; i>=0; i--){
284 Character ch = new Character(strBuff.charAt(i));
285 if (entitiesMap.keySet().contains(ch)){
286 strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
287 }// End if
288 }// End for
289 return strBuff;
290 }// replaceCharsWithEntities()
291
292 /**
293 * Returns the document's text interspersed with <Node> elements at all
294 * points where the document has an annotation beginning or ending.
295 */
296 public static String textWithNodes(TextualDocument doc, String aText) {
297 // filterNonXmlChars
298 // getoffsets for Nodes
299 // getoffsets for XML entities
300 if(aText == null) return new String("");
301 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
302 // Construct a map from offsets to Chars ()
303 TreeMap offsets2CharsMap = new TreeMap();
304 if(aText.length() != 0) {
305 // Fill the offsets2CharsMap with all the indices where special chars
306 // appear
307 buildEntityMapFromString(aText, offsets2CharsMap);
308 }// End if
309 // Construct the offsetsSet for all nodes belonging to this document
310 TreeSet offsetsSet = new TreeSet();
311 Iterator<Annotation> annotSetIter = doc.getAnnotations().iterator();
312 while(annotSetIter.hasNext()) {
313 Annotation annot = annotSetIter.next();
314 offsetsSet.add(annot.getStartNode().getOffset());
315 offsetsSet.add(annot.getEndNode().getOffset());
316 }// end While
317 // Get the nodes from all other named annotation sets.
318 Map namedAnnotSets = doc.getNamedAnnotationSets();
319 if(namedAnnotSets != null) {
320 Iterator iter = namedAnnotSets.values().iterator();
321 while(iter.hasNext()) {
322 AnnotationSet annotSet = (AnnotationSet)iter.next();
323 Iterator<Annotation> iter2 = annotSet.iterator();
324 while(iter2.hasNext()) {
325 Annotation annotTmp = iter2.next();
326 offsetsSet.add(annotTmp.getStartNode().getOffset());
327 offsetsSet.add(annotTmp.getEndNode().getOffset());
328 }// End while
329 }// End while
330 }// End if
331 // offsetsSet is ordered in ascending order because the structure
332 // is a TreeSet
333 if(offsetsSet.isEmpty()) { return replaceCharsWithEntities(aText)
334 .toString(); }// End if
335
336 // create a large StringBuffer
337 StringBuffer modifiedBuffer = new StringBuffer(textWithNodes.length() * 2);
338
339 // last character copied from the original String
340 int lastCharactercopied = 0;
341
342 // append to buffer all text up to next offset
343 // for node or entity
344 // we need to iterate on offsetSet and offsets2CharsMap
345 Set allOffsets = new TreeSet();
346 allOffsets.addAll(offsetsSet);
347 allOffsets.addAll(offsets2CharsMap.keySet());
348 Iterator allOffsetsIterator = allOffsets.iterator();
349 while (allOffsetsIterator.hasNext()){
350 Long nextOffset = (Long)allOffsetsIterator.next();
351 int nextOffsetint = nextOffset.intValue();
352 // is there some text to add since last time?
353 if (nextOffsetint>lastCharactercopied){
354 modifiedBuffer.append(textWithNodes.substring(lastCharactercopied,nextOffsetint));
355 lastCharactercopied=nextOffsetint;
356 }
357 // do we need to add a node information here?
358 if (offsetsSet.contains(nextOffset))
359 modifiedBuffer.append("<Node id=\"").append(nextOffsetint).append("\"/>");
360
361 // do we need to convert an XML entity?
362 if (offsets2CharsMap.containsKey(nextOffset)){
363 String entityString = (String)entitiesMap.get((Character)offsets2CharsMap.get(nextOffset));
364 // skip the character in the original String
365 lastCharactercopied++;
366 // append the corresponding entity
367 modifiedBuffer.append(entityString);
368 }
369 }
370 // copies the remaining text
371 modifiedBuffer.append(textWithNodes.substring(lastCharactercopied,textWithNodes.length()));
372
373 return modifiedBuffer.toString();
374 }
375
376 /**
377 * This method takes aScanString and searches for those chars from entitiesMap
378 * that appear in the string. A tree map(offset2Char) is filled using as key
379 * the offsets where those Chars appear and the Char. If one of the params is
380 * null the method simply returns.
381 */
382 public static void buildEntityMapFromString(String aScanString, TreeMap aMapToFill) {
383 if(aScanString == null || aMapToFill == null) return;
384 if(entitiesMap == null || entitiesMap.isEmpty()) {
385 Err.prln("WARNING: Entities map was not initialised !");
386 return;
387 }// End if
388 // Fill the Map with the offsets of the special chars
389 Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
390 Character c;
391 int fromIndex;
392 while(entitiesMapIterator.hasNext()) {
393 c = (Character)entitiesMapIterator.next();
394 fromIndex = 0;
395 while(-1 != fromIndex) {
396 fromIndex = aScanString.indexOf(c.charValue(), fromIndex);
397 if(-1 != fromIndex) {
398 aMapToFill.put(new Long(fromIndex), c);
399 fromIndex++;
400 }// End if
401 }// End while
402 }// End while
403 }// buildEntityMapFromString();
404
405 /**
406 * This method saves an AnnotationSet as XML.
407 *
408 * @param anAnnotationSet
409 * The annotation set that has to be saved as XML.
410 * @return a String like this: <AnnotationSet> <Annotation>....
411 * </AnnotationSet>
412 */
413 public static void annotationSetToXml(AnnotationSet anAnnotationSet,
414 StringBuffer buffer) {
415 if(anAnnotationSet == null) {
416 buffer.append("<AnnotationSet>\n");
417 buffer.append("</AnnotationSet>\n");
418 return;
419 }// End if
420 if(anAnnotationSet.getName() == null)
421 buffer.append("<AnnotationSet>\n");
422 else {
423 buffer.append("<AnnotationSet Name=\"");
424 buffer.append(anAnnotationSet.getName());
425 buffer.append("\" >\n");
426 }
427 HashMap convertedKeys = new HashMap();
428 // Iterate through AnnotationSet and save each Annotation as XML
429 Iterator<Annotation> iterator = anAnnotationSet.iterator();
430 while(iterator.hasNext()) {
431 Annotation annot = iterator.next();
432 buffer.append("<Annotation Id=\"");
433 buffer.append(annot.getId());
434 buffer.append("\" Type=\"");
435 buffer.append(annot.getType());
436 buffer.append("\" StartNode=\"");
437 buffer.append(annot.getStartNode().getOffset());
438 buffer.append("\" EndNode=\"");
439 buffer.append(annot.getEndNode().getOffset());
440 buffer.append("\">\n");
441 buffer.append(featuresToXml(annot.getFeatures(),convertedKeys));
442 buffer.append("</Annotation>\n");
443 }// End while
444 buffer.append("</AnnotationSet>\n");
445 }// annotationSetToXml
446
447
448 /**
449 * This method saves an AnnotationSet as XML.
450 *
451 * @param anAnnotationSet
452 * The annotation set that has to be saved as XML.
453 * @param annotationSetNameToUse
454 * The standard annotationSetToXml(AnnotaionSet, StringBuffer) uses the name that belongs to the provided annotation set,
455 * however, this method allows one to store the provided annotation set under a different annotation set name.
456 * @return a String like this: <AnnotationSet> <Annotation>....
457 * </AnnotationSet>
458 */
459 public static void annotationSetToXml(AnnotationSet anAnnotationSet, String annotationSetNameToUse,
460 StringBuffer buffer) {
461 if(anAnnotationSet == null) {
462 buffer.append("<AnnotationSet>\n");
463 buffer.append("</AnnotationSet>\n");
464 return;
465 }// End if
466 if(annotationSetNameToUse == null || annotationSetNameToUse.trim().length() == 0)
467 buffer.append("<AnnotationSet>\n");
468 else {
469 buffer.append("<AnnotationSet Name=\"");
470 buffer.append(annotationSetNameToUse);
471 buffer.append("\" >\n");
472 }
473 HashMap convertedKeys = new HashMap();
474 // Iterate through AnnotationSet and save each Annotation as XML
475 Iterator<Annotation> iterator = anAnnotationSet.iterator();
476 while(iterator.hasNext()) {
477 Annotation annot = iterator.next();
478 buffer.append("<Annotation Id=\"");
479 buffer.append(annot.getId());
480 buffer.append("\" Type=\"");
481 buffer.append(annot.getType());
482 buffer.append("\" StartNode=\"");
483 buffer.append(annot.getStartNode().getOffset());
484 buffer.append("\" EndNode=\"");
485 buffer.append(annot.getEndNode().getOffset());
486 buffer.append("\">\n");
487 buffer.append(featuresToXml(annot.getFeatures(),convertedKeys));
488 buffer.append("</Annotation>\n");
489 }// End while
490 buffer.append("</AnnotationSet>\n");
491 }// annotationSetToXml
492
493 /**
494 * A map initialized in init() containing entities that needs to be replaced
495 * in strings
496 */
497 public static Map entitiesMap = null;
498 // Initialize the entities map use when saving as xml
499 static {
500 entitiesMap = new HashMap();
501 entitiesMap.put(new Character('<'), "<");
502 entitiesMap.put(new Character('>'), ">");
503 entitiesMap.put(new Character('&'), "&");
504 entitiesMap.put(new Character('\''), "'");
505 entitiesMap.put(new Character('"'), """);
506 entitiesMap.put(new Character((char)160), " ");
507 entitiesMap.put(new Character((char)169), "©");
508 }// static
509 }
|