0001 /*
0002 * DocumentStaxUtils.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Ian Roberts, 20/Jul/2006
0013 *
0014 * $Id: DocumentStaxUtils.java 13591 2011-03-31 13:25:30Z murfffi $
0015 */
0016 package gate.corpora;
0017
0018 import java.io.BufferedWriter;
0019 import java.io.File;
0020 import java.io.FileOutputStream;
0021 import java.io.IOException;
0022 import java.io.InputStream;
0023 import java.io.OutputStream;
0024 import java.io.OutputStreamWriter;
0025 import java.io.StringWriter;
0026 import java.lang.reflect.Constructor;
0027 import java.util.ArrayList;
0028 import java.util.Collection;
0029 import java.util.Collections;
0030 import java.util.Comparator;
0031 import java.util.HashMap;
0032 import java.util.Iterator;
0033 import java.util.List;
0034 import java.util.Map;
0035 import java.util.Set;
0036 import java.util.SortedSet;
0037 import java.util.StringTokenizer;
0038 import java.util.TreeSet;
0039 import java.util.regex.Matcher;
0040 import java.util.regex.Pattern;
0041
0042 import javax.xml.stream.XMLInputFactory;
0043 import javax.xml.stream.XMLOutputFactory;
0044 import javax.xml.stream.XMLStreamConstants;
0045 import javax.xml.stream.XMLStreamException;
0046 import javax.xml.stream.XMLStreamReader;
0047 import javax.xml.stream.XMLStreamWriter;
0048
0049 import gate.Annotation;
0050 import gate.AnnotationSet;
0051 import gate.Document;
0052 import gate.DocumentContent;
0053 import gate.Factory;
0054 import gate.FeatureMap;
0055 import gate.Gate;
0056 import gate.TextualDocument;
0057 import gate.event.StatusListener;
0058 import gate.util.GateException;
0059 import gate.util.GateRuntimeException;
0060 import gate.util.InvalidOffsetException;
0061 import gate.util.Out;
0062
0063 /**
0064 * This class provides support for reading and writing GATE XML format
0065 * using StAX (the Streaming API for XML).
0066 */
0067 public class DocumentStaxUtils {
0068
/**
 * Shared StAX input factory. It starts out as <code>null</code> and is
 * created on first use by {@link #readXces(InputStream, AnnotationSet)};
 * that lazy initialisation is not synchronized.
 */
private static XMLInputFactory inputFactory = null;
0070
/**
 * The char (a single space) used to replace characters in text
 * content that are illegal in XML.
 */
public static final char INVALID_CHARACTER_REPLACEMENT = ' ';
0076
/**
 * The number of &lt; signs after which we encode a string using CDATA
 * rather than writeCharacters.
 */
public static final int LT_THRESHOLD = 5;
0082
0083 /**
0084 * Reads GATE XML format data from the given XMLStreamReader and puts
0085 * the content and annotation sets into the given Document, replacing
0086 * its current content. The reader must be positioned on the opening
0087 * GateDocument tag (i.e. the last event was a START_ELEMENT for which
0088 * getLocalName returns "GateDocument"), and when the method returns
0089 * the reader will be left positioned on the corresponding closing
0090 * tag.
0091 *
0092 * @param xsr the source of the XML to parse
0093 * @param doc the document to update
0094 * @throws XMLStreamException
0095 */
0096 public static void readGateXmlDocument(XMLStreamReader xsr, Document doc)
0097 throws XMLStreamException {
0098 readGateXmlDocument(xsr, doc, null);
0099 }
0100
0101 /**
0102 * Reads GATE XML format data from the given XMLStreamReader and puts
0103 * the content and annotation sets into the given Document, replacing
0104 * its current content. The reader must be positioned on the opening
0105 * GateDocument tag (i.e. the last event was a START_ELEMENT for which
0106 * getLocalName returns "GateDocument"), and when the method returns
0107 * the reader will be left positioned on the corresponding closing
0108 * tag.
0109 *
0110 * @param xsr the source of the XML to parse
0111 * @param doc the document to update
0112 * @param statusListener optional status listener to receive status
0113 * messages
0114 * @throws XMLStreamException
0115 */
0116 public static void readGateXmlDocument(XMLStreamReader xsr, Document doc,
0117 StatusListener statusListener) throws XMLStreamException {
0118 DocumentContent savedContent = null;
0119
0120 // check the precondition
0121 xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocument");
0122
0123 // process the document features
0124 xsr.nextTag();
0125 xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocumentFeatures");
0126
0127 if(statusListener != null) {
0128 statusListener.statusChanged("Reading document features");
0129 }
0130 FeatureMap documentFeatures = readFeatureMap(xsr);
0131
0132 // read document text, building the map of node IDs to offsets
0133 xsr.nextTag();
0134 xsr.require(XMLStreamConstants.START_ELEMENT, null, "TextWithNodes");
0135
0136 Map<Integer, Long> nodeIdToOffsetMap = new HashMap<Integer, Long>();
0137 if(statusListener != null) {
0138 statusListener.statusChanged("Reading document content");
0139 }
0140 String documentText = readTextWithNodes(xsr, nodeIdToOffsetMap);
0141
0142 // save the content, in case anything goes wrong later
0143 savedContent = doc.getContent();
0144 // set the document content to the text with nodes text.
0145 doc.setContent(new DocumentContentImpl(documentText));
0146
0147 try {
0148 int numAnnots = 0;
0149 // process annotation sets, using the node map built above
0150 Integer maxAnnotId = null;
0151 // initially, we don't know whether annotation IDs are required or
0152 // not
0153 Boolean requireAnnotationIds = null;
0154 int eventType = xsr.nextTag();
0155 while(eventType == XMLStreamConstants.START_ELEMENT) {
0156 xsr.require(XMLStreamConstants.START_ELEMENT, null, "AnnotationSet");
0157 String annotationSetName = xsr.getAttributeValue(null, "Name");
0158 AnnotationSet annotationSet = null;
0159 if(annotationSetName == null) {
0160 if(statusListener != null) {
0161 statusListener.statusChanged("Reading default annotation set");
0162 }
0163 annotationSet = doc.getAnnotations();
0164 }
0165 else {
0166 if(statusListener != null) {
0167 statusListener.statusChanged("Reading \"" + annotationSetName
0168 + "\" annotation set");
0169 }
0170 annotationSet = doc.getAnnotations(annotationSetName);
0171 }
0172 annotationSet.clear();
0173 SortedSet<Integer> annotIdsInSet = new TreeSet<Integer>();
0174 requireAnnotationIds = readAnnotationSet(xsr, annotationSet,
0175 nodeIdToOffsetMap, annotIdsInSet, requireAnnotationIds);
0176 if(annotIdsInSet.size() > 0
0177 && (maxAnnotId == null || annotIdsInSet.last().intValue() > maxAnnotId
0178 .intValue())) {
0179 maxAnnotId = annotIdsInSet.last();
0180 }
0181 numAnnots += annotIdsInSet.size();
0182 // readAnnotationSet leaves reader positioned on the
0183 // </AnnotationSet> tag, so nextTag takes us to either the next
0184 // <AnnotationSet> or to the </GateDocument>
0185 eventType = xsr.nextTag();
0186 }
0187
0188 // check we are on the end document tag
0189 xsr.require(XMLStreamConstants.END_ELEMENT, null, "GateDocument");
0190
0191 doc.setFeatures(documentFeatures);
0192
0193 // set the ID generator, if doc is a DocumentImpl
0194 if(doc instanceof DocumentImpl && maxAnnotId != null) {
0195 ((DocumentImpl)doc).setNextAnnotationId(maxAnnotId.intValue() + 1);
0196 }
0197 if(statusListener != null) {
0198 statusListener.statusChanged("Finished. " + numAnnots
0199 + " annotation(s) processed");
0200 }
0201 }
0202 // in case of exception, reset document content to the unparsed XML
0203 catch(XMLStreamException xse) {
0204 doc.setContent(savedContent);
0205 throw xse;
0206 }
0207 catch(RuntimeException re) {
0208 doc.setContent(savedContent);
0209 throw re;
0210 }
0211 }
0212
0213 /**
0214 * Processes an AnnotationSet element from the given reader and fills
0215 * the given annotation set with the corresponding annotations. The
0216 * reader must initially be positioned on the starting AnnotationSet
0217 * tag and will be left positioned on the correspnding closing tag.
0218 *
0219 * @param xsr the reader
0220 * @param annotationSet the annotation set to fill.
0221 * @param nodeIdToOffsetMap a map mapping node IDs (Integer) to their
0222 * offsets in the text (Long). If null, we assume that the
0223 * node ids and offsets are the same (useful if parsing an
0224 * annotation set in isolation).
0225 * @param allAnnotIds a set to contain all annotation IDs specified in
0226 * the annotation set. It should initially be empty and will
0227 * be updated if any of the annotations in this set specify
0228 * an ID.
0229 * @param requireAnnotationIds whether annotations are required to
0230 * specify their IDs. If true, it is an error for an
0231 * annotation to omit the Id attribute. If false, it is an
0232 * error for the Id to be present. If null, we have not yet
0233 * determined what style of XML this is.
0234 * @return <code>requireAnnotationIds</code>. If the passed in
0235 * value was null, and we have since determined what it should
0236 * be, the updated value is returned.
0237 * @throws XMLStreamException
0238 */
0239 public static Boolean readAnnotationSet(XMLStreamReader xsr,
0240 AnnotationSet annotationSet, Map<Integer, Long> nodeIdToOffsetMap,
0241 Set<Integer> allAnnotIds, Boolean requireAnnotationIds)
0242 throws XMLStreamException {
0243 List<AnnotationObject> collectedAnnots = new ArrayList<AnnotationObject>();
0244 while(xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
0245 xsr.require(XMLStreamConstants.START_ELEMENT, null, "Annotation");
0246 AnnotationObject annObj = new AnnotationObject();
0247 annObj.setElemName(xsr.getAttributeValue(null, "Type"));
0248 try {
0249 int startNodeId = Integer.parseInt(xsr.getAttributeValue(null,
0250 "StartNode"));
0251 if(nodeIdToOffsetMap != null) {
0252 Long startOffset = nodeIdToOffsetMap.get(new Integer(startNodeId));
0253 if(startOffset != null) {
0254 annObj.setStart(startOffset);
0255 }
0256 else {
0257 throw new XMLStreamException("Invalid start node ID", xsr
0258 .getLocation());
0259 }
0260 }
0261 else {
0262 // no offset map, so just use the ID as an offset
0263 annObj.setStart(new Long(startNodeId));
0264 }
0265 }
0266 catch(NumberFormatException nfe) {
0267 throw new XMLStreamException("Non-integer value found for StartNode",
0268 xsr.getLocation());
0269 }
0270
0271 try {
0272 int endNodeId = Integer
0273 .parseInt(xsr.getAttributeValue(null, "EndNode"));
0274 if(nodeIdToOffsetMap != null) {
0275 Long endOffset = nodeIdToOffsetMap.get(new Integer(endNodeId));
0276 if(endOffset != null) {
0277 annObj.setEnd(endOffset);
0278 }
0279 else {
0280 throw new XMLStreamException("Invalid end node ID", xsr
0281 .getLocation());
0282 }
0283 }
0284 else {
0285 // no offset map, so just use the ID as an offset
0286 annObj.setEnd(new Long(endNodeId));
0287 }
0288 }
0289 catch(NumberFormatException nfe) {
0290 throw new XMLStreamException("Non-integer value found for EndNode", xsr
0291 .getLocation());
0292 }
0293
0294 String annotIdString = xsr.getAttributeValue(null, "Id");
0295 if(annotIdString == null) {
0296 if(requireAnnotationIds == null) {
0297 // if one annotation doesn't specify Id than all must
0298 requireAnnotationIds = Boolean.FALSE;
0299 }
0300 else {
0301 if(requireAnnotationIds.booleanValue()) {
0302 // if we were expecting an Id but didn't get one...
0303 throw new XMLStreamException(
0304 "New style GATE XML format requires that every annotation "
0305 + "specify its Id, but an annotation with no Id was found",
0306 xsr.getLocation());
0307 }
0308 }
0309 }
0310 else {
0311 // we have an ID attribute
0312 if(requireAnnotationIds == null) {
0313 // if one annotation specifies an Id then all must
0314 requireAnnotationIds = Boolean.TRUE;
0315 }
0316 else {
0317 if(!requireAnnotationIds.booleanValue()) {
0318 // if we were expecting not to have an Id but got one...
0319 throw new XMLStreamException(
0320 "Old style GATE XML format requires that no annotation "
0321 + "specifies its Id, but an annotation with an Id was found",
0322 xsr.getLocation());
0323 }
0324 }
0325 try {
0326 Integer annotationId = Integer.valueOf(annotIdString);
0327 if(allAnnotIds.contains(annotationId)) {
0328 throw new XMLStreamException("Annotation IDs must be unique "
0329 + "within an annotation set. Found duplicate ID", xsr
0330 .getLocation());
0331 }
0332 allAnnotIds.add(annotationId);
0333 annObj.setId(annotationId);
0334 }
0335 catch(NumberFormatException nfe) {
0336 throw new XMLStreamException("Non-integer annotation ID found", xsr
0337 .getLocation());
0338 }
0339 }
0340
0341 // get the features of this annotation
0342 annObj.setFM(readFeatureMap(xsr));
0343 // readFeatureMap leaves xsr on the </Annotation> tag
0344 collectedAnnots.add(annObj);
0345 }
0346
0347 // now process all found annotations.to add to the set
0348 Iterator<AnnotationObject> collectedAnnotsIt = collectedAnnots.iterator();
0349 while(collectedAnnotsIt.hasNext()) {
0350 AnnotationObject annObj = collectedAnnotsIt.next();
0351 try {
0352 if(annObj.getId() != null) {
0353 annotationSet.add(annObj.getId(), annObj.getStart(), annObj.getEnd(),
0354 annObj.getElemName(), annObj.getFM());
0355 }
0356 else {
0357 annotationSet.add(annObj.getStart(), annObj.getEnd(), annObj
0358 .getElemName(), annObj.getFM());
0359 }
0360 }
0361 catch(InvalidOffsetException ioe) {
0362 // really shouldn't happen, but could if we're not using an id
0363 // to offset map
0364 throw new XMLStreamException("Invalid offset when creating annotation "
0365 + annObj, ioe);
0366 }
0367 }
0368 return requireAnnotationIds;
0369 }
0370
0371 /**
0372 * Processes the TextWithNodes element from this XMLStreamReader,
0373 * returning the text content of the document. The supplied map is
0374 * updated with the offset of each Node element encountered. The
0375 * reader must be positioned on the starting TextWithNodes tag and
0376 * will be returned positioned on the corresponding closing tag.
0377 *
0378 * @param xsr
0379 * @param nodeIdToOffsetMap
0380 * @return
0381 */
0382 public static String readTextWithNodes(XMLStreamReader xsr,
0383 Map<Integer, Long> nodeIdToOffsetMap) throws XMLStreamException {
0384 StringBuffer textBuf = new StringBuffer(20480);
0385 int eventType;
0386 while((eventType = xsr.next()) != XMLStreamConstants.END_ELEMENT) {
0387 switch(eventType) {
0388 case XMLStreamConstants.CHARACTERS:
0389 textBuf.append(xsr.getTextCharacters(), xsr.getTextStart(), xsr
0390 .getTextLength());
0391 break;
0392
0393 case XMLStreamConstants.START_ELEMENT:
0394 // only Node elements allowed
0395 xsr.require(XMLStreamConstants.START_ELEMENT, null, "Node");
0396 String idString = xsr.getAttributeValue(null, "id");
0397 if(idString == null) {
0398 throw new XMLStreamException("Node element has no id", xsr
0399 .getLocation());
0400 }
0401 try {
0402 Integer id = Integer.valueOf(idString);
0403 Long offset = new Long(textBuf.length());
0404 nodeIdToOffsetMap.put(id, offset);
0405 }
0406 catch(NumberFormatException nfe) {
0407 throw new XMLStreamException("Node element must have "
0408 + "integer id", xsr.getLocation());
0409 }
0410
0411 // Node element must be empty
0412 if(xsr.next() != XMLStreamConstants.END_ELEMENT) {
0413 throw new XMLStreamException("Node element within TextWithNodes "
0414 + "must be empty.", xsr.getLocation());
0415 }
0416 break;
0417
0418 default:
0419 // do nothing - ignore comments, PIs...
0420 }
0421 }
0422 return textBuf.toString();
0423 }
0424
0425 /**
0426 * Processes a GateDocumentFeatures or Annotation element to build a
0427 * feature map. The element is expected to contain Feature children,
0428 * each with a Name and Value. The reader will be returned positioned
0429 * on the closing GateDocumentFeatures or Annotation tag.
0430 *
0431 * @param xsr
0432 * @return
0433 * @throws XMLStreamException
0434 */
0435 public static FeatureMap readFeatureMap(XMLStreamReader xsr)
0436 throws XMLStreamException {
0437 FeatureMap fm = Factory.newFeatureMap();
0438 while(xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
0439 xsr.require(XMLStreamConstants.START_ELEMENT, null, "Feature");
0440 Object featureName = null;
0441 Object featureValue = null;
0442 while(xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
0443 if("Name".equals(xsr.getLocalName())) {
0444 featureName = readFeatureNameOrValue(xsr);
0445 }
0446 else if("Value".equals(xsr.getLocalName())) {
0447 featureValue = readFeatureNameOrValue(xsr);
0448 }
0449 else {
0450 throw new XMLStreamException("Feature element should contain "
0451 + "only Name and Value children", xsr.getLocation());
0452 }
0453 }
0454 fm.put(featureName, featureValue);
0455 }
0456 return fm;
0457 }
0458
0459 /**
0460 * Read the name or value of a feature. The reader must be initially
0461 * positioned on an element with className and optional itemClassName
0462 * attributes, and text content convertable to this class. It will be
0463 * returned on the corresponding end tag.
0464 *
0465 * @param xsr the reader
0466 * @return the name or value represented by this element.
0467 * @throws XMLStreamException
0468 */
0469 static Object readFeatureNameOrValue(XMLStreamReader xsr)
0470 throws XMLStreamException {
0471 String className = xsr.getAttributeValue(null, "className");
0472 if(className == null) {
0473 className = "java.lang.String";
0474 }
0475 String itemClassName = xsr.getAttributeValue(null, "itemClassName");
0476 if(itemClassName == null) {
0477 itemClassName = "java.lang.String";
0478 }
0479 // get the string representation of the name/value
0480 StringBuffer stringRep = new StringBuffer(1024);
0481 int eventType;
0482 while((eventType = xsr.next()) != XMLStreamConstants.END_ELEMENT) {
0483 switch(eventType) {
0484 case XMLStreamConstants.CHARACTERS:
0485 stringRep.append(xsr.getTextCharacters(), xsr.getTextStart(), xsr
0486 .getTextLength());
0487 break;
0488
0489 case XMLStreamConstants.CDATA:
0490 stringRep.append(xsr.getTextCharacters(), xsr.getTextStart(), xsr
0491 .getTextLength());
0492 break;
0493
0494 case XMLStreamConstants.START_ELEMENT:
0495 throw new XMLStreamException("Elements not allowed within "
0496 + "feature name or value element.", xsr.getLocation());
0497
0498 default:
0499 // do nothing - ignore comments, PIs, etc.
0500 }
0501 }
0502
0503 // shortcut - if class name is java.lang.String, just return the
0504 // string representation directly
0505 if("java.lang.String".equals(className)) {
0506 return stringRep.toString();
0507 }
0508
0509 // otherwise, do some fancy reflection
0510 Class theClass = null;
0511 try {
0512 theClass = Class.forName(className, true, Gate.getClassLoader());
0513 }
0514 catch(ClassNotFoundException cnfe) {
0515 // give up and just return the String
0516 return stringRep.toString();
0517 }
0518
0519 if(java.util.Collection.class.isAssignableFrom(theClass)) {
0520 Class itemClass = null;
0521 Constructor itemConstructor = null;
0522 Collection featObject = null;
0523
0524 boolean addItemAsString = false;
0525
0526 // construct the collection object to use as the feature value
0527 try {
0528 featObject = (Collection)theClass.newInstance();
0529 }
0530 // if we can't instantiate the collection class at all, give up
0531 // and return the value as a string
0532 catch(IllegalAccessException iae) {
0533 return stringRep.toString();
0534 }
0535 catch(InstantiationException ie) {
0536 return stringRep.toString();
0537 }
0538
0539 // common case - itemClass *is* java.lang.String, so we can
0540 // avoid all the reflection
0541 if("java.lang.String".equals(itemClassName)) {
0542 addItemAsString = true;
0543 }
0544 else {
0545 try {
0546 itemClass = Class.forName(itemClassName, true, Gate.getClassLoader());
0547 // Let's detect if itemClass takes a constructor with a String
0548 // as param
0549 Class[] paramsArray = new Class[1];
0550 paramsArray[0] = java.lang.String.class;
0551 itemConstructor = itemClass.getConstructor(paramsArray);
0552 }
0553 catch(ClassNotFoundException cnfex) {
0554 Out.prln("Warning: Item class " + itemClassName + " not found."
0555 + "Adding items as Strings");
0556 addItemAsString = true;
0557 }
0558 catch(NoSuchMethodException nsme) {
0559 addItemAsString = true;
0560 }
0561 catch(SecurityException se) {
0562 addItemAsString = true;
0563 }// End try
0564 }
0565
0566 StringTokenizer strTok = new StringTokenizer(stringRep.toString(), ";");
0567 Object[] params = new Object[1];
0568 Object itemObj = null;
0569 while(strTok.hasMoreTokens()) {
0570 String itemStrRep = strTok.nextToken();
0571 if(addItemAsString)
0572 featObject.add(itemStrRep);
0573 else {
0574 params[0] = itemStrRep;
0575 try {
0576 itemObj = itemConstructor.newInstance(params);
0577 }
0578 catch(Exception e) {
0579 throw new XMLStreamException("An item(" + itemStrRep
0580 + ") does not comply with its class" + " definition("
0581 + itemClassName + ")", xsr.getLocation());
0582 }// End try
0583 featObject.add(itemObj);
0584 }// End if
0585 }// End while
0586
0587 return featObject;
0588 }// End if
0589
0590 // If currentfeatClass is not a Collection and not String, test to
0591 // see if it has a constructor that takes a String as param
0592 Class[] params = new Class[1];
0593 params[0] = java.lang.String.class;
0594 try {
0595 Constructor featConstr = theClass.getConstructor(params);
0596 Object[] featConstrParams = new Object[1];
0597 featConstrParams[0] = stringRep.toString();
0598 Object featObject = featConstr.newInstance(featConstrParams);
0599 return featObject;
0600 }
0601 catch(Exception e) {
0602 return stringRep.toString();
0603 }// End try
0604 }
0605
0606 // ///// Reading XCES /////
0607
0608 // constants
/**
 * Version of XCES that this class can handle.
 * NOTE(review): not referenced by the XCES reading methods - presumably
 * used when writing XCES; confirm against the writing code.
 */
public static final String XCES_VERSION = "1.0";
0613
/**
 * XCES namespace URI. The XCES reading methods require the
 * <code>cesAna</code>, <code>struct</code> and <code>feat</code>
 * elements to be in this namespace.
 */
public static final String XCES_NAMESPACE = "http://www.xces.org/schema/2003";
0618
0619 /**
0620 * Read XML data in <a href="http://www.xces.org/">XCES</a> format
0621 * from the given stream and add the corresponding annotations to the
0622 * given annotation set. This method does not close the stream, this
0623 * is the responsibility of the caller.
0624 *
0625 * @param is the input stream to read from, which will <b>not</b> be
0626 * closed before returning.
0627 * @param as the annotation set to read into.
0628 */
0629 public static void readXces(InputStream is, AnnotationSet as)
0630 throws XMLStreamException {
0631 if(inputFactory == null) {
0632 inputFactory = XMLInputFactory.newInstance();
0633 }
0634 XMLStreamReader xsr = inputFactory.createXMLStreamReader(is);
0635 try {
0636 nextTagSkipDTD(xsr);
0637 readXces(xsr, as);
0638 }
0639 finally {
0640 xsr.close();
0641 }
0642 }
0643
0644 /**
0645 * A copy of the nextTag algorithm from the XMLStreamReader javadocs,
0646 * but which also skips over DTD events as well as whitespace,
0647 * comments and PIs.
0648 *
0649 * @param xsr the reader to advance
0650 * @return {@link XMLStreamConstants#START_ELEMENT} or
0651 * {@link XMLStreamConstants#END_ELEMENT} for the next tag.
0652 * @throws XMLStreamException
0653 */
0654 private static int nextTagSkipDTD(XMLStreamReader xsr)
0655 throws XMLStreamException {
0656 int eventType = xsr.next();
0657 while((eventType == XMLStreamConstants.CHARACTERS && xsr.isWhiteSpace())
0658 || (eventType == XMLStreamConstants.CDATA && xsr.isWhiteSpace())
0659 || eventType == XMLStreamConstants.SPACE
0660 || eventType == XMLStreamConstants.PROCESSING_INSTRUCTION
0661 || eventType == XMLStreamConstants.COMMENT
0662 || eventType == XMLStreamConstants.DTD) {
0663 eventType = xsr.next();
0664 }
0665 if(eventType != XMLStreamConstants.START_ELEMENT
0666 && eventType != XMLStreamConstants.END_ELEMENT) {
0667 throw new XMLStreamException("expected start or end tag", xsr
0668 .getLocation());
0669 }
0670 return eventType;
0671 }
0672
0673 /**
0674 * Read XML data in <a href="http://www.xces.org/">XCES</a> format
0675 * from the given reader and add the corresponding annotations to the
0676 * given annotation set. The reader must be positioned on the starting
0677 * <code>cesAna</code> tag and will be left pointing to the
0678 * corresponding end tag.
0679 *
0680 * @param reader the XMLStreamReader to read from.
0681 * @param as the annotation set to read into.
0682 * @throws XMLStreamException
0683 */
0684 public static void readXces(XMLStreamReader xsr, AnnotationSet as)
0685 throws XMLStreamException {
0686 xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "cesAna");
0687
0688 // Set of all annotation IDs in this set.
0689 Set<Integer> allAnnotIds = new TreeSet<Integer>();
0690 // pre-populate with the IDs of any existing annotations in the set
0691 for(Annotation a : as) {
0692 allAnnotIds.add(a.getId());
0693 }
0694
0695 // lists to collect the annotations in before adding them to the
0696 // set. We collect the annotations that specify and ID (via
0697 // struct/@n) in one list and those that don't in another, so we can
0698 // add the identified ones first, then the others will take the next
0699 // available ID
0700 List<AnnotationObject> collectedIdentifiedAnnots = new ArrayList<AnnotationObject>();
0701 List<AnnotationObject> collectedNonIdentifiedAnnots = new ArrayList<AnnotationObject>();
0702 while(xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
0703 xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "struct");
0704 AnnotationObject annObj = new AnnotationObject();
0705 annObj.setElemName(xsr.getAttributeValue(null, "type"));
0706 try {
0707 int from = Integer.parseInt(xsr.getAttributeValue(null, "from"));
0708 annObj.setStart(new Long(from));
0709 }
0710 catch(NumberFormatException nfe) {
0711 throw new XMLStreamException(
0712 "Non-integer value found for struct/@from", xsr.getLocation());
0713 }
0714
0715 try {
0716 int to = Integer.parseInt(xsr.getAttributeValue(null, "to"));
0717 annObj.setEnd(new Long(to));
0718 }
0719 catch(NumberFormatException nfe) {
0720 throw new XMLStreamException("Non-integer value found for struct/@to",
0721 xsr.getLocation());
0722 }
0723
0724 String annotIdString = xsr.getAttributeValue(null, "n");
0725 if(annotIdString != null) {
0726 try {
0727 Integer annotationId = Integer.valueOf(annotIdString);
0728 if(allAnnotIds.contains(annotationId)) {
0729 throw new XMLStreamException("Annotation IDs must be unique "
0730 + "within an annotation set. Found duplicate ID", xsr
0731 .getLocation());
0732 }
0733 allAnnotIds.add(annotationId);
0734 annObj.setId(annotationId);
0735 }
0736 catch(NumberFormatException nfe) {
0737 throw new XMLStreamException("Non-integer annotation ID found", xsr
0738 .getLocation());
0739 }
0740 }
0741
0742 // get the features of this annotation
0743 annObj.setFM(readXcesFeatureMap(xsr));
0744 // readFeatureMap leaves xsr on the </Annotation> tag
0745 if(annObj.getId() != null) {
0746 collectedIdentifiedAnnots.add(annObj);
0747 }
0748 else {
0749 collectedNonIdentifiedAnnots.add(annObj);
0750 }
0751 }
0752
0753 // finished reading, add the annotations to the set
0754 AnnotationObject a = null;
0755 try {
0756 // first the ones that specify an ID
0757 Iterator<AnnotationObject> it = collectedIdentifiedAnnots.iterator();
0758 while(it.hasNext()) {
0759 a = it.next();
0760 as.add(a.getId(), a.getStart(), a.getEnd(), a.getElemName(), a.getFM());
0761 }
0762 // next the ones that don't
0763 it = collectedNonIdentifiedAnnots.iterator();
0764 while(it.hasNext()) {
0765 a = it.next();
0766 as.add(a.getStart(), a.getEnd(), a.getElemName(), a.getFM());
0767 }
0768 }
0769 catch(InvalidOffsetException ioe) {
0770 throw new XMLStreamException("Invalid offset when creating annotation "
0771 + a, ioe);
0772 }
0773 }
0774
0775 /**
0776 * Processes a struct element to build a feature map. The element is
0777 * expected to contain feat children, each with name and value
0778 * attributes. The reader will be returned positioned on the closing
0779 * struct tag.
0780 *
0781 * @param xsr
0782 * @return
0783 * @throws XMLStreamException
0784 */
0785 public static FeatureMap readXcesFeatureMap(XMLStreamReader xsr)
0786 throws XMLStreamException {
0787 FeatureMap fm = Factory.newFeatureMap();
0788 while(xsr.nextTag() == XMLStreamConstants.START_ELEMENT) {
0789 xsr.require(XMLStreamConstants.START_ELEMENT, XCES_NAMESPACE, "feat");
0790 String featureName = xsr.getAttributeValue(null, "name");
0791 Object featureValue = xsr.getAttributeValue(null, "value");
0792
0793 fm.put(featureName, featureValue);
0794 // read the (possibly virtual) closing tag of the feat element
0795 xsr.nextTag();
0796 xsr.require(XMLStreamConstants.END_ELEMENT, XCES_NAMESPACE, "feat");
0797 }
0798 return fm;
0799 }
0800
0801 // ////////// Writing methods ////////////
0802
/**
 * Shared StAX output factory. It starts out as <code>null</code> and is
 * created on first use by {@link #toXml(Document)} and
 * {@link #writeDocument(Document, File, String)}; that lazy
 * initialisation is not synchronized.
 */
private static XMLOutputFactory outputFactory = null;
0804
0805 /**
0806 * Returns a string containing the specified document in GATE XML
0807 * format.
0808 *
0809 * @param doc the document
0810 */
0811 public static String toXml(Document doc) {
0812 try {
0813 if(outputFactory == null) {
0814 outputFactory = XMLOutputFactory.newInstance();
0815 }
0816 StringWriter sw = new StringWriter(doc.getContent().size().intValue()
0817 * DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR);
0818 XMLStreamWriter xsw = outputFactory.createXMLStreamWriter(sw);
0819
0820 // start the document
0821 if(doc instanceof TextualDocument) {
0822 xsw.writeStartDocument(((TextualDocument)doc).getEncoding(), "1.0");
0823 }
0824 else {
0825 xsw.writeStartDocument("1.0");
0826 }
0827 newLine(xsw);
0828 writeDocument(doc, xsw, "");
0829 xsw.close();
0830
0831 return sw.toString();
0832 }
0833 catch(XMLStreamException xse) {
0834 throw new GateRuntimeException("Error converting document to XML", xse);
0835 }
0836 }
0837
0838 /**
0839 * Write the specified GATE document to a File.
0840 *
0841 * @param doc the document to write
0842 * @param file the file to write it to
0843 * @throws XMLStreamException
0844 * @throws IOException
0845 */
0846 public static void writeDocument(Document doc, File file)
0847 throws XMLStreamException, IOException {
0848 writeDocument(doc, file, "");
0849 }
0850
0851 /**
0852 * Write the specified GATE document to a File, optionally putting the
0853 * XML in a namespace.
0854 *
0855 * @param doc the document to write
0856 * @param file the file to write it to
0857 * @param namespaceURI the namespace URI to use for the XML elements.
0858 * Must not be null, but can be the empty string if no
0859 * namespace is desired.
0860 * @throws XMLStreamException
0861 * @throws IOException
0862 */
0863 public static void writeDocument(Document doc, File file, String namespaceURI)
0864 throws XMLStreamException, IOException {
0865 if(outputFactory == null) {
0866 outputFactory = XMLOutputFactory.newInstance();
0867 }
0868
0869 XMLStreamWriter xsw = null;
0870 OutputStream outputStream = new FileOutputStream(file);
0871 try {
0872 if(doc instanceof TextualDocument) {
0873 xsw = outputFactory.createXMLStreamWriter(outputStream,
0874 ((TextualDocument)doc).getEncoding());
0875 xsw.writeStartDocument(((TextualDocument)doc).getEncoding(), "1.0");
0876 }
0877 else {
0878 xsw = outputFactory.createXMLStreamWriter(outputStream);
0879 xsw.writeStartDocument("1.0");
0880 }
0881 newLine(xsw);
0882
0883 writeDocument(doc, xsw, namespaceURI);
0884 }
0885 finally {
0886 if(xsw != null) {
0887 xsw.close();
0888 }
0889 outputStream.close();
0890 }
0891 }
0892
0893 /**
0894 * Write the specified GATE Document to an XMLStreamWriter. This
0895 * method writes just the GateDocument element - the XML declaration
0896 * must be filled in by the caller if required.
0897 *
0898 * @param doc the Document to write
0899 * @param annotationSets the annotations to include. If the map
0900 * contains an entry for the key <code>null</code>, this
0901 * will be treated as the default set. All other entries are
0902 * treated as named annotation sets.
0903 * @param xsw the StAX XMLStreamWriter to use for output
0904 * @throws GateException if an error occurs during writing
0905 */
0906 public static void writeDocument(Document doc,
0907 Map<String, Collection<Annotation>> annotationSets,
0908 XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
0909 xsw.setDefaultNamespace(namespaceURI);
0910 xsw.writeStartElement(namespaceURI, "GateDocument");
0911 if(namespaceURI.length() > 0) {
0912 xsw.writeDefaultNamespace(namespaceURI);
0913 }
0914 newLine(xsw);
0915 // features
0916 xsw.writeComment(" The document's features");
0917 newLine(xsw);
0918 newLine(xsw);
0919 xsw.writeStartElement(namespaceURI, "GateDocumentFeatures");
0920 newLine(xsw);
0921 writeFeatures(doc.getFeatures(), xsw, namespaceURI);
0922 xsw.writeEndElement(); // GateDocumentFeatures
0923 newLine(xsw);
0924 // text with nodes
0925 xsw.writeComment(" The document content area with serialized nodes ");
0926 newLine(xsw);
0927 newLine(xsw);
0928 writeTextWithNodes(doc, annotationSets.values(), xsw, namespaceURI);
0929 newLine(xsw);
0930 // Serialize as XML all document's annotation sets
0931 // Serialize the default AnnotationSet
0932 StatusListener sListener = (StatusListener)gate.Gate
0933 .getListeners().get("gate.event.StatusListener");
0934 if(annotationSets.containsKey(null)) {
0935 if(sListener != null)
0936 sListener.statusChanged("Saving the default annotation set ");
0937 xsw.writeComment(" The default annotation set ");
0938 newLine(xsw);
0939 newLine(xsw);
0940 writeAnnotationSet(annotationSets.get(null), null, xsw, namespaceURI);
0941 newLine(xsw);
0942 }
0943
0944 // Serialize all others AnnotationSets
0945 // namedAnnotSets is a Map containing all other named Annotation
0946 // Sets.
0947 Iterator<String> iter = annotationSets.keySet().iterator();
0948 while(iter.hasNext()) {
0949 String annotationSetName = iter.next();
0950 // ignore the null entry, if present - we've already handled that
0951 // above
0952 if(annotationSetName != null) {
0953 Collection<Annotation> annots = annotationSets.get(annotationSetName);
0954 xsw.writeComment(" Named annotation set ");
0955 newLine(xsw);
0956 newLine(xsw);
0957 // Serialize it as XML
0958 if(sListener != null)
0959 sListener.statusChanged("Saving " + annotationSetName
0960 + " annotation set ");
0961 writeAnnotationSet(annots, annotationSetName, xsw, namespaceURI);
0962 newLine(xsw);
0963 }// End if
0964 }// End while
0965
0966 // close the GateDocument element
0967 xsw.writeEndElement();
0968 newLine(xsw);
0969 }
0970
0971 /**
0972 * Write the specified GATE Document to an XMLStreamWriter. This
0973 * method writes just the GateDocument element - the XML declaration
0974 * must be filled in by the caller if required. This method writes all
0975 * the annotations in all the annotation sets on the document. To
0976 * write just specific annotations, use
0977 * {@link #writeDocument(Document, Map, XMLStreamWriter, String)}.
0978 */
0979 public static void writeDocument(Document doc, XMLStreamWriter xsw,
0980 String namespaceURI) throws XMLStreamException {
0981 Map<String, Collection<Annotation>> asMap = new HashMap<String, Collection<Annotation>>();
0982 asMap.put(null, doc.getAnnotations());
0983 if(doc.getNamedAnnotationSets() != null) {
0984 asMap.putAll(doc.getNamedAnnotationSets());
0985 }
0986 writeDocument(doc, asMap, xsw, namespaceURI);
0987 }
0988
0989 /**
0990 * Writes the given annotation set to an XMLStreamWriter as GATE XML
0991 * format. The Name attribute of the generated AnnotationSet element
0992 * is set to the default value, i.e. <code>annotations.getName</code>.
0993 *
0994 * @param annotations the annotation set to write
0995 * @param xsw the writer to use for output
0996 * @param namespaceURI
0997 * @throws XMLStreamException
0998 */
0999 public static void writeAnnotationSet(AnnotationSet annotations,
1000 XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
1001 writeAnnotationSet((Collection)annotations, annotations.getName(), xsw,
1002 namespaceURI);
1003 }
1004
1005 /**
1006 * Writes the given annotation set to an XMLStreamWriter as GATE XML
1007 * format. The value for the Name attribute of the generated
1008 * AnnotationSet element is given by <code>asName</code>.
1009 *
1010 * @param annotations the annotation set to write
1011 * @param asName the name under which to write the annotation set.
1012 * <code>null</code> means that no name will be used.
1013 * @param xsw the writer to use for output
1014 * @param namespaceURI
1015 * @throws XMLStreamException
1016 */
1017 public static void writeAnnotationSet(Collection<Annotation> annotations,
1018 String asName, XMLStreamWriter xsw, String namespaceURI)
1019 throws XMLStreamException {
1020 xsw.writeStartElement(namespaceURI, "AnnotationSet");
1021 if(asName != null) {
1022 xsw.writeAttribute("Name", asName);
1023 }
1024 newLine(xsw);
1025
1026 if(annotations != null) {
1027 Iterator<Annotation> iterator = annotations.iterator();
1028 while(iterator.hasNext()) {
1029 Annotation annot = iterator.next();
1030 xsw.writeStartElement(namespaceURI, "Annotation");
1031 xsw.writeAttribute("Id", String.valueOf(annot.getId()));
1032 xsw.writeAttribute("Type", annot.getType());
1033 xsw.writeAttribute("StartNode", String.valueOf(annot.getStartNode()
1034 .getOffset()));
1035 xsw.writeAttribute("EndNode", String.valueOf(annot.getEndNode()
1036 .getOffset()));
1037 newLine(xsw);
1038 writeFeatures(annot.getFeatures(), xsw, namespaceURI);
1039 xsw.writeEndElement();
1040 newLine(xsw);
1041 }
1042 }
1043 // end AnnotationSet element
1044 xsw.writeEndElement();
1045 newLine(xsw);
1046 }
1047
1048 /**
1049 * Retained for binary compatibility, new code should call the
1050 * <code>Collection<Annotation></code> version instead.
1051 */
1052 public static void writeAnnotationSet(AnnotationSet annotations,
1053 String asName, XMLStreamWriter xsw, String namespaceURI)
1054 throws XMLStreamException {
1055 writeAnnotationSet((Collection)annotations, asName, xsw, namespaceURI);
1056 }
1057
1058 /**
1059 * Writes the content of the given document to an XMLStreamWriter as a
1060 * mixed content element called "TextWithNodes". At each point where
1061 * there is the start or end of an annotation in any annotation set on
1062 * the document, a "Node" element is written with an "id" feature
1063 * whose value is the offset of that node.
1064 *
1065 * @param doc the document whose content is to be written
1066 * @param annotationSets the annotations for which nodes are required.
1067 * This is a collection of collections.
1068 * @param xsw the {@link XMLStreamWriter} to write to.
1069 * @param namespaceURI the namespace URI. May be empty but may not be
1070 * null.
1071 * @throws XMLStreamException
1072 */
1073 public static void writeTextWithNodes(Document doc,
1074 Collection<Collection<Annotation>> annotationSets,
1075 XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
1076 String aText = doc.getContent().toString();
1077 // no text, so return an empty element
1078 if(aText == null) {
1079 xsw.writeEmptyElement(namespaceURI, "TextWithNodes");
1080 return;
1081 }
1082
1083 // build a set of all the offsets where Nodes are required
1084 TreeSet<Long> offsetsSet = new TreeSet<Long>();
1085 if(annotationSets != null) {
1086 for(Collection<Annotation> set : annotationSets) {
1087 if(set != null) {
1088 for(Annotation annot : set) {
1089 offsetsSet.add(annot.getStartNode().getOffset());
1090 offsetsSet.add(annot.getEndNode().getOffset());
1091 }
1092 }
1093 }
1094 }
1095
1096 // write the TextWithNodes element
1097 char[] textArray = aText.toCharArray();
1098 replaceXMLIllegalCharacters(textArray);
1099 xsw.writeStartElement(namespaceURI, "TextWithNodes");
1100 int lastNodeOffset = 0;
1101 // offsetsSet iterator is in ascending order of offset, as it is a
1102 // SortedSet
1103 Iterator<Long> offsetsIterator = offsetsSet.iterator();
1104 while(offsetsIterator.hasNext()) {
1105 int offset = offsetsIterator.next().intValue();
1106 // write characters since the last node output
1107 writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset, offset
1108 - lastNodeOffset));
1109 xsw.writeEmptyElement(namespaceURI, "Node");
1110 xsw.writeAttribute("id", String.valueOf(offset));
1111 lastNodeOffset = offset;
1112 }
1113 // write any remaining text after the last node
1114 writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset,
1115 textArray.length - lastNodeOffset));
1116 // and the closing TextWithNodes
1117 xsw.writeEndElement();
1118 }
1119
1120 /**
1121 * Write a TextWithNodes section containing nodes for all annotations
1122 * in the given document.
1123 *
1124 * @see #writeTextWithNodes(Document, Collection, XMLStreamWriter,
1125 * String)
1126 */
1127 public static void writeTextWithNodes(Document doc, XMLStreamWriter xsw,
1128 String namespaceURI) throws XMLStreamException {
1129 Collection<Collection<Annotation>> annotationSets = new ArrayList<Collection<Annotation>>();
1130 annotationSets.add(doc.getAnnotations());
1131 if(doc.getNamedAnnotationSets() != null) {
1132 annotationSets.addAll(doc.getNamedAnnotationSets().values());
1133 }
1134 writeTextWithNodes(doc, annotationSets, xsw, namespaceURI);
1135 }
1136
1137 /**
1138 * Replace any characters in the given buffer that are illegal in XML
1139 * with spaces. Characters that are illegal in XML are:
1140 * <ul>
1141 * <li>Control characters U+0000 to U+001F, <i>except</i> U+0009,
1142 * U+000A and U+000D, which are permitted.</li>
1143 * <li><i>Unpaired</i> surrogates U+D800 to U+D8FF (valid surrogate
1144 * pairs are OK).</li>
1145 * <li>U+FFFE and U+FFFF (only allowed as part of the Unicode byte
1146 * order mark).</li>
1147 * </ul>
1148 *
1149 * @param buf the buffer to process
1150 */
1151 static void replaceXMLIllegalCharacters(char[] buf) {
1152 ArrayCharSequence bufSequence = new ArrayCharSequence(buf);
1153 for(int i = 0; i < buf.length; i++) {
1154 if(isInvalidXmlChar(bufSequence, i)) {
1155 buf[i] = INVALID_CHARACTER_REPLACEMENT;
1156 }
1157 }
1158 }
1159
1160 /**
1161 * Return a string containing the same characters as the supplied
1162 * string, except that any characters that are illegal in XML will be
1163 * replaced with spaces. Characters that are illegal in XML are:
1164 * <ul>
1165 * <li>Control characters U+0000 to U+001F, <i>except</i> U+0009,
1166 * U+000A and U+000D, which are permitted.</li>
1167 * <li><i>Unpaired</i> surrogates U+D800 to U+D8FF (valid surrogate
1168 * pairs are OK).</li>
1169 * <li>U+FFFE and U+FFFF (only allowed as part of the Unicode byte
1170 * order mark).</li>
1171 * </ul>
1172 *
1173 * A new string is only created if required - if the supplied string
1174 * contains no illegal characters then the same object is returned.
1175 *
1176 * @param str the string to process
1177 * @return <code>str</code>, unless it contains illegal characters
1178 * in which case a new string the same as str but with the
1179 * illegal characters replaced by spaces.
1180 */
1181 static String replaceXMLIllegalCharactersInString(String str) {
1182 StringBuilder builder = null;
1183 for(int i = 0; i < str.length(); i++) {
1184 if(isInvalidXmlChar(str, i)) {
1185 // lazily create the StringBuilder
1186 if(builder == null) {
1187 builder = new StringBuilder(str.substring(0, i));
1188 }
1189 builder.append(INVALID_CHARACTER_REPLACEMENT);
1190 }
1191 else if(builder != null) {
1192 builder.append(str.charAt(i));
1193 }
1194 }
1195
1196 if(builder == null) {
1197 // no illegal characters were found
1198 return str;
1199 }
1200 else {
1201 return builder.toString();
1202 }
1203 }
1204
1205 /**
1206 * Check whether a character is illegal in XML.
1207 *
1208 * @param buf the character sequence in which to look (must not be
1209 * null)
1210 * @param i the index of the character to check (must be within the
1211 * valid range of characters in <code>buf</code>)
1212 */
1213 static final boolean isInvalidXmlChar(CharSequence buf, int i) {
1214 // illegal control character
1215 if(buf.charAt(i) <= 0x0008 || buf.charAt(i) == 0x000B
1216 || buf.charAt(i) == 0x000C
1217 || (buf.charAt(i) >= 0x000E && buf.charAt(i) <= 0x001F)) {
1218 return true;
1219 }
1220
1221 // buf.charAt(i) is a high surrogate...
1222 if(buf.charAt(i) >= 0xD800 && buf.charAt(i) <= 0xDBFF) {
1223 // if we're not at the end of the buffer we can look ahead
1224 if(i < buf.length() - 1) {
1225 // followed by a low surrogate is OK
1226 if(buf.charAt(i + 1) >= 0xDC00 && buf.charAt(i + 1) <= 0xDFFF) {
1227 return false;
1228 }
1229 }
1230
1231 // at the end of the buffer, or not followed by a low surrogate is
1232 // not OK.
1233 return true;
1234 }
1235
1236 // buf.charAt(i) is a low surrogate...
1237 if(buf.charAt(i) >= 0xDC00 && buf.charAt(i) <= 0xDFFF) {
1238 // if we're not at the start of the buffer we can look behind
1239 if(i > 0) {
1240 // preceded by a high surrogate is OK
1241 if(buf.charAt(i - 1) >= 0xD800 && buf.charAt(i - 1) <= 0xDBFF) {
1242 return false;
1243 }
1244 }
1245
1246 // at the start of the buffer, or not preceded by a high surrogate
1247 // is not OK
1248 return true;
1249 }
1250
1251 // buf.charAt(i) is a BOM character
1252 if(buf.charAt(i) == 0xFFFE || buf.charAt(i) == 0xFFFF) {
1253 return true;
1254 }
1255
1256 // anything else is OK
1257 return false;
1258 }
1259
1260 /**
1261 * Write a feature map to the given XMLStreamWriter. The map is output
1262 * as a sequence of "Feature" elements, each having "Name" and "Value"
1263 * children. Note that there is no enclosing element - the caller must
1264 * write the enclosing "GateDocumentFeatures" or "Annotation" element.
1265 * Characters in feature values that are illegal in XML are replaced
1266 * by {@link #INVALID_CHARACTER_REPLACEMENT} (a space). Feature
1267 * <i>names</i> are not modified - an illegal character in a feature
1268 * name will cause the serialization to fail.
1269 *
1270 * @param features
1271 * @param xsw
1272 * @param namespaceURI
1273 * @throws XMLStreamException
1274 */
1275 public static void writeFeatures(FeatureMap features, XMLStreamWriter xsw,
1276 String namespaceURI) throws XMLStreamException {
1277 if(features == null) {
1278 return;
1279 }
1280
1281 Set keySet = features.keySet();
1282 Iterator keySetIterator = keySet.iterator();
1283 while(keySetIterator.hasNext()) {
1284 Object key = keySetIterator.next();
1285 Object value = features.get(key);
1286 if(key != null && value != null) {
1287 String keyClassName = null;
1288 String keyItemClassName = null;
1289 String valueClassName = null;
1290 String valueItemClassName = null;
1291 String key2String = key.toString();
1292 String value2String = value.toString();
1293 Object item = null;
1294 // Test key if it is String, Number or Collection
1295 if(key instanceof java.lang.String || key instanceof java.lang.Number
1296 || key instanceof java.util.Collection)
1297 keyClassName = key.getClass().getName();
1298 // Test value if it is String, Number or Collection
1299 if(value instanceof java.lang.String
1300 || value instanceof java.lang.Number
1301 || value instanceof java.util.Collection)
1302 valueClassName = value.getClass().getName();
1303 // Features and values that are not Strings, Numbers or
1304 // collections
1305 // will be discarded.
1306 if(keyClassName == null || valueClassName == null) continue;
1307
1308 // If key is collection serialize the collection in a specific
1309 // format
1310 if(key instanceof java.util.Collection) {
1311 StringBuffer keyStrBuff = new StringBuffer();
1312 Iterator iter = ((Collection)key).iterator();
1313 if(iter.hasNext()) {
1314 item = iter.next();
1315 if(item instanceof java.lang.Number)
1316 keyItemClassName = item.getClass().getName();
1317 else keyItemClassName = String.class.getName();
1318 keyStrBuff.append(item.toString());
1319 }// End if
1320 while(iter.hasNext()) {
1321 item = iter.next();
1322 keyStrBuff.append(";").append(item.toString());
1323 }// End while
1324 key2String = keyStrBuff.toString();
1325 }// End if
1326
1327 // If key is collection serialize the colection in a specific
1328 // format
1329 if(value instanceof java.util.Collection) {
1330 StringBuffer valueStrBuff = new StringBuffer();
1331 Iterator iter = ((Collection)value).iterator();
1332 if(iter.hasNext()) {
1333 item = iter.next();
1334 if(item instanceof java.lang.Number)
1335 valueItemClassName = item.getClass().getName();
1336 else valueItemClassName = String.class.getName();
1337 valueStrBuff.append(item.toString());
1338 }// End if
1339 while(iter.hasNext()) {
1340 item = iter.next();
1341 valueStrBuff.append(";").append(item.toString());
1342 }// End while
1343 value2String = valueStrBuff.toString();
1344 }// End if
1345
1346 xsw.writeStartElement(namespaceURI, "Feature");
1347 xsw.writeCharacters("\n ");
1348
1349 // write the Name
1350 xsw.writeStartElement(namespaceURI, "Name");
1351 if(keyClassName != null) {
1352 xsw.writeAttribute("className", keyClassName);
1353 }
1354 if(keyItemClassName != null) {
1355 xsw.writeAttribute("itemClassName", keyItemClassName);
1356 }
1357 xsw.writeCharacters(key2String);
1358 xsw.writeEndElement();
1359 xsw.writeCharacters("\n ");
1360
1361 // write the Value
1362 xsw.writeStartElement(namespaceURI, "Value");
1363 if(valueClassName != null) {
1364 xsw.writeAttribute("className", valueClassName);
1365 }
1366 if(valueItemClassName != null) {
1367 xsw.writeAttribute("itemClassName", valueItemClassName);
1368 }
1369 writeCharactersOrCDATA(xsw,
1370 replaceXMLIllegalCharactersInString(value2String));
1371 xsw.writeEndElement();
1372 newLine(xsw);
1373
1374 // close the Feature element
1375 xsw.writeEndElement();
1376 newLine(xsw);
1377 }
1378 }
1379 }
1380
1381 /**
1382 * Convenience method to write a single new line to the given writer.
1383 *
1384 * @param xsw the XMLStreamWriter to write to.
1385 * @throws XMLStreamException
1386 */
1387 static void newLine(XMLStreamWriter xsw) throws XMLStreamException {
1388 xsw.writeCharacters("\n");
1389 }
1390
1391 /**
1392 * The regular expression pattern that will match the end of a CDATA
1393 * section.
1394 */
1395 private static Pattern CDATA_END_PATTERN = Pattern.compile("\\]\\]>");
1396
1397 /**
1398 * Write the given string to the given writer, using either
1399 * writeCharacters or, if there are more than a few less than signs in
1400 * the string (e.g. if it is an XML fragment itself), write it with
1401 * writeCData. This method properly handles the case where the string
1402 * contains other CDATA sections - as a CDATA section cannot contain
1403 * the CDATA end marker <code>]]></code>, we split the output CDATA
1404 * at any occurrences of this marker and write the marker using a
1405 * normal writeCharacters call in between.
1406 *
1407 * @param xsw the writer to write to
1408 * @param string the string to write
1409 * @throws XMLStreamException
1410 */
1411 static void writeCharactersOrCDATA(XMLStreamWriter xsw, String string)
1412 throws XMLStreamException {
1413 if(containsEnoughLTs(string)) {
1414 Matcher m = CDATA_END_PATTERN.matcher(string);
1415 int startFrom = 0;
1416 while(m.find()) {
1417 // we found a CDATA end marker, so write everything up to the
1418 // marker as CDATA...
1419 xsw.writeCData(string.substring(startFrom, m.start()));
1420 // then write the marker as characters
1421 xsw.writeCharacters("]]>");
1422 startFrom = m.end();
1423 }
1424
1425 if(startFrom == 0) {
1426 // no "]]>" in the string, the normal case
1427 xsw.writeCData(string);
1428 }
1429 else if(startFrom < string.length()) {
1430 // there is some trailing text after the last ]]>
1431 xsw.writeCData(string.substring(startFrom));
1432 }
1433 // else the last ]]> was the end of the string, so nothing more to
1434 // do.
1435 }
1436 else {
1437 // if fewer '<' characters, just writeCharacters as normal
1438 xsw.writeCharacters(string);
1439 }
1440 }
1441
1442 /**
1443 * Checks whether the given string contains at least
1444 * <code>LT_THRESHOLD</code> < characters.
1445 */
1446 private static boolean containsEnoughLTs(String string) {
1447 int numLTs = 0;
1448 int index = -1;
1449 while((index = string.indexOf('<', index + 1)) >= 0) {
1450 numLTs++;
1451 if(numLTs >= LT_THRESHOLD) {
1452 return true;
1453 }
1454 }
1455
1456 return false;
1457 }
1458
1459 // ///// Writing XCES /////
1460
1461 /**
1462 * Comparator that compares annotations based on their offsets; when
1463 * two annotations start at the same location, the longer one is
1464 * considered to come first in the ordering.
1465 */
1466 public static final Comparator<Annotation> LONGEST_FIRST_OFFSET_COMPARATOR = new Comparator<Annotation>() {
1467 public int compare(Annotation left, Annotation right) {
1468 long loffset = left.getStartNode().getOffset().longValue();
1469 long roffset = right.getStartNode().getOffset().longValue();
1470 if(loffset == roffset) {
1471 // if the start offsets are the same compare end
1472 // offsets.
1473 // the largest offset should come first
1474 loffset = left.getEndNode().getOffset().longValue();
1475 roffset = right.getEndNode().getOffset().longValue();
1476 if(loffset == roffset) {
1477 return left.getId() - right.getId();
1478 }
1479 else {
1480 return (int)(roffset - loffset);
1481 }
1482 }
1483 return (int)(loffset - roffset);
1484 }
1485 };
1486
1487 /**
1488 * Save the content of a document to the given output stream. Since
1489 * XCES content files are plain text (not XML), XML-illegal characters
1490 * are not replaced when writing. The stream is <i>not</i> closed by
1491 * this method, that is left to the caller.
1492 *
1493 * @param doc the document to save
1494 * @param out the stream to write to
1495 * @param encoding the character encoding to use. If null, defaults to
1496 * UTF-8
1497 */
1498 public static void writeXcesContent(Document doc, OutputStream out,
1499 String encoding) throws IOException {
1500 if(encoding == null) {
1501 encoding = "UTF-8";
1502 }
1503
1504 String documentContent = doc.getContent().toString();
1505
1506 OutputStreamWriter osw = new OutputStreamWriter(out, encoding);
1507 BufferedWriter writer = new BufferedWriter(osw);
1508 writer.write(documentContent);
1509 writer.flush();
1510 // do not close the writer, this would close the underlying stream,
1511 // which is something we want to leave to the caller
1512 }
1513
1514 /**
1515 * Save annotations to the given output stream in XCES format, with
1516 * their IDs included as the "n" attribute of each <code>struct</code>.
1517 * The stream is <i>not</i> closed by this method, that is left to
1518 * the caller.
1519 *
1520 * @param annotations the annotations to save, typically an
1521 * AnnotationSet
1522 * @param os the output stream to write to
1523 * @param encoding the character encoding to use.
1524 */
1525 public static void writeXcesAnnotations(Collection<Annotation> annotations,
1526 OutputStream os, String encoding) throws XMLStreamException {
1527 XMLStreamWriter xsw = null;
1528 try {
1529 if(outputFactory == null) {
1530 outputFactory = XMLOutputFactory.newInstance();
1531 }
1532 if(encoding == null) {
1533 xsw = outputFactory.createXMLStreamWriter(os);
1534 xsw.writeStartDocument();
1535 }
1536 else {
1537 xsw = outputFactory.createXMLStreamWriter(os, encoding);
1538 xsw.writeStartDocument(encoding, "1.0");
1539 }
1540 newLine(xsw);
1541 writeXcesAnnotations(annotations, xsw);
1542 }
1543 finally {
1544 if(xsw != null) {
1545 xsw.close();
1546 }
1547 }
1548 }
1549
1550 /**
1551 * Save annotations to the given XMLStreamWriter in XCES format, with
1552 * their IDs included as the "n" attribute of each <code>struct</code>.
1553 * The writer is <i>not</i> closed by this method, that is left to
1554 * the caller. This method writes just the cesAna element - the XML
1555 * declaration must be filled in by the caller if required.
1556 *
1557 * @param annotations the annotations to save, typically an
1558 * AnnotationSet
1559 * @param xsw the XMLStreamWriter to write to
1560 */
1561 public static void writeXcesAnnotations(Collection<Annotation> annotations,
1562 XMLStreamWriter xsw) throws XMLStreamException {
1563 writeXcesAnnotations(annotations, xsw, true);
1564 }
1565
1566 /**
1567 * Save annotations to the given XMLStreamWriter in XCES format. The
1568 * writer is <i>not</i> closed by this method, that is left to the
1569 * caller. This method writes just the cesAna element - the XML
1570 * declaration must be filled in by the caller if required. Characters
1571 * in feature values that are illegal in XML are replaced by
1572 * {@link #INVALID_CHARACTER_REPLACEMENT} (a space). Feature <i>names</i>
1573 * are not modified, nor are annotation types - an illegal character
1574 * in one of these will cause the serialization to fail.
1575 *
1576 * @param annotations the annotations to save, typically an
1577 * AnnotationSet
1578 * @param xsw the XMLStreamWriter to write to
1579 * @param includeId should we include the annotation IDs (as the "n"
1580 * attribute on each <code>struct</code>)?
1581 * @throws XMLStreamException
1582 */
1583 public static void writeXcesAnnotations(Collection<Annotation> annotations,
1584 XMLStreamWriter xsw, boolean includeId) throws XMLStreamException {
1585 List<Annotation> annotsToDump = new ArrayList<Annotation>(annotations);
1586 Collections.sort(annotsToDump, LONGEST_FIRST_OFFSET_COMPARATOR);
1587
1588 xsw.setDefaultNamespace(XCES_NAMESPACE);
1589 xsw.writeStartElement(XCES_NAMESPACE, "cesAna");
1590 xsw.writeDefaultNamespace(XCES_NAMESPACE);
1591 xsw.writeAttribute("version", XCES_VERSION);
1592 newLine(xsw);
1593
1594 String indent = " ";
1595 String indentMore = indent + indent;
1596
1597 for(Annotation a : annotsToDump) {
1598 long start = a.getStartNode().getOffset().longValue();
1599 long end = a.getEndNode().getOffset().longValue();
1600 FeatureMap fm = a.getFeatures();
1601 xsw.writeCharacters(indent);
1602 if(fm == null || fm.size() == 0) {
1603 xsw.writeEmptyElement(XCES_NAMESPACE, "struct");
1604 }
1605 else {
1606 xsw.writeStartElement(XCES_NAMESPACE, "struct");
1607 }
1608 xsw.writeAttribute("type", a.getType());
1609 xsw.writeAttribute("from", String.valueOf(start));
1610 xsw.writeAttribute("to", String.valueOf(end));
1611 // include the annotation ID as the "n" attribute if requested
1612 if(includeId) {
1613 xsw.writeAttribute("n", String.valueOf(a.getId()));
1614 }
1615 newLine(xsw);
1616
1617 if(fm != null && fm.size() != 0) {
1618 for(Map.Entry att : fm.entrySet()) {
1619 if(!"isEmptyAndSpan".equals(att.getKey())) {
1620 xsw.writeCharacters(indentMore);
1621 xsw.writeEmptyElement(XCES_NAMESPACE, "feat");
1622 xsw.writeAttribute("name", String.valueOf(att.getKey()));
1623 xsw.writeAttribute("value",
1624 replaceXMLIllegalCharactersInString(String.valueOf(att
1625 .getValue())));
1626 newLine(xsw);
1627 }
1628 }
1629 xsw.writeCharacters(indent);
1630 xsw.writeEndElement();
1631 newLine(xsw);
1632 }
1633 }
1634
1635 xsw.writeEndElement();
1636 newLine(xsw);
1637 }
1638
1639 /** An inner class modeling the information contained by an annotation. */
1640 static class AnnotationObject {
1641 /** Constructor */
1642 public AnnotationObject() {
1643 }// AnnotationObject
1644
1645 /** Accesor for the annotation type modeled here as ElemName */
1646 public String getElemName() {
1647 return elemName;
1648 }// getElemName
1649
1650 /** Accesor for the feature map */
1651 public FeatureMap getFM() {
1652 return fm;
1653 }// getFM()
1654
1655 /** Accesor for the start ofset */
1656 public Long getStart() {
1657 return start;
1658 }// getStart()
1659
1660 /** Accesor for the end offset */
1661 public Long getEnd() {
1662 return end;
1663 }// getEnd()
1664
1665 /** Mutator for the annotation type */
1666 public void setElemName(String anElemName) {
1667 elemName = anElemName;
1668 }// setElemName();
1669
1670 /** Mutator for the feature map */
1671 public void setFM(FeatureMap aFm) {
1672 fm = aFm;
1673 }// setFM();
1674
1675 /** Mutator for the start offset */
1676 public void setStart(Long aStart) {
1677 start = aStart;
1678 }// setStart();
1679
1680 /** Mutator for the end offset */
1681 public void setEnd(Long anEnd) {
1682 end = anEnd;
1683 }// setEnd();
1684
1685 /** Accesor for the id */
1686 public Integer getId() {
1687 return id;
1688 }// End of getId()
1689
1690 /** Mutator for the id */
1691 public void setId(Integer anId) {
1692 id = anId;
1693 }// End of setId()
1694
1695 public String toString() {
1696 return " [id =" + id + " type=" + elemName + " startNode=" + start
1697 + " endNode=" + end + " features=" + fm + "] ";
1698 }
1699
1700 // Data fields
1701 private String elemName = null;
1702
1703 private FeatureMap fm = null;
1704
1705 private Long start = null;
1706
1707 private Long end = null;
1708
1709 private Integer id = null;
1710 } // AnnotationObject
1711
1712 /**
1713 * Thin wrapper class to use a char[] as a CharSequence. The array is
1714 * not copied - changes to the array are reflected by the CharSequence
1715 * methods.
1716 */
1717 static class ArrayCharSequence implements CharSequence {
1718 char[] array;
1719
1720 ArrayCharSequence(char[] array) {
1721 this.array = array;
1722 }
1723
1724 public final char charAt(int i) {
1725 return array[i];
1726 }
1727
1728 public final int length() {
1729 return array.length;
1730 }
1731
1732 public CharSequence subSequence(int start, int end) {
1733 throw new UnsupportedOperationException("subSequence not implemented");
1734 }
1735
1736 public String toString() {
1737 return String.valueOf(array);
1738 }
1739 } // ArrayCharSequence
1740 }
|