0001 /*
0002 * GateFormatXmlDocumentHandler.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Cristian URSU, 22 Nov 2000
0013 *
0014 * $Id: GateFormatXmlDocumentHandler.java 12006 2009-12-01 17:24:28Z thomas_heitz $
0015 */
0016
0017 package gate.xml;
0018
0019 import java.lang.reflect.Constructor;
0020 import java.util.*;
0021
0022 import org.xml.sax.*;
0023 import org.xml.sax.helpers.DefaultHandler;
0024
0025 import gate.*;
0026 import gate.corpora.DocumentContentImpl;
0027 import gate.corpora.DocumentImpl;
0028 import gate.event.StatusListener;
0029 import gate.util.*;
0030
0031 /**
0032 * Implements the behaviour of the XML reader. This is the reader for
0033 * Gate Xml documents saved with DocumentImplementation.toXml() method.
0034 *
0035 * @deprecated GATE format XML documents are now handled by
0036 * {@link gate.corpora.DocumentStaxUtils}.
0037 */
0038 public class GateFormatXmlDocumentHandler extends DefaultHandler {
/** Debug flag; not referenced by the code visible in this class. */
private static final boolean DEBUG = false;

/**
 * Accumulates the text reported by consecutive {@link #characters}
 * calls. The buffered text is handed to {@link #charactersAction} as a
 * whole when the next start or end tag is encountered.
 */
private StringBuffer contentBuffer = new StringBuffer("");

/**
 * True while {@link #contentBuffer} holds text that has not yet been
 * passed to {@link #charactersAction}.
 */
private boolean readCharacterStatus = false;

/**
 * An OLD GATE XML format is the one in which annotation IDs are not
 * present.
 */
private static final int OLD = 1;

/**
 * A NEW GATE XML format is the one in which annotation IDs are
 * present.
 */
private static final int NEW = 2;

/**
 * This value signifies that the format of the document being read has
 * not been determined yet; it can be either OLD or NEW.
 */
private static final int UNDEFINED = 0;

/**
 * In the beginning we don't know the type of GATE XML format that we
 * read. We need to be able to read both types, but not a mixture of
 * them. The value is fixed by the first annotation added to a set.
 */
private int gateXmlFormatType = UNDEFINED;

/**
 * A Set recording every annotation ID read from the XML file. It is
 * used to check that annotation IDs are unique. At the end of the
 * document the maximum ID is needed in order to set the annotation ID
 * generator on the document, which is why a sorted TreeSet is used.
 * Set to null by endDocument() once the maximum has been consumed.
 */
private TreeSet annotationIdSet = new TreeSet();

/**
 * Cache of already loaded Class objects, keyed by class name (String).
 * It saves looking up the same feature class again for every feature
 * that is read.
 */
private Map classCache = new HashMap();
0089
/**
 * Creates a handler that fills in the given GATE document.
 *
 * @param aDocument the document whose content, features and annotations
 *          will be set from the XML being parsed; its current content
 *          size is used as the initial capacity of the text buffer
 */
public GateFormatXmlDocumentHandler(gate.Document aDocument) {
  // Pre-size the plain text buffer using the current document content
  int expectedLength = aDocument.getContent().size().intValue();
  tmpDocContent = new StringBuffer(expectedLength);

  // Parsed annotations wait here until their AnnotationSet element ends
  colector = new LinkedList();

  // Until an AnnotationSet element selects another one, annotations go
  // to the set returned by getAnnotations()
  doc = aDocument;
  currentAnnotationSet = aDocument.getAnnotations();
}// GateFormatXmlDocumentHandler
0105
0106 /**
0107 * This method is called when the SAX parser encounts the beginning of
0108 * the XML document.
0109 */
0110 public void startDocument() throws org.xml.sax.SAXException {
0111 }// startDocument
0112
0113 /**
0114 * This method is called when the SAX parser encounts the end of the
0115 * XML document. Here we set the content of the gate Document to be
0116 * the one generated inside this class (tmpDocContent). After that we
0117 * use the colector to generate all the annotation reffering this new
0118 * gate document.
0119 */
0120 public void endDocument() throws org.xml.sax.SAXException {
0121
0122 // replace the document content with the one without markups
0123 doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
0124 // long docSize = doc.getContent().size().longValue();
0125
0126 // If annotations were present in the NEW GATE XML document format,
0127 // set the document generator to start from th next MAX Annot ID
0128 // value
0129 if(gateXmlFormatType == NEW && !annotationIdSet.isEmpty()) {
0130 // Because annotationIdSet is a TreeSet its elements are already
0131 // sorted.
0132 // The last element will contain the maximum value
0133 Integer maxAnnotID = (Integer)annotationIdSet.last();
0134 // Set the document generator to start from the maxAnnotID value
0135 ((DocumentImpl)doc).setNextAnnotationId(maxAnnotID.intValue() + 1);
0136 // Dispose of the annotationIdSet
0137 annotationIdSet = null;
0138 }// fi
0139
0140 // fire the status listener
0141 fireStatusChangedEvent("Total elements: " + elements);
0142
0143 }// endDocument
0144
0145 /**
0146 * This method is called when the SAX parser encounts the beginning of
0147 * an XML element.
0148 */
0149 public void startElement(String uri, String qName, String elemName,
0150 Attributes atts) throws SAXException {
0151
0152 // call characterActions
0153 if(readCharacterStatus) {
0154 readCharacterStatus = false;
0155 charactersAction(new String(contentBuffer).toCharArray(), 0,
0156 contentBuffer.length());
0157 }
0158
0159 // Inform the progress listener to fire only if no of elements
0160 // processed
0161 // so far is a multiple of ELEMENTS_RATE
0162 if((++elements % ELEMENTS_RATE) == 0)
0163 fireStatusChangedEvent("Processed elements : " + elements);
0164
0165 // Set the curent element being processed
0166 currentElementStack.add(elemName);
0167
0168 if("AnnotationSet".equals(elemName)) processAnnotationSetElement(atts);
0169
0170 if("Annotation".equals(elemName)) processAnnotationElement(atts);
0171
0172 if("Feature".equals(elemName)) processFeatureElement(atts);
0173
0174 if("Name".equals(elemName)) processNameElement(atts);
0175
0176 if("Value".equals(elemName)) processValueElement(atts);
0177
0178 if("Node".equals(elemName)) processNodeElement(atts);
0179 }// startElement
0180
0181 /**
0182 * This method is called when the SAX parser encounts the end of an
0183 * XML element.
0184 */
0185 public void endElement(String uri, String qName, String elemName)
0186 throws SAXException {
0187
0188 // call characterActions
0189 if(readCharacterStatus) {
0190 readCharacterStatus = false;
0191 charactersAction(new String(contentBuffer).toCharArray(), 0,
0192 contentBuffer.length());
0193 }
0194
0195 currentElementStack.pop();
0196 // Deal with Annotation
0197 if("Annotation".equals(elemName)) {
0198 if(currentFeatureMap == null)
0199 currentFeatureMap = Factory.newFeatureMap();
0200 currentAnnot.setFM(currentFeatureMap);
0201 colector.add(currentAnnot);
0202 // Reset current Annot and current featue map
0203 currentAnnot = null;
0204 currentFeatureMap = null;
0205 return;
0206 }// End if
0207 // Deal with Value
0208 if("Value".equals(elemName)
0209 && "Feature".equals((String)currentElementStack.peek())) {
0210 // If the Value tag was empty, then an empty string will be
0211 // created.
0212 if(currentFeatureValue == null) currentFeatureValue = "";
0213 }// End if
0214 // Deal with Feature
0215 if("Feature".equals(elemName)) {
0216 if(currentFeatureName == null) {
0217 // Cannot add the (key,value) pair to the map
0218 // One of them is null something was wrong in the XML file.
0219 throw new GateSaxException(
0220 "A feature name was empty."
0221 + "The annotation that cause it is "
0222 + currentAnnot
0223 + ".Please check the document with a text editor before trying again.");
0224 }
0225 else {
0226 if(currentFeatureMap == null) {
0227 // The XMl file was somehow altered and a start Feature wasn't
0228 // found.
0229 throw new GateSaxException(
0230 "Document not consistent. A start"
0231 + " feature element is missing. "
0232 + "The annotation that cause it is "
0233 + currentAnnot
0234 + "Please check the document with a text editor before trying again.");
0235 }// End if
0236 // Create the appropiate feature name and values
0237 // If those object cannot be created, their string
0238 // representation will
0239 // be used.
0240 currentFeatureMap.put(createFeatKey(), createFeatValue());
0241 // currentFeatureMap.put(currentFeatureName,currentFeatureValue);
0242 // Reset current key
0243 currentFeatureKeyClassName = null;
0244 currentFeatureKeyItemClassName = null;
0245 currentFeatureName = null;
0246 // Reset current value
0247 currentFeatureValueClassName = null;
0248 currentFeatureValueItemClassName = null;
0249 currentFeatureValue = null;
0250 }// End if
0251 // Reset the Name & Value pair.
0252 currentFeatureName = null;
0253 currentFeatureValue = null;
0254 return;
0255 }// End if
0256 // Deal GateDocumentFeatures
0257 if("GateDocumentFeatures".equals(elemName)) {
0258 if(currentFeatureMap == null)
0259 currentFeatureMap = Factory.newFeatureMap();
0260 doc.setFeatures(currentFeatureMap);
0261 currentFeatureMap = null;
0262 return;
0263 }// End if
0264
0265 // Deal with AnnotationSet
0266 if("AnnotationSet".equals(elemName)) {
0267 // Create and add annotations to the currentAnnotationSet
0268 Iterator iterator = colector.iterator();
0269 while(iterator.hasNext()) {
0270 AnnotationObject annot = (AnnotationObject)iterator.next();
0271 // Clear the annot from the colector
0272 iterator.remove();
0273
0274 // Create a new annotation and add it to the annotation set
0275 try {
0276
0277 // This is the result of a code-fix.The XML writter has been
0278 // modified
0279 // to serialize the annotation ID.In order to keep backward
0280 // compatibility
0281 // with previously saved documents we had to keep the old
0282 // code(where the id
0283 // is not added) in place.
0284 // If the document presents a mixture of the two formats, then
0285 // error is signaled
0286
0287 // Check if the Annotation ID is present or not
0288 if(annot.getId() == null) {
0289 // Annotation without ID. We assume the OLD format.
0290
0291 // If we previously detected a NEW format, then we have a
0292 // mixture of the two
0293 if(gateXmlFormatType == NEW)
0294 // Signal the error to the user
0295 throw new GateSaxException(
0296 "Found an annotation without ID while "
0297 + "previous annotations had one."
0298 + "The NEW GATE XML document format requires"
0299 + " all annotations to have an UNIQUE ID."
0300 + " The offending annotation was of [type="
0301 + annot.getElemName() + ", startOffset="
0302 + annot.getStart() + ", endOffset="
0303 + annot.getEnd() + "]");
0304
0305 // We are reading OLD format document
0306 gateXmlFormatType = OLD;
0307 currentAnnotationSet.add(annot.getStart(), annot.getEnd(), annot
0308 .getElemName(), annot.getFM());
0309 }
0310 else {
0311 // Annotation with ID. We assume the NEW format
0312
0313 // If we previously detected an OLD format, then it means we
0314 // have a mixture of the two
0315 if(gateXmlFormatType == OLD)
0316 // Signal the error to the user
0317 throw new GateSaxException(
0318 "Found an annotation with ID while "
0319 + "previous annotations didn't have one."
0320 + "The OLD GATE XML"
0321 + "document format requires all annotations NOT to have an ID."
0322 + " The offending annotation was of [Id="
0323 + annot.getId() + ", type=" + annot.getElemName()
0324 + ", startOffset=" + annot.getStart()
0325 + ", endOffset=" + annot.getEnd() + "]");
0326
0327 gateXmlFormatType = NEW;
0328 // Test for the unicity of the annotation ID being used
0329 // If the ID is not Unique, the method will throw an
0330 // exception
0331 testAnnotationIdUnicity(annot.getId());
0332
0333 // Add the annotation
0334 currentAnnotationSet.add(annot.getId(), annot.getStart(), annot
0335 .getEnd(), annot.getElemName(), annot.getFM());
0336 }
0337 }
0338 catch(gate.util.InvalidOffsetException e) {
0339 throw new GateSaxException(e);
0340 }// End try
0341 }// End while
0342 // The colector is empty and ready for the next AnnotationSet
0343 return;
0344 }// End if
0345
0346 }// endElement
0347
0348 /**
0349 * This method is called when the SAX parser encounts text in the XML
0350 * doc. Here we calculate the end indices for all the elements present
0351 * inside the stack and update with the new values.
0352 */
0353 public void characters(char[] text, int start, int length)
0354 throws SAXException {
0355 if(!readCharacterStatus) {
0356 contentBuffer = new StringBuffer(new String(text, start, length));
0357 }
0358 else {
0359 contentBuffer.append(new String(text, start, length));
0360 }
0361 readCharacterStatus = true;
0362 }
0363
0364 /**
0365 * This method is called when all characters between specific tags
0366 * have been read completely
0367 */
0368 public void charactersAction(char[] text, int start, int length)
0369 throws SAXException {
0370 // Create a string object based on the reported text
0371 String content = new String(text, start, length);
0372 if("TextWithNodes".equals((String)currentElementStack.peek())) {
0373 processTextOfTextWithNodesElement(content);
0374 return;
0375 }// End if
0376 if("Name".equals((String)currentElementStack.peek())) {
0377 processTextOfNameElement(content);
0378 return;
0379 }// End if
0380 if("Value".equals((String)currentElementStack.peek())) {
0381 // if (currentFeatureName != null &&
0382 // "string".equals(currentFeatureName) &&
0383 // currentAnnot!= null &&
0384 // "Token".equals(currentAnnot.getElemName()) &&
0385 // currentAnnot.getEnd().longValue() == 1063)
0386 // System.out.println("Content=" + content + " start="+ start + "
0387 // length=" + length);
0388 processTextOfValueElement(content);
0389 return;
0390 }// End if
0391 }// characters
0392
0393 /**
0394 * This method is called when the SAX parser encounts white spaces
0395 */
0396 public void ignorableWhitespace(char ch[], int start, int length)
0397 throws SAXException {
0398 }// ignorableWhitespace
0399
0400 /**
0401 * Error method.We deal with this exception inside SimpleErrorHandler
0402 * class
0403 */
0404 public void error(SAXParseException ex) throws SAXException {
0405 // deal with a SAXParseException
0406 // see SimpleErrorhandler class
0407 _seh.error(ex);
0408 }// error
0409
0410 /**
0411 * FatalError method.
0412 */
0413 public void fatalError(SAXParseException ex) throws SAXException {
0414 // deal with a SAXParseException
0415 // see SimpleErrorhandler class
0416 _seh.fatalError(ex);
0417 }// fatalError
0418
0419 /**
0420 * Warning method comment.
0421 */
0422 public void warning(SAXParseException ex) throws SAXException {
0423 // deal with a SAXParseException
0424 // see SimpleErrorhandler class
0425 _seh.warning(ex);
0426 }// warning
0427
0428 // Custom methods section
0429
0430 /** This method deals with a AnnotationSet element. */
0431 private void processAnnotationSetElement(Attributes atts) {
0432 if(atts != null) {
0433 for(int i = 0; i < atts.getLength(); i++) {
0434 // Extract name and value
0435 String attName = atts.getLocalName(i);
0436 String attValue = atts.getValue(i);
0437 if("Name".equals(attName))
0438 currentAnnotationSet = doc.getAnnotations(attValue);
0439 }// End for
0440 }// End if
0441 }// processAnnotationSetElement
0442
0443 /** This method deals with the start of a Name element */
0444 private void processNameElement(Attributes atts) {
0445 if(atts == null) return;
0446 currentFeatureKeyClassName = atts.getValue("className");
0447 currentFeatureKeyItemClassName = atts.getValue("itemClassName");
0448 }// End processNameElement();
0449
0450 /** This method deals with the start of a Value element */
0451 private void processValueElement(Attributes atts) {
0452 if(atts == null) return;
0453 currentFeatureValueClassName = atts.getValue("className");
0454 currentFeatureValueItemClassName = atts.getValue("itemClassName");
0455 }// End processValueElement();
0456
0457 /** This method deals with a Annotation element. */
0458 private void processAnnotationElement(Attributes atts) {
0459 if(atts != null) {
0460 currentAnnot = new AnnotationObject();
0461 for(int i = 0; i < atts.getLength(); i++) {
0462 // Extract name and value
0463 String attName = atts.getLocalName(i);
0464 String attValue = atts.getValue(i);
0465
0466 if("Id".equals(attName)) currentAnnot.setId(new Integer(attValue));
0467
0468 if("Type".equals(attName)) currentAnnot.setElemName(attValue);
0469
0470 try {
0471 if("StartNode".equals(attName)) {
0472 Integer id = new Integer(attValue);
0473 Long offset = (Long)id2Offset.get(id);
0474 if(offset == null) {
0475 throw new GateRuntimeException("Couldn't found Node with id = "
0476 + id + ".It was specified in annot " + currentAnnot
0477 + " as a start node!"
0478 + "Check the document with a text editor or something"
0479 + " before trying again.");
0480
0481 }
0482 else currentAnnot.setStart(offset);
0483 }// Endif
0484 if("EndNode".equals(attName)) {
0485 Integer id = new Integer(attValue);
0486 Long offset = (Long)id2Offset.get(id);
0487 if(offset == null) {
0488 throw new GateRuntimeException("Couldn't found Node with id = "
0489 + id + ".It was specified in annot " + currentAnnot
0490 + " as a end node!"
0491 + "Check the document with a text editor or something"
0492 + " before trying again.");
0493 }
0494 else currentAnnot.setEnd(offset);
0495 }// End if
0496 }
0497 catch(NumberFormatException e) {
0498 throw new GateRuntimeException("Offsets problems.Couldn't create"
0499 + " Integers from" + " id[" + attValue + "]) in annot "
0500 + currentAnnot
0501 + "Check the document with a text editor or something,"
0502 + " before trying again");
0503 }// End try
0504 }// End For
0505 }// End if
0506 }// processAnnotationElement
0507
0508 /** This method deals with a Features element. */
0509 private void processFeatureElement(Attributes atts) {
0510 // The first time feature is calle it will create a features map.
0511 if(currentFeatureMap == null) currentFeatureMap = Factory.newFeatureMap();
0512 }// processFeatureElement
0513
0514 /** This method deals with a Node element. */
0515 private void processNodeElement(Attributes atts) {
0516 if(atts != null) {
0517 for(int i = 0; i < atts.getLength(); i++) {
0518 // Extract name and value
0519 String attName = atts.getLocalName(i);
0520 String attValue = atts.getValue(i);
0521 // System.out.println("Node : " + attName + "=" +attValue);
0522 if("id".equals(attName)) {
0523 try {
0524 Integer id = new Integer(attValue);
0525 id2Offset.put(id, new Long(tmpDocContent.length()));
0526 }
0527 catch(NumberFormatException e) {
0528 throw new GateRuntimeException("Coudn't create a node from "
0529 + attValue + " Expected an integer.");
0530 }// End try
0531 }// End if
0532 }// End for
0533 }// End if
0534 }// processNodeElement();
0535
0536 /** This method deals with a Text belonging to TextWithNodes element. */
0537 private void processTextOfTextWithNodesElement(String text) {
0538 text = recoverNewLineSequence(text);
0539 tmpDocContent.append(text);
0540 }// processTextOfTextWithNodesElement
0541
0542 /** Restore new line as in the original document if needed */
0543 private String recoverNewLineSequence(String text) {
0544 String result = text;
0545
0546 // check for new line
0547 if(text.indexOf('\n') != -1) {
0548 String newLineType = (String)doc.getFeatures().get(
0549 GateConstants.DOCUMENT_NEW_LINE_TYPE);
0550
0551 if("LF".equalsIgnoreCase(newLineType)) {
0552 newLineType = null;
0553 }
0554
0555 // exit with the same text if the change isn't necessary
0556 if(newLineType == null) return result;
0557
0558 String newLine = "\n";
0559 if("CRLF".equalsIgnoreCase(newLineType)) {
0560 newLine = "\r\n";
0561 }
0562 if("CR".equalsIgnoreCase(newLineType)) {
0563 newLine = "\r";
0564 }
0565 if("LFCR".equalsIgnoreCase(newLineType)) {
0566 newLine = "\n\r";
0567 }
0568
0569 StringBuffer buff = new StringBuffer(text);
0570 int index = text.lastIndexOf('\n');
0571 while(index != -1) {
0572 buff.replace(index, index + 1, newLine);
0573 index = text.lastIndexOf('\n', index - 1);
0574 } // while
0575 result = buff.toString();
0576 } // if
0577
0578 return result;
0579 } // recoverNewLineSequence(String text)
0580
0581 /** This method deals with a Text belonging to Name element. */
0582 private void processTextOfNameElement(String text) throws GateSaxException {
0583 if(currentFeatureMap == null)
0584 throw new GateSaxException(
0585 "GATE xml format processing error:"
0586 + " Found a Name element that is not enclosed into a Feature one while"
0587 + " analyzing the annotation "
0588 + currentAnnot
0589 + "Please check the document with a text editor or something before"
0590 + " trying again.");
0591 else {
0592 // In the entities case, characters() gets called separately for
0593 // each
0594 // entity so the text needs to be appended.
0595 if(currentFeatureName == null)
0596 currentFeatureName = text;
0597 else currentFeatureName = currentFeatureName + text;
0598 }// End If
0599 }// processTextOfNameElement();
0600
0601 /** This method deals with a Text belonging to Value element. */
0602 private void processTextOfValueElement(String text) throws GateSaxException {
0603 if(currentFeatureMap == null)
0604 throw new GateSaxException(
0605 "GATE xml format processing error:"
0606 + " Found a Value element that is not enclosed into a Feature one while"
0607 + " analyzing the annotation "
0608 + currentAnnot
0609 + "Please check the document with a text editor or something before"
0610 + " trying again.");
0611 else {
0612 // In the entities case, characters() gets called separately for
0613 // each
0614 // entity so the text needs to be appended.
0615 if(currentFeatureValue == null)
0616 currentFeatureValue = text;
0617 else currentFeatureValue = currentFeatureValue + text;
0618 }// End If
0619 }// processTextOfValueElement();
0620
0621 /**
0622 * Creates a feature key using this information:
0623 * currentFeatureKeyClassName, currentFeatureKeyItemClassName,
0624 * currentFeatureName. See createFeatObject() method for more details.
0625 */
0626 private Object createFeatKey() {
0627 return createFeatObject(currentFeatureKeyClassName,
0628 currentFeatureKeyItemClassName, currentFeatureName);
0629 }// createFeatKey()
0630
0631 /**
0632 * Creates a feature value using this information:
0633 * currentFeatureValueClassName, currentFeatureValueItemClassName,
0634 * currentFeatureValue. See createFeatObject() method for more
0635 * details.
0636 */
0637 private Object createFeatValue() {
0638 return createFeatObject(currentFeatureValueClassName,
0639 currentFeatureValueItemClassName, currentFeatureValue);
0640 }// createFeatValue()
0641
0642 /**
0643 * This method tries to reconstruct an object given its class name and
0644 * its string representation. If the object is a Collection then the
0645 * items from its string representation must be separated by a ";". In
0646 * that case, the currentFeatureValueItemClassName is used to create
0647 * items belonging to this class.
0648 *
0649 * @param aFeatClassName represents the name of the class of the feat
0650 * object being created. If it is null then the
0651 * javaLang.String will be used as default.
0652 * @param aFeatItemClassName is it used only if aFeatClassName is a
0653 * collection.If it is null then java.lang.String will be
0654 * used as default;
0655 * @param aFeatStringRepresentation sais it all
0656 * @return an Object created from aFeatClassName and its
0657 * aFeatStringRepresentation. If not possible, then
0658 * aFeatStringRepresentation is returned.
0659 * @throws GateRuntimeException If it can't create an item, that does
0660 * not comply with its class definition, to add to the
0661 * collection.
0662 */
0663 private Object createFeatObject(String aFeatClassName,
0664 String aFeatItemClassName, String aFeatStringRepresentation) {
0665 // If the string rep is null then the object will be null;
0666 if(aFeatStringRepresentation == null) return null;
0667 if(aFeatClassName == null) aFeatClassName = "java.lang.String";
0668 if(aFeatItemClassName == null) aFeatItemClassName = "java.lang.String";
0669 Class currentFeatClass = null;
0670 // look in the cache for existing
0671 // Class objects instead of recreating them
0672 currentFeatClass = (Class)classCache.get(aFeatClassName);
0673 if(currentFeatClass == null) {
0674 try {
0675 currentFeatClass = Gate.getClassLoader().loadClass(aFeatClassName);
0676 }
0677 catch(ClassNotFoundException cnfex) {
0678 return aFeatStringRepresentation;
0679 }// End try
0680 classCache.put(aFeatClassName, currentFeatClass);
0681 }
0682 if(java.util.Collection.class.isAssignableFrom(currentFeatClass)) {
0683 Class itemClass = null;
0684 Collection featObject = null;
0685 try {
0686 featObject = (Collection)currentFeatClass.newInstance();
0687 try {
0688 itemClass = Gate.getClassLoader().loadClass(aFeatItemClassName);
0689 }
0690 catch(ClassNotFoundException cnfex) {
0691 Out
0692 .prln("Warning: Item class " + aFeatItemClassName
0693 + " not found."
0694 + "Adding items as Strings to the feature called \""
0695 + currentFeatureName + "\" in the annotation "
0696 + currentAnnot);
0697 itemClass = java.lang.String.class;
0698 }// End try
0699 // Let's detect if itemClass takes a constructor with a String
0700 // as param
0701 Class[] paramsArray = new Class[1];
0702 paramsArray[0] = java.lang.String.class;
0703 Constructor itemConstructor = null;
0704 boolean addItemAsString = false;
0705 try {
0706 itemConstructor = itemClass.getConstructor(paramsArray);
0707 }
0708 catch(NoSuchMethodException nsme) {
0709 addItemAsString = true;
0710 }
0711 catch(SecurityException se) {
0712 addItemAsString = true;
0713 }// End try
0714 StringTokenizer strTok = new StringTokenizer(aFeatStringRepresentation,
0715 ";");
0716 Object[] params = new Object[1];
0717 Object itemObj = null;
0718 while(strTok.hasMoreTokens()) {
0719 String itemStrRep = strTok.nextToken();
0720 if(addItemAsString)
0721 featObject.add(itemStrRep);
0722 else {
0723 params[0] = itemStrRep;
0724 try {
0725 itemObj = itemConstructor.newInstance(params);
0726 }
0727 catch(Exception e) {
0728 throw new GateRuntimeException("An item(" + itemStrRep
0729 + ") does not comply with its class" + " definition("
0730 + aFeatItemClassName + ").Happened while tried to"
0731 + " add feature: " + aFeatStringRepresentation
0732 + " to the annotation " + currentAnnot);
0733 }// End try
0734 featObject.add(itemObj);
0735 }// End if
0736 }// End while
0737 }
0738 catch(InstantiationException instex) {
0739 return aFeatStringRepresentation;
0740 }
0741 catch(IllegalAccessException iae) {
0742 return aFeatStringRepresentation;
0743 }// End try
0744 return featObject;
0745 }// End if
0746 // If currentfeatClass is not a Collection,test to see if
0747 // it has a constructor that takes a String as param
0748 Class[] params = new Class[1];
0749 params[0] = java.lang.String.class;
0750 try {
0751 Constructor featConstr = currentFeatClass.getConstructor(params);
0752 Object[] featConstrParams = new Object[1];
0753 featConstrParams[0] = aFeatStringRepresentation;
0754 Object featObject = featConstr.newInstance(featConstrParams);
0755 return featObject;
0756 }
0757 catch(Exception e) {
0758 return aFeatStringRepresentation;
0759 }// End try
0760 }// createFeatObject()
0761
0762 /**
0763 * This method tests if the Annotation ID has been used previously (in
0764 * which case will rase an exception) and also adds the ID being
0765 * tested to the annotationIdSet
0766 *
0767 * @param anAnnotId An Integer representing an annotation ID to be
0768 * tested
0769 * @throws GateSaxException if there is already an annotation wit the
0770 * same ID
0771 */
0772 private void testAnnotationIdUnicity(Integer anAnnotId)
0773 throws GateSaxException {
0774
0775 if(annotationIdSet.contains(anAnnotId))
0776 throw new GateSaxException("Found two or possibly more annotations with"
0777 + " the same ID! The offending ID was " + anAnnotId);
0778 else annotationIdSet.add(anAnnotId);
0779 }// End of testAnnotationIdUnicity()
0780
0781 /**
0782 * This method is called when the SAX parser encounts a comment It
0783 * works only if the XmlDocumentHandler implements a
0784 * com.sun.parser.LexicalEventListener
0785 */
0786 public void comment(String text) throws SAXException {
0787 }// comment
0788
0789 /**
0790 * This method is called when the SAX parser encounts a start of a
0791 * CDATA section It works only if the XmlDocumentHandler implements a
0792 * com.sun.parser.LexicalEventListener
0793 */
0794 public void startCDATA() throws SAXException {
0795 }// startCDATA
0796
0797 /**
0798 * This method is called when the SAX parser encounts the end of a
0799 * CDATA section. It works only if the XmlDocumentHandler implements a
0800 * com.sun.parser.LexicalEventListener
0801 */
0802 public void endCDATA() throws SAXException {
0803 }// endCDATA
0804
0805 /**
0806 * This method is called when the SAX parser encounts a parsed Entity
0807 * It works only if the XmlDocumentHandler implements a
0808 * com.sun.parser.LexicalEventListener
0809 */
0810 public void startParsedEntity(String name) throws SAXException {
0811 }// startParsedEntity
0812
0813 /**
0814 * This method is called when the SAX parser encounts a parsed entity
0815 * and informs the application if that entity was parsed or not It's
0816 * working only if the CustomDocumentHandler implements a
0817 * com.sun.parser.LexicalEventListener
0818 */
0819 public void endParsedEntity(String name, boolean included)
0820 throws SAXException {
0821 }// endParsedEntity
0822
0823 // StatusReporter Implementation
0824
0825 /**
0826 * This methos is called when a listener is registered with this class
0827 */
0828 public void addStatusListener(StatusListener listener) {
0829 myStatusListeners.add(listener);
0830 }// addStatusListener
0831
0832 /**
0833 * This methos is called when a listener is removed
0834 */
0835 public void removeStatusListener(StatusListener listener) {
0836 myStatusListeners.remove(listener);
0837 }// removeStatusListener
0838
0839 /**
0840 * This methos is called whenever we need to inform the listener about
0841 * an event.
0842 */
0843 protected void fireStatusChangedEvent(String text) {
0844 Iterator listenersIter = myStatusListeners.iterator();
0845 while(listenersIter.hasNext())
0846 ((StatusListener)listenersIter.next()).statusChanged(text);
0847 }// fireStatusChangedEvent
0848
0849 // XmlDocumentHandler member data
0850
/**
 * This constant indicates how often the status listeners are fired.
 * Notifying listeners adds overhead, so a progress message is sent
 * only once every ELEMENTS_RATE processed elements.
 */
final static int ELEMENTS_RATE = 128;

/** This object decides what to do when the parser reports a problem */
private SimpleErrorHandler _seh = new SimpleErrorHandler();

/** The text content of the XML document, without any tags */
private StringBuffer tmpDocContent = new StringBuffer("");

/** The GATE document being populated */
private gate.Document doc = null;

/** Listeners for status reports */
protected List myStatusListeners = new LinkedList();

/**
 * The number of elements that have been processed so far
 */
private int elements = 0;

/**
 * A collection holding the AnnotationObjects that were read and still
 * have to be turned into annotations on the GATE document. At the end
 * of every AnnotationSet element the objects in the colector are
 * added to the current annotation set and removed from the colector.
 */
private List colector = null;

/**
 * Maps node ids (Integer) to their offset (Long) in the document text.
 * Those offsets are used when creating annotations.
 */
private Map id2Offset = new TreeMap();

/**
 * The names of the elements that are currently open; the innermost
 * one is on top.
 */
private Stack currentElementStack = new Stack();

/**
 * Models the annotation currently being read. It is created when an
 * Annotation element starts and reset to null when that element ends.
 */
private AnnotationObject currentAnnot = null;

/** A map holding the features currently being collected */
private FeatureMap currentFeatureMap = null;

/** The key of the current feature, as text */
private String currentFeatureName = null;

/** The value of the current feature, as text */
private String currentFeatureValue = null;

/** The class name of the key in the current feature */
private String currentFeatureKeyClassName = null;

/**
 * If the key is a collection then we need to know the class name of
 * the items present in this collection. The next field holds just
 * that.
 */
private String currentFeatureKeyItemClassName = null;

/** The class name of the value in the current feature */
private String currentFeatureValueClassName = null;

/**
 * If the value is a collection then we need to know the class name of
 * the items present in this collection. The next field holds just
 * that.
 */
private String currentFeatureValueItemClassName = null;

/**
 * The annotation set that is currently being filled with annotations
 */
private AnnotationSet currentAnnotationSet = null;
0933
0934 /** An inner class modeling the information contained by an annotation. */
0935 class AnnotationObject {
0936 /** Constructor */
0937 public AnnotationObject() {
0938 }// AnnotationObject
0939
0940 /** Accesor for the annotation type modeled here as ElemName */
0941 public String getElemName() {
0942 return elemName;
0943 }// getElemName
0944
0945 /** Accesor for the feature map */
0946 public FeatureMap getFM() {
0947 return fm;
0948 }// getFM()
0949
0950 /** Accesor for the start ofset */
0951 public Long getStart() {
0952 return start;
0953 }// getStart()
0954
0955 /** Accesor for the end offset */
0956 public Long getEnd() {
0957 return end;
0958 }// getEnd()
0959
0960 /** Mutator for the annotation type */
0961 public void setElemName(String anElemName) {
0962 elemName = anElemName;
0963 }// setElemName();
0964
0965 /** Mutator for the feature map */
0966 public void setFM(FeatureMap aFm) {
0967 fm = aFm;
0968 }// setFM();
0969
0970 /** Mutator for the start offset */
0971 public void setStart(Long aStart) {
0972 start = aStart;
0973 }// setStart();
0974
0975 /** Mutator for the end offset */
0976 public void setEnd(Long anEnd) {
0977 end = anEnd;
0978 }// setEnd();
0979
0980 /** Accesor for the id */
0981 public Integer getId() {
0982 return id;
0983 }// End of getId()
0984
0985 /** Mutator for the id */
0986 public void setId(Integer anId) {
0987 id = anId;
0988 }// End of setId()
0989
0990 public String toString() {
0991 return " [id =" + id + " type=" + elemName + " startNode=" + start
0992 + " endNode=" + end + " features=" + fm + "] ";
0993 }
0994
0995 // Data fields
0996 private String elemName = null;
0997
0998 private FeatureMap fm = null;
0999
1000 private Long start = null;
1001
1002 private Long end = null;
1003
1004 private Integer id = null;
1005 } // AnnotationObject
1006 }// GateFormatXmlDocumentHandler
|