0001 /*
0002 * DocumentImpl.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Hamish Cunningham, 11/Feb/2000
0013 *
0014 * $Id: DocumentImpl.java 13628 2011-04-06 09:22:15Z philgooch $
0015 */
0016 package gate.corpora;
0017
0018 import java.io.IOException;
0019 import java.net.URL;
0020 import java.util.*;
0021 import gate.*;
0022 import gate.annotation.AnnotationSetImpl;
0023 import gate.creole.AbstractLanguageResource;
0024 import gate.creole.ResourceInstantiationException;
0025 import gate.creole.metadata.*;
0026 import gate.event.*;
0027 import gate.util.*;
0028
0029 /**
0030 * Represents the commonalities between all sorts of documents.
0031 *
0032 * <H2>Editing</H2>
0033 *
0034 * <P>
0035 * The DocumentImpl class implements the Document interface. The
0036 * DocumentContentImpl class models the textual or audio-visual materials which
0037 * are the source and content of Documents. The AnnotationSetImpl class supplies
0038 * annotations on Documents.
0039 *
0040 * <P>
0041 * Abbreviations:
0042 *
0043 * <UL>
0044 * <LI> DC = DocumentContent
0045 * <LI> D = Document
0046 * <LI> AS = AnnotationSet
0047 * </UL>
0048 *
0049 * <P>
0050 * We add an edit method to each of these classes; for DC and AS the methods are
0051 * package private; D has the public method.
0052 *
0053 * <PRE>
0054 *
0055 * void edit(Long start, Long end, DocumentContent replacement) throws
0056 * InvalidOffsetException;
0057 *
0058 * </PRE>
0059 *
0060 * <P>
0061 * D receives edit requests and forwards them to DC and AS. On DC, this method
0062 * makes a change to the content - e.g. replacing a String range from start to
0063 * end with replacement. (Deletions are catered for by having replacement =
0064 * null.) D then calls AS.edit on each of its annotation sets.
0065 *
0066 * <P>
0067 * On AS, edit calls replacement.size() (i.e. DC.size()) to figure out how long
0068 * the replacement is (0 for null). It then considers annotations that terminate
0069 * (start or end) in the altered or deleted range as invalid; annotations that
0070 * terminate after the range have their offsets adjusted. I.e.:
0071 * <UL>
0072 * <LI> the nodes that pointed inside the old modified area are invalid now and
0073 * will be deleted along with the connected annotations;
0074 * <LI> the nodes that are before the start of the modified area remain
0075 * untouched;
0076 * <LI> the nodes that are after the end of the affected area will have the
0077 * offset changed according to the formula below.
0078 * </UL>
0079 *
0080 * <P>
0081 * A note re. AS and annotations: annotations no longer have offsets as in the
0082 * old model, they now have nodes, and nodes have offsets.
0083 *
0084 * <P>
0085 * To implement AS.edit, we have several indices:
0086 *
0087 * <PRE>
0088 *
0089 * HashMap annotsByStartNode, annotsByEndNode;
0090 *
0091 * </PRE>
0092 *
0093 * which map node ids to annotations;
0094 *
0095 * <PRE>
0096 *
0097 * RBTreeMap nodesByOffset;
0098 *
0099 * </PRE>
0100 *
0101 * which maps offset to Nodes.
0102 *
0103 * <P>
0104 * When we get an edit request, we traverse that part of the nodesByOffset tree
0105 * representing the altered or deleted range of the DC. For each node found, we
0106 * delete any annotations that terminate on the node, and then delete the node
0107 * itself. We then traverse the rest of the tree, changing the offset on all
0108 * remaining nodes by:
0109 *
0110 * <PRE>
0111 *
* newOffset = oldOffset -
*   ( (end - start) -                                      // size of mod
*     ( (replacement == null) ? 0 : replacement.size() )   // size of repl
*   );
0114 *
0115 * </PRE>
0116 *
0117 * Note that we use the same convention as e.g. java.lang.String: start offsets
0118 * are inclusive; end offsets are exclusive. I.e. for string "abcd" range 1-3 =
0119 * "bc". Examples, for a node with offset 4:
0120 *
0121 * <PRE>
0122 *
0123 * edit(1, 3, "BC"); newOffset = 4 - ( (3 - 1) - 2 ) = 4
0124 *
0125 * edit(1, 3, null); newOffset = 4 - ( (3 - 1) - 0 ) = 2
0126 *
0127 * edit(1, 3, "BBCC"); newOffset = 4 - ( (3 - 1) - 4 ) = 6
0128 *
0129 * </PRE>
0130 */
0131 @CreoleResource(name = "GATE Document", interfaceName = "gate.Document",
0132 comment = "GATE transient document.", icon = "document",
0133 helpURL = "http://gate.ac.uk/userguide/sec:developer:documents")
0134 public class DocumentImpl extends AbstractLanguageResource implements
0135 TextualDocument,
0136 CreoleListener,
0137 DatastoreListener {
0138 /** Debug flag */
0139 private static final boolean DEBUG = false;
0140
0141 /**
0142 * If you set this flag to true the original content of the document will be
0143 * kept in the document feature. <br>
0144 * Default value is false to avoid the unnecessary waste of memory
0145 */
0146 private Boolean preserveOriginalContent = new Boolean(false);
0147
0148 /**
0149 * If you set this flag to true the repositioning information for the document
0150 * will be kept in the document feature. <br>
0151 * Default value is false to avoid the unnecessary waste of time and memory
0152 */
0153 private Boolean collectRepositioningInfo = new Boolean(false);
0154
0155 /**
0156 * This is a variable which contains the latest crossed over annotation found
0157 * during export with preserving format, i.e., toXml(annotations) method.
0158 */
0159 private Annotation crossedOverAnnotation = null;
0160
0161
0162 /** Flag to determine whether to serialize namespace information held as
0163 * annotation features into namespace prefix and URI in the XML
0164 */
0165 private boolean serializeNamespaceInfo = false;
0166 /** Feature name used for namespace uri in namespaced elements */
0167 private String namespaceURIFeature = null;
0168 /** Feature name used for namespace prefix in namespaced elements */
0169 private String namespacePrefixFeature = null;
0170
0171
0172 /** Default construction. Content left empty. */
0173 public DocumentImpl() {
0174 content = new DocumentContentImpl();
0175 stringContent = "";
0176
0177 /** We will attempt to serialize namespace if
0178 * three parameters are set in the global or local config file:
0179 * ADD_NAMESPACE_FEATURES: boolean flag
0180 * ELEMENT_NAMESPACE_URI: feature name used to hold namespace uri
0181 * ELEMENT_NAMESPACE_PREFIX: feature name used to hold namespace prefix
0182 */
0183 Map configData = Gate.getUserConfig();
0184
0185 boolean addNSFeature = Boolean.parseBoolean((String)configData.get(GateConstants.ADD_NAMESPACE_FEATURES));
0186 namespaceURIFeature = (String) configData.get(GateConstants.ELEMENT_NAMESPACE_URI);
0187 namespacePrefixFeature = (String) configData.get(GateConstants.ELEMENT_NAMESPACE_PREFIX);
0188
0189 serializeNamespaceInfo = (addNSFeature && namespacePrefixFeature != null && !namespacePrefixFeature.isEmpty() && namespaceURIFeature != null && !namespaceURIFeature.isEmpty());
0190
0191 } // default construction
0192
0193 /** Cover unpredictable Features creation */
0194 public FeatureMap getFeatures() {
0195 if(features == null) {
0196 features = new SimpleFeatureMapImpl();
0197 }
0198 return features;
0199 }
0200
0201 /** Initialise this resource, and return it. */
0202 public Resource init() throws ResourceInstantiationException {
0203 // set up the source URL and create the content
0204 if(sourceUrl == null) {
0205 if(stringContent == null) { throw new ResourceInstantiationException(
0206 "The sourceURL and document's content were null."); }
0207 content = new DocumentContentImpl(stringContent);
0208 getFeatures().put("gate.SourceURL", "created from String");
0209 } else {
0210 try {
0211 content = new DocumentContentImpl(sourceUrl, getEncoding(),
0212 sourceUrlStartOffset, sourceUrlEndOffset);
0213 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
0214 } catch(IOException e) {
0215 throw new ResourceInstantiationException("DocumentImpl.init: " + e);
0216 }
0217 }
0218 if(preserveOriginalContent.booleanValue() && content != null) {
0219 String originalContent = new String(((DocumentContentImpl)content)
0220 .getOriginalContent());
0221 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
0222 originalContent);
0223 } // if
0224 // set up a DocumentFormat if markup unpacking required
0225 if(getMarkupAware().booleanValue()) {
0226 DocumentFormat docFormat = null;
0227 // if a specific MIME type has been given, use it
0228 if(this.mimeType != null && this.mimeType.length() > 0) {
0229 MimeType theType = DocumentFormat.getMimeTypeForString(mimeType);
0230 if(theType == null) {
0231 throw new ResourceInstantiationException("MIME type \""
0232 + this.mimeType + " has no registered DocumentFormat");
0233 }
0234
0235 docFormat = DocumentFormat.getDocumentFormat(this, theType);
0236 }
0237 else {
0238 docFormat = DocumentFormat.getDocumentFormat(this, sourceUrl);
0239 }
0240 try {
0241 if(docFormat != null) {
0242 StatusListener sListener = (StatusListener)gate.Gate
0243 .getListeners().get("gate.event.StatusListener");
0244 if(sListener != null) docFormat.addStatusListener(sListener);
0245 // set the flag if true and if the document format support collecting
0246 docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
0247 if(docFormat.getShouldCollectRepositioning().booleanValue()) {
0248 // unpack with collectiong of repositioning information
0249 RepositioningInfo info = new RepositioningInfo();
0250 String origContent = (String)getFeatures().get(
0251 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
0252 RepositioningInfo ampCodingInfo = new RepositioningInfo();
0253 if(origContent != null) {
0254 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
0255 collectInformationForAmpCodding(origContent, ampCodingInfo,
0256 shouldCorrectCR);
0257 if(docFormat instanceof HtmlDocumentFormat) {
0258 collectInformationForWS(origContent, ampCodingInfo);
0259 } // if
0260 } // if
0261 docFormat.unpackMarkup(this, info, ampCodingInfo);
0262 if(origContent != null && docFormat instanceof XmlDocumentFormat) {
0263 // CRLF correction of RepositioningInfo
0264 correctRepositioningForCRLFInXML(origContent, info);
0265 } // if
0266 getFeatures().put(
0267 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME,
0268 info);
0269 } else {
0270 // normal old fashioned unpack
0271 docFormat.unpackMarkup(this);
0272 }
0273 docFormat.removeStatusListener(sListener);
0274 } // if format != null
0275 } catch(DocumentFormatException e) {
0276 throw new ResourceInstantiationException(
0277 "Couldn't unpack markup in document "
0278 + (sourceUrl != null ? sourceUrl.toExternalForm() : "")
0279 + "!", e);
0280 }
0281 } // if markup aware
0282 // try{
0283 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
0284 // fw.write(getContent().toString());
0285 // fw.flush();
0286 // fw.close();
0287 // }catch(IOException ioe){
0288 // ioe.printStackTrace();
0289 // }
0290 return this;
0291 } // init()
0292
0293 /**
0294 * Correct repositioning information for substitution of "\r\n" with "\n"
0295 */
0296 private void correctRepositioningForCRLFInXML(String content,
0297 RepositioningInfo info) {
0298 int index = -1;
0299 do {
0300 index = content.indexOf("\r\n", index + 1);
0301 if(index != -1) {
0302 info.correctInformationOriginalMove(index, 1);
0303 } // if
0304 } while(index != -1);
0305 } // correctRepositioningForCRLF
0306
0307 /**
0308 * Collect information for substitution of "&xxx;" with "y"
0309 *
0310 * It couldn't be collected a position information about some unicode and
0311 * &-coded symbols during parsing. The parser "hide" the information about the
0312 * position of such kind of parsed text. So, there is minimal chance to have
0313 * &-coded symbol inside the covered by repositioning records area. The new
0314 * record should be created for every coded symbol outside the existing
0315 * records. <BR>
0316 * If <code>shouldCorrectCR</code> flag is <code>true</code> the
0317 * correction for CRLF substitution is performed.
0318 */
0319 private void collectInformationForAmpCodding(String content,
0320 RepositioningInfo info, boolean shouldCorrectCR) {
0321 if(content == null || info == null) return;
0322 int ampIndex = -1;
0323 int semiIndex;
0324 do {
0325 ampIndex = content.indexOf('&', ampIndex + 1);
0326 if(ampIndex != -1) {
0327 semiIndex = content.indexOf(';', ampIndex + 1);
0328 // have semicolon and it is near enough for amp codding
0329 if(semiIndex != -1 && (semiIndex - ampIndex) < 8) {
0330 info.addPositionInfo(ampIndex, semiIndex - ampIndex + 1, 0, 1);
0331 } else {
0332 // no semicolon or it is too far
0333 // analyse for amp codding without semicolon
0334 int maxEnd = Math.min(ampIndex + 8, content.length());
0335 String ampCandidate = content.substring(ampIndex, maxEnd);
0336 int ampCodingSize = analyseAmpCodding(ampCandidate);
0337 if(ampCodingSize != -1) {
0338 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
0339 } // if
0340 } // if - semicolon found
0341 } // if - ampersand found
0342 } while(ampIndex != -1);
0343 // correct the collected information to adjust it's positions
0344 // with reported by the parser
0345 int index = -1;
0346 if(shouldCorrectCR) {
0347 do {
0348 index = content.indexOf("\r\n", index + 1);
0349 if(index != -1) {
0350 info.correctInformationOriginalMove(index, -1);
0351 } // if
0352 } while(index != -1);
0353 } // if
0354 } // collectInformationForAmpCodding
0355
0356 /**
0357 * This function compute size of the ampersand codded sequence when semicolin
0358 * is not present.
0359 */
0360 private int analyseAmpCodding(String content) {
0361 int result = -1;
0362 try {
0363 char ch = content.charAt(1);
0364 switch(ch){
0365 case 'l': // <
0366 case 'L': // <
0367 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
0368 result = 3;
0369 } // if
0370 break;
0371 case 'g': // >
0372 case 'G': // >
0373 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
0374 result = 3;
0375 } // if
0376 break;
0377 case 'a': // &
0378 case 'A': // &
0379 if(content.substring(2, 4).equalsIgnoreCase("mp")) {
0380 result = 4;
0381 } // if
0382 break;
0383 case 'q': // "
0384 case 'Q': // "
0385 if(content.substring(2, 5).equalsIgnoreCase("uot")) {
0386 result = 5;
0387 } // if
0388 break;
0389 case '#': // #number (example ‘, 䰸)
0390 int endIndex = 2;
0391 boolean hexCoded = false;
0392 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
0393 // Hex codding
0394 ++endIndex;
0395 hexCoded = true;
0396 } // if
0397 while(endIndex < 8 && isNumber(content.charAt(endIndex), hexCoded)) {
0398 ++endIndex;
0399 } // while
0400 result = endIndex;
0401 break;
0402 } // switch
0403 } catch(StringIndexOutOfBoundsException ex) {
0404 // do nothing
0405 } // catch
0406 return result;
0407 } // analyseAmpCodding
0408
0409 /** Check for numeric range. If hex is true the A..F range is included */
0410 private boolean isNumber(char ch, boolean hex) {
0411 if(ch >= '0' && ch <= '9') return true;
0412 if(hex) {
0413 if(ch >= 'A' && ch <= 'F') return true;
0414 if(ch >= 'a' && ch <= 'f') return true;
0415 } // if
0416 return false;
0417 } // isNumber
0418
0419 /**
0420 * HTML parser perform substitution of multiple whitespaces (WS) with a single
0421 * WS. To create correct repositioning information structure we should keep
0422 * the information for such multiple WS. <BR>
0423 * The criteria for WS is <code>(ch <= ' ')</code>.
0424 */
0425 private void collectInformationForWS(String content, RepositioningInfo info) {
0426 if(content == null || info == null) return;
0427 // analyse the content and correct the repositioning information
0428 char ch;
0429 int startWS, endWS;
0430 startWS = endWS = -1;
0431 int contentLength = content.length();
0432 for(int i = 0; i < contentLength; ++i) {
0433 ch = content.charAt(i);
0434 // is whitespace
0435 if(ch <= ' ') {
0436 if(startWS == -1) {
0437 startWS = i;
0438 } // if
0439 endWS = i;
0440 } else {
0441 if(endWS - startWS > 0) {
0442 // put the repositioning information about the WS substitution
0443 info
0444 .addPositionInfo((long)startWS, (long)(endWS - startWS + 1),
0445 0, 1);
0446 } // if
0447 // clear positions
0448 startWS = endWS = -1;
0449 }// if
0450 } // for
0451 } // collectInformationForWS
0452
0453 /** Clear all the data members of the object. */
0454 public void cleanup() {
0455 defaultAnnots = null;
0456 if((namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
0457 namedAnnotSets.clear();
0458 if(DEBUG) Out.prln("Document cleanup called");
0459 if(this.lrPersistentId != null)
0460 Gate.getCreoleRegister().removeCreoleListener(this);
0461 if(this.getDataStore() != null)
0462 this.getDataStore().removeDatastoreListener(this);
0463 } // cleanup()
0464
0465
0466 /** Get the specific MIME type for this document, if set */
0467 public String getMimeType() {
0468 return mimeType;
0469 }
0470
0471 /** Set the specific MIME type for this document */
0472 @Optional
0473 @CreoleParameter(
0474 comment = "MIME type of the document. If unspecified it will be "
0475 + "inferred from the file extension, etc.")
0476 public void setMimeType(String newMimeType) {
0477 this.mimeType = newMimeType;
0478 }
0479
0480 /** Documents are identified by URLs */
0481 public URL getSourceUrl() {
0482 return sourceUrl;
0483 }
0484
0485 /** Set method for the document's URL */
0486 @CreoleParameter(disjunction = "source", priority = 1, comment = "Source URL",
0487 suffixes = "txt;text;xml;xhtm;xhtml;html;htm;sgml;sgm;mail;email;eml;rtf;pdf;doc;ppt;pptx;docx;xls;xlsx;ods;odt;odp")
0488 public void setSourceUrl(URL sourceUrl) {
0489 this.sourceUrl = sourceUrl;
0490 } // setSourceUrl
0491
0492 /**
0493 * Documents may be packed within files; in this case an optional pair of
0494 * offsets refer to the location of the document.
0495 */
0496 public Long[] getSourceUrlOffsets() {
0497 Long[] sourceUrlOffsets = new Long[2];
0498 sourceUrlOffsets[0] = sourceUrlStartOffset;
0499 sourceUrlOffsets[1] = sourceUrlEndOffset;
0500 return sourceUrlOffsets;
0501 } // getSourceUrlOffsets
0502
0503 /**
0504 * Allow/disallow preserving of the original document content. If is <B>true</B>
0505 * the original content will be retrieved from the DocumentContent object and
0506 * preserved as document feature.
0507 */
0508 @CreoleParameter(comment = "Should the document preserve the original content?",
0509 defaultValue = "false")
0510 public void setPreserveOriginalContent(Boolean b) {
0511 preserveOriginalContent = b;
0512 } // setPreserveOriginalContent
0513
0514 /**
0515 * Get the preserving of content status of the Document.
0516 *
0517 * @return whether the Document should preserve it's original content.
0518 */
0519 public Boolean getPreserveOriginalContent() {
0520 return preserveOriginalContent;
0521 } // getPreserveOriginalContent
0522
0523 /**
0524 * Allow/disallow collecting of repositioning information. If is <B>true</B>
0525 * information will be retrieved and preserved as document feature.<BR>
0526 * Preserving of repositioning information give the possibilities for
0527 * converting of coordinates between the original document content and
0528 * extracted from the document text.
0529 */
0530 @CreoleParameter(defaultValue = "false",
0531 comment = "Should the document collect repositioning information")
0532 public void setCollectRepositioningInfo(Boolean b) {
0533 collectRepositioningInfo = b;
0534 } // setCollectRepositioningInfo
0535
0536 /**
0537 * Get the collectiong and preserving of repositioning information for the
0538 * Document. <BR>
0539 * Preserving of repositioning information give the possibilities for
0540 * converting of coordinates between the original document content and
0541 * extracted from the document text.
0542 *
0543 * @return whether the Document should collect and preserve information.
0544 */
0545 public Boolean getCollectRepositioningInfo() {
0546 return collectRepositioningInfo;
0547 } // getCollectRepositioningInfo
0548
0549 /**
0550 * Documents may be packed within files; in this case an optional pair of
0551 * offsets refer to the location of the document. This method gets the start
0552 * offset.
0553 */
0554 public Long getSourceUrlStartOffset() {
0555 return sourceUrlStartOffset;
0556 }
0557
0558 /**
0559 * Documents may be packed within files; in this case an optional pair of
0560 * offsets refer to the location of the document. This method sets the start
0561 * offset.
0562 */
0563 @Optional
0564 @CreoleParameter(
0565 comment = "Start offset for documents based on ranges")
0566 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
0567 this.sourceUrlStartOffset = sourceUrlStartOffset;
0568 } // setSourceUrlStartOffset
0569
0570 /**
0571 * Documents may be packed within files; in this case an optional pair of
0572 * offsets refer to the location of the document. This method gets the end
0573 * offset.
0574 */
0575 public Long getSourceUrlEndOffset() {
0576 return sourceUrlEndOffset;
0577 }
0578
0579 /**
0580 * Documents may be packed within files; in this case an optional pair of
0581 * offsets refer to the location of the document. This method sets the end
0582 * offset.
0583 */
0584 @Optional
0585 @CreoleParameter(
0586 comment = "End offset for documents based on ranges")
0587 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
0588 this.sourceUrlEndOffset = sourceUrlEndOffset;
0589 } // setSourceUrlStartOffset
0590
0591 /** The content of the document: a String for text; MPEG for video; etc. */
0592 public DocumentContent getContent() {
0593 return content;
0594 }
0595
0596 /** Set method for the document content */
0597 public void setContent(DocumentContent content) {
0598 this.content = content;
0599 // stringContent is a parameter, not a normal field, and
0600 // should not be overwritten here.
0601 //this.stringContent = content.toString();
0602 }
0603
0604 /** Get the encoding of the document content source */
0605 public String getEncoding() {
0606 // we need to make sure we ALWAYS have an encoding
0607 if(encoding == null || encoding.trim().length() == 0) {
0608 // no encoding definded: use the platform default
0609 encoding = java.nio.charset.Charset.forName(
0610 System.getProperty("file.encoding")).name();
0611 }
0612 return encoding;
0613 }
0614
0615 /** Set the encoding of the document content source */
0616 @Optional
0617 @CreoleParameter(comment = "Encoding")
0618 public void setEncoding(String encoding) {
0619 this.encoding = encoding;
0620 }
0621
0622 /**
0623 * Get the default set of annotations. The set is created if it doesn't exist
0624 * yet.
0625 */
0626 public AnnotationSet getAnnotations() {
0627 if(defaultAnnots == null) {
0628 defaultAnnots = new AnnotationSetImpl(this);
0629 fireAnnotationSetAdded(new DocumentEvent(this,
0630 DocumentEvent.ANNOTATION_SET_ADDED, null));
0631 }// if
0632 return defaultAnnots;
0633 } // getAnnotations()
0634
0635 /**
0636 * Get a named set of annotations. Creates a new set if one with this name
0637 * doesn't exist yet. If the provided name is null or the empty string then
0638 * it returns the default annotation set.
0639 */
0640 public AnnotationSet getAnnotations(String name) {
0641 if(name == null || "".equals(name)) return getAnnotations();
0642 if(namedAnnotSets == null) {
0643 namedAnnotSets = new HashMap<String, AnnotationSet>();
0644 }
0645 AnnotationSet namedSet = namedAnnotSets.get(name);
0646 if(namedSet == null) {
0647 namedSet = new AnnotationSetImpl(this, name);
0648 namedAnnotSets.put(name, namedSet);
0649 DocumentEvent evt = new DocumentEvent(this,
0650 DocumentEvent.ANNOTATION_SET_ADDED, name);
0651 fireAnnotationSetAdded(evt);
0652 }
0653 return namedSet;
0654 } // getAnnotations(name)
0655
0656 /**
0657 * Make the document markup-aware. This will trigger the creation of a
0658 * DocumentFormat object at Document initialisation time; the DocumentFormat
0659 * object will unpack the markup in the Document and add it as annotations.
0660 * Documents are <B>not</B> markup-aware by default.
0661 *
0662 * @param newMarkupAware
0663 * markup awareness status.
0664 */
0665 @CreoleParameter(defaultValue = "true",
0666 comment = "Should the document read the original markup?")
0667 public void setMarkupAware(Boolean newMarkupAware) {
0668 this.markupAware = newMarkupAware;
0669 }
0670
0671 /**
0672 * Get the markup awareness status of the Document. <B>Documents are
0673 * markup-aware by default.</B>
0674 *
0675 * @return whether the Document is markup aware.
0676 */
0677 public Boolean getMarkupAware() {
0678 return markupAware;
0679 }
0680
0681 /**
0682 * Returns an XML document aming to preserve the original markups( the
0683 * original markup will be in the same place and format as it was before
0684 * processing the document) and include (if possible) the annotations
0685 * specified in the aSourceAnnotationSet. It is equivalent to
0686 * toXml(aSourceAnnotationSet, true).
0687 */
0688 public String toXml(Set aSourceAnnotationSet) {
0689 return toXml(aSourceAnnotationSet, true);
0690 }
0691
0692 /**
0693 * Returns an XML document aming to preserve the original markups( the
0694 * original markup will be in the same place and format as it was before
0695 * processing the document) and include (if possible) the annotations
0696 * specified in the aSourceAnnotationSet. <b>Warning:</b> Annotations from
0697 * the aSourceAnnotationSet will be lost if they will cause a crosed over
0698 * situation.
0699 *
0700 * @param aSourceAnnotationSet
0701 * is an annotation set containing all the annotations that will be
0702 * combined with the original marup set. If the param is
0703 * <code>null</code> it will only dump the original markups.
0704 * @param includeFeatures
0705 * is a boolean that controls whether the annotation features should
0706 * be included or not. If false, only the annotation type is included
0707 * in the tag.
0708 * @return a string representing an XML document containing the original
0709 * markup + dumped annotations form the aSourceAnnotationSet
0710 */
0711 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures) {
0712 if(hasOriginalContentFeatures()) { return saveAnnotationSetAsXmlInOrig(
0713 aSourceAnnotationSet, includeFeatures); } // if
0714 AnnotationSet originalMarkupsAnnotSet = this
0715 .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
0716 // Create a dumping annotation set on the document. It will be used for
0717 // dumping annotations...
0718 // AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
0719 List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
0720 // This set will be constructed inside this method. If is not empty, the
0721 // annotation contained will be lost.
0722 /*
0723 * if (!dumpingSet.isEmpty()){ Out.prln("WARNING: The dumping annotation set
0724 * was not empty."+ "All annotation it contained were lost.");
0725 * dumpingSet.clear(); }// End if
0726 */
0727 StatusListener sListener = (StatusListener)gate.Gate
0728 .getListeners().get("gate.event.StatusListener");
0729 // Construct the dumping set in that way that all annotations will verify
0730 // the condition that there are not annotations which are crossed.
0731 // First add all annotation from the original markups
0732 if(sListener != null)
0733 sListener.statusChanged("Constructing the dumping annotation set.");
0734 // dumpingSet.addAll(originalMarkupsAnnotSet);
0735 dumpingList.addAll(originalMarkupsAnnotSet);
0736 // Then take all the annotations from aSourceAnnotationSet and verify if
0737 // they can be inserted safely into the dumpingSet. Where not possible,
0738 // report.
0739 if(aSourceAnnotationSet != null) {
0740 Iterator iter = aSourceAnnotationSet.iterator();
0741 while(iter.hasNext()) {
0742 Annotation currentAnnot = (Annotation)iter.next();
0743 if(insertsSafety(dumpingList, currentAnnot)) {
0744 // dumpingSet.add(currentAnnot);
0745 dumpingList.add(currentAnnot);
0746 } else if(crossedOverAnnotation != null && DEBUG) {
0747 try {
0748 Out.prln("Warning: Annotations were found to violate the "
0749 + "crossed over condition: \n"
0750 + "1. ["
0751 + getContent().getContent(
0752 crossedOverAnnotation.getStartNode().getOffset(),
0753 crossedOverAnnotation.getEndNode().getOffset())
0754 + " ("
0755 + crossedOverAnnotation.getType()
0756 + ": "
0757 + crossedOverAnnotation.getStartNode().getOffset()
0758 + ";"
0759 + crossedOverAnnotation.getEndNode().getOffset()
0760 + ")]\n"
0761 + "2. ["
0762 + getContent().getContent(
0763 currentAnnot.getStartNode().getOffset(),
0764 currentAnnot.getEndNode().getOffset()) + " ("
0765 + currentAnnot.getType() + ": "
0766 + currentAnnot.getStartNode().getOffset() + ";"
0767 + currentAnnot.getEndNode().getOffset()
0768 + ")]\nThe second one will be discarded.\n");
0769 } catch(gate.util.InvalidOffsetException ex) {
0770 throw new GateRuntimeException(ex.getMessage());
0771 }
0772 }// End if
0773 }// End while
0774 }// End if
0775 // kalina: order the dumping list by start offset
0776 Collections.sort(dumpingList, new gate.util.OffsetComparator());
0777 // The dumpingSet is ready to be exported as XML
0778 // Here we go.
0779 if(sListener != null)
0780 sListener.statusChanged("Dumping annotations as XML");
0781 StringBuffer xmlDoc = new StringBuffer(
0782 DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR
0783 * (this.getContent().size().intValue()));
0784 // Add xml header if original format was xml
0785 String mimeType = getFeatures() == null ? null : (String)getFeatures().get(
0786 "MimeType");
0787 boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
0788 if(wasXML) {
0789 xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
0790 xmlDoc.append(getEncoding());
0791 xmlDoc.append("\" ?>");
0792 xmlDoc.append(Strings.getNl());
0793 }// ENd if
0794 // Identify and extract the root annotation from the dumpingSet.
0795 theRootAnnotation = identifyTheRootAnnotation(dumpingList);
0796 // If a root annotation has been identified then add it explicitly at the
0797 // beginning of the document
0798 if(theRootAnnotation != null) {
0799 dumpingList.remove(theRootAnnotation);
0800 xmlDoc.append(writeStartTag(theRootAnnotation, includeFeatures));
0801 }// End if
0802 // Construct and append the rest of the document
0803 xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
0804 // If a root annotation has been identified then add it eplicitley at the
0805 // end of the document
0806 if(theRootAnnotation != null) {
0807 xmlDoc.append(writeEndTag(theRootAnnotation));
0808 }// End if
0809 if(sListener != null) sListener.statusChanged("Done.");
0810 return xmlDoc.toString();
0811 }// End toXml()
0812
0813 /**
0814 * This method verifies if aSourceAnnotation can ve inserted safety into the
0815 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
0816 * contition with any annotation from the aTargetAnnotSet.
0817 *
0818 * @param aTargetAnnotSet
0819 * the annotation set to include the aSourceAnnotation
0820 * @param aSourceAnnotation
0821 * the annotation to be inserted into the aTargetAnnotSet
0822 * @return true if the annotation inserts safety, or false otherwise.
0823 */
0824 private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
0825 Annotation aSourceAnnotation) {
0826 if(aTargetAnnotSet == null || aSourceAnnotation == null) {
0827 this.crossedOverAnnotation = null;
0828 return false;
0829 }
0830 if(aSourceAnnotation.getStartNode() == null
0831 || aSourceAnnotation.getStartNode().getOffset() == null) {
0832 this.crossedOverAnnotation = null;
0833 return false;
0834 }
0835 if(aSourceAnnotation.getEndNode() == null
0836 || aSourceAnnotation.getEndNode().getOffset() == null) {
0837 this.crossedOverAnnotation = null;
0838 return false;
0839 }
0840 // Get the start and end offsets
0841 Long start = aSourceAnnotation.getStartNode().getOffset();
0842 Long end = aSourceAnnotation.getEndNode().getOffset();
0843 // Read aSourceAnnotation offsets long
0844 long s2 = start.longValue();
0845 long e2 = end.longValue();
0846 // Obtain a set with all annotations annotations that overlap
0847 // totaly or partially with the interval defined by the two provided offsets
0848 AnnotationSet as = aTargetAnnotSet.get(start, end);
0849 // Investigate all the annotations from as to see if there is one that
0850 // comes in conflict with aSourceAnnotation
0851 Iterator<Annotation> it = as.iterator();
0852 while(it.hasNext()) {
0853 Annotation ann = it.next();
0854 // Read ann offsets
0855 long s1 = ann.getStartNode().getOffset().longValue();
0856 long e1 = ann.getEndNode().getOffset().longValue();
0857 if(s1 < s2 && s2 < e1 && e1 < e2) {
0858 this.crossedOverAnnotation = ann;
0859 return false;
0860 }
0861 if(s2 < s1 && s1 < e2 && e2 < e1) {
0862 this.crossedOverAnnotation = ann;
0863 return false;
0864 }
0865 }// End while
0866 return true;
0867 }// insertsSafety()
0868
0869 private boolean insertsSafety(List aTargetAnnotList,
0870 Annotation aSourceAnnotation) {
0871 if(aTargetAnnotList == null || aSourceAnnotation == null) {
0872 this.crossedOverAnnotation = null;
0873 return false;
0874 }
0875 if(aSourceAnnotation.getStartNode() == null
0876 || aSourceAnnotation.getStartNode().getOffset() == null) {
0877 this.crossedOverAnnotation = null;
0878 return false;
0879 }
0880 if(aSourceAnnotation.getEndNode() == null
0881 || aSourceAnnotation.getEndNode().getOffset() == null) {
0882 this.crossedOverAnnotation = null;
0883 return false;
0884 }
0885 // Get the start and end offsets
0886 Long start = aSourceAnnotation.getStartNode().getOffset();
0887 Long end = aSourceAnnotation.getEndNode().getOffset();
0888 // Read aSourceAnnotation offsets long
0889 long s2 = start.longValue();
0890 long e2 = end.longValue();
0891 // Obtain a set with all annotations annotations that overlap
0892 // totaly or partially with the interval defined by the two provided offsets
0893 List<Annotation> as = new ArrayList<Annotation>();
0894 for(int i = 0; i < aTargetAnnotList.size(); i++) {
0895 Annotation annot = (Annotation) aTargetAnnotList.get(i);
0896 if(annot.getStartNode().getOffset().longValue() >= s2
0897 && annot.getStartNode().getOffset().longValue() <= e2)
0898 as.add(annot);
0899 else if(annot.getEndNode().getOffset().longValue() >= s2
0900 && annot.getEndNode().getOffset().longValue() <= e2)
0901 as.add(annot);
0902 }
0903 // Investigate all the annotations from as to see if there is one that
0904 // comes in conflict with aSourceAnnotation
0905 Iterator<Annotation> it = as.iterator();
0906 while(it.hasNext()) {
0907 Annotation ann = it.next();
0908 // Read ann offsets
0909 long s1 = ann.getStartNode().getOffset().longValue();
0910 long e1 = ann.getEndNode().getOffset().longValue();
0911 if(s1 < s2 && s2 < e1 && e1 < e2) {
0912 this.crossedOverAnnotation = ann;
0913 return false;
0914 }
0915 if(s2 < s1 && s1 < e2 && e2 < e1) {
0916 this.crossedOverAnnotation = ann;
0917 return false;
0918 }
0919 }// End while
0920 return true;
0921 }// insertsSafety()
0922
0923 /**
0924 * This method saves all the annotations from aDumpAnnotSet and combines them
0925 * with the document content.
0926 *
0927 * @param aDumpAnnotSet
0928 * is a GATE annotation set prepared to be used on the raw text from
0929 * document content. If aDumpAnnotSet is <b>null<b> then an empty
0930 * string will be returned.
0931 * @param includeFeatures
0932 * is a boolean, which controls whether the annotation features and
0933 * gate ID are included or not.
0934 * @return The XML document obtained from raw text + the information from the
0935 * dump annotation set.
0936 */
0937 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
0938 boolean includeFeatures) {
0939 String content = null;
0940 if(this.getContent() == null)
0941 content = new String("");
0942 else content = this.getContent().toString();
0943 StringBuffer docContStrBuff =
0944 DocumentXmlUtils.filterNonXmlChars(new StringBuffer(content));
0945 if(aDumpAnnotSet == null) return docContStrBuff.toString();
0946 TreeMap offsets2CharsMap = new TreeMap();
0947 if(this.getContent().size().longValue() != 0) {
0948 // Fill the offsets2CharsMap with all the indices where
0949 // special chars appear
0950 buildEntityMapFromString(content, offsets2CharsMap);
0951 }// End if
0952 // The saving alghorithm is as follows:
0953 // /////////////////////////////////////////
0954 // Construct a set of annot with all IDs in asc order.
0955 // All annotations that end at that offset swap their place in descending
0956 // order. For each node write all the tags from left to right.
0957 // Construct the node set
0958 TreeSet offsets = new TreeSet();
0959 Iterator<Annotation> iter = aDumpAnnotSet.iterator();
0960 while(iter.hasNext()) {
0961 Annotation annot = iter.next();
0962 offsets.add(annot.getStartNode().getOffset());
0963 offsets.add(annot.getEndNode().getOffset());
0964 }// End while
0965 // ofsets is sorted in ascending order.
0966 // Iterate this set in descending order and remove an offset at each
0967 // iteration
0968 while(!offsets.isEmpty()) {
0969 Long offset = (Long)offsets.last();
0970 // Remove the offset from the set
0971 offsets.remove(offset);
0972 // Now, use it.
0973 // Returns a list with annotations that needs to be serialized in that
0974 // offset.
0975 List annotations = getAnnotationsForOffset(aDumpAnnotSet, offset);
0976 // Attention: the annotation are serialized from left to right
0977 // StringBuffer tmpBuff = new StringBuffer("");
0978 StringBuffer tmpBuff = new StringBuffer(DOC_SIZE_MULTIPLICATION_FACTOR_AS
0979 * (this.getContent().size().intValue()));
0980 Stack<Annotation> stack = new Stack<Annotation>();
0981 // Iterate through all these annotations and serialize them
0982 Iterator it = annotations.iterator();
0983 while(it.hasNext()) {
0984 Annotation a = (Annotation) it.next();
0985 it.remove();
0986 // Test if a Ends at offset
0987 if(offset.equals(a.getEndNode().getOffset())) {
0988 // Test if a Starts at offset
0989 if(offset.equals(a.getStartNode().getOffset())) {
0990 // Here, the annotation a Starts and Ends at the offset
0991 if(null != a.getFeatures().get("isEmptyAndSpan")
0992 && "true".equals((String)a.getFeatures().get(
0993 "isEmptyAndSpan"))) {
0994 // Assert: annotation a with start == end and isEmptyAndSpan
0995 tmpBuff.append(writeStartTag(a, includeFeatures));
0996 stack.push(a);
0997 } else {
0998 // Assert annotation a with start == end and an empty tag
0999 tmpBuff.append(writeEmptyTag(a));
1000 // The annotation is removed from dumped set
1001 aDumpAnnotSet.remove(a);
1002 }// End if
1003 } else {
1004 // Here the annotation a Ends at the offset.
1005 // In this case empty the stack and write the end tag
1006 if(!stack.isEmpty()) {
1007 while(!stack.isEmpty()) {
1008 Annotation a1 = stack.pop();
1009 tmpBuff.append(writeEndTag(a1));
1010 }// End while
1011 }// End if
1012 tmpBuff.append(writeEndTag(a));
1013 }// End if
1014 } else {
1015 // The annotation a does NOT end at the offset. Let's see if it starts
1016 // at the offset
1017 if(offset.equals(a.getStartNode().getOffset())) {
1018 // The annotation a starts at the offset.
1019 // In this case empty the stack and write the end tag
1020 if(!stack.isEmpty()) {
1021 while(!stack.isEmpty()) {
1022 Annotation a1 = stack.pop();
1023 tmpBuff.append(writeEndTag(a1));
1024 }// End while
1025 }// End if
1026 tmpBuff.append(writeStartTag(a, includeFeatures));
1027 // The annotation is removed from dumped set
1028 aDumpAnnotSet.remove(a);
1029 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1030 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1031 }// End while(it.hasNext()){
1032 // In this case empty the stack and write the end tag
1033 if(!stack.isEmpty()) {
1034 while(!stack.isEmpty()) {
1035 Annotation a1 = stack.pop();
1036 tmpBuff.append(writeEndTag(a1));
1037 }// End while
1038 }// End if
1039 // Before inserting tmpBuff into docContStrBuff we need to check
1040 // if there are chars to be replaced and if there are, they would be
1041 // replaced.
1042 if(!offsets2CharsMap.isEmpty()) {
1043 Long offsChar = (Long)offsets2CharsMap.lastKey();
1044 while(!offsets2CharsMap.isEmpty()
1045 && offsChar.intValue() >= offset.intValue()) {
1046 // Replace the char at offsChar with its corresponding entity form
1047 // the entitiesMap.
1048 docContStrBuff.replace(offsChar.intValue(), offsChar.intValue() + 1,
1049 (String)DocumentXmlUtils.entitiesMap.get((Character)offsets2CharsMap
1050 .get(offsChar)));
1051 // Discard the offsChar after it was used.
1052 offsets2CharsMap.remove(offsChar);
1053 // Investigate next offsChar
1054 if(!offsets2CharsMap.isEmpty())
1055 offsChar = (Long)offsets2CharsMap.lastKey();
1056 }// End while
1057 }// End if
1058 // Insert tmpBuff to the location where it belongs in docContStrBuff
1059 docContStrBuff.insert(offset.intValue(), tmpBuff.toString());
1060 }// End while(!offsets.isEmpty())
1061 // Need to replace the entities in the remaining text, if there is any text
1062 // So, if there are any more items in offsets2CharsMap they need to be
1063 // replaced
1064 while(!offsets2CharsMap.isEmpty()) {
1065 Long offsChar = (Long)offsets2CharsMap.lastKey();
1066 // Replace the char with its entity
1067 docContStrBuff.replace(offsChar.intValue(), offsChar.intValue() + 1,
1068 (String)DocumentXmlUtils.entitiesMap
1069 .get((Character)offsets2CharsMap.get(offsChar)));
1070 // remove the offset from the map
1071 offsets2CharsMap.remove(offsChar);
1072 }// End while
1073 return docContStrBuff.toString();
1074 }// saveAnnotationSetAsXml()
1075
1076 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1077 boolean includeFeatures) {
1078 String content;
1079 if(this.getContent() == null)
1080 content = "";
1081 else content = this.getContent().toString();
1082 StringBuffer docContStrBuff =
1083 DocumentXmlUtils.filterNonXmlChars(new StringBuffer(content));
1084 if(aDumpAnnotList == null) return docContStrBuff.toString();
1085 StringBuffer resultStrBuff = new StringBuffer(
1086 DOC_SIZE_MULTIPLICATION_FACTOR_AS
1087 * (this.getContent().size().intValue()));
1088 // last offset position used to extract portions of text
1089 Long lastOffset = 0L;
1090 TreeMap<Long, Character> offsets2CharsMap = new TreeMap<Long, Character>();
1091 HashMap<Long, List<Annotation>> annotsForOffset =
1092 new HashMap<Long, List<Annotation>>(100);
1093 if(this.getContent().size() != 0) {
1094 // Fill the offsets2CharsMap with all the indices where
1095 // special chars appear
1096 buildEntityMapFromString(content, offsets2CharsMap);
1097 }// End if
1098 // The saving alghorithm is as follows:
1099 // /////////////////////////////////////////
1100 // Construct a set of annot with all IDs in asc order.
1101 // All annotations that end at that offset swap their place in descending
1102 // order. For each node write all the tags from left to right.
1103 // Construct the node set
1104 TreeSet<Long> offsets = new TreeSet<Long>();
1105 Iterator iter = aDumpAnnotList.iterator();
1106 Annotation annot;
1107 Long start;
1108 Long end;
1109 while(iter.hasNext()) {
1110 annot = (Annotation)iter.next();
1111 start = annot.getStartNode().getOffset();
1112 end = annot.getEndNode().getOffset();
1113 offsets.add(start);
1114 offsets.add(end);
1115 if(annotsForOffset.containsKey(start)) {
1116 annotsForOffset.get(start).add(annot);
1117 } else {
1118 List<Annotation> newList = new ArrayList<Annotation>(10);
1119 newList.add(annot);
1120 annotsForOffset.put(start, newList);
1121 }
1122 if(annotsForOffset.containsKey(end)) {
1123 annotsForOffset.get(end).add(annot);
1124 } else {
1125 List<Annotation> newList = new ArrayList<Annotation>(10);
1126 newList.add(annot);
1127 annotsForOffset.put(end, newList);
1128 }
1129 }// End while
1130 // ofsets is sorted in ascending order.
1131 // Iterate this set in descending order and remove an offset at each
1132 // iteration
1133 Iterator offsetIt = offsets.iterator();
1134 Long offset;
1135 List annotations;
1136 // This don't have to be a large buffer - just for tags
1137 StringBuffer tmpBuff = new StringBuffer(255);
1138 Stack<Annotation> stack = new Stack<Annotation>();
1139 while(offsetIt.hasNext()) {
1140 offset = (Long)offsetIt.next();
1141 // Now, use it.
1142 // Returns a list with annotations that needs to be serialized in that
1143 // offset.
1144 annotations = (List)annotsForOffset.get(offset);
1145 // order annotations in list for offset to print tags in correct order
1146 annotations = getAnnotationsForOffset(annotations, offset);
1147 // clear structures
1148 tmpBuff.setLength(0);
1149 stack.clear();
1150 // Iterate through all these annotations and serialize them
1151 Iterator it = annotations.iterator();
1152 Annotation a;
1153 Annotation annStack;
1154 while(it.hasNext()) {
1155 a = (Annotation)it.next();
1156 // Test if a Ends at offset
1157 if(offset.equals(a.getEndNode().getOffset())) {
1158 // Test if a Starts at offset
1159 if(offset.equals(a.getStartNode().getOffset())) {
1160 // Here, the annotation a Starts and Ends at the offset
1161 if(null != a.getFeatures().get("isEmptyAndSpan")
1162 && "true".equals((String)a.getFeatures().get(
1163 "isEmptyAndSpan"))) {
1164 // Assert: annotation a with start == end and isEmptyAndSpan
1165 tmpBuff.append(writeStartTag(a, includeFeatures));
1166 stack.push(a);
1167 } else {
1168 // Assert annotation a with start == end and an empty tag
1169 tmpBuff.append(writeEmptyTag(a));
1170 // The annotation is removed from dumped set
1171 aDumpAnnotList.remove(a);
1172 }// End if
1173 } else {
1174 // Here the annotation a Ends at the offset.
1175 // In this case empty the stack and write the end tag
1176 if(!stack.isEmpty()) {
1177 while(!stack.isEmpty()) {
1178 annStack = stack.pop();
1179 tmpBuff.append(writeEndTag(annStack));
1180 }// End while
1181 }// End if
1182 tmpBuff.append(writeEndTag(a));
1183 }// End if
1184 } else {
1185 // The annotation a does NOT end at the offset. Let's see if it starts
1186 // at the offset
1187 if(offset.equals(a.getStartNode().getOffset())) {
1188 // The annotation a starts at the offset.
1189 // In this case empty the stack and write the end tag
1190 if(!stack.isEmpty()) {
1191 while(!stack.isEmpty()) {
1192 annStack = stack.pop();
1193 tmpBuff.append(writeEndTag(annStack));
1194 }// End while
1195 }// End if
1196 tmpBuff.append(writeStartTag(a, includeFeatures));
1197 // The annotation is removed from dumped set
1198 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1199 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1200 }// End while(it.hasNext()){
1201 // In this case empty the stack and write the end tag
1202 if(!stack.isEmpty()) {
1203 while(!stack.isEmpty()) {
1204 annStack = stack.pop();
1205 tmpBuff.append(writeEndTag(annStack));
1206 }// End while
1207 }// End if
1208 // extract text from content and replace spec chars
1209 StringBuffer partText = new StringBuffer();
1210 SortedMap offsetsInRange = offsets2CharsMap.subMap(lastOffset, offset);
1211 Long tmpOffset;
1212 Long tmpLastOffset = lastOffset;
1213 String replacement;
1214 // Before inserting tmpBuff into the buffer we need to check
1215 // if there are chars to be replaced in range
1216 while(!offsetsInRange.isEmpty()) {
1217 tmpOffset = (Long)offsetsInRange.firstKey();
1218 replacement = (String)DocumentXmlUtils.entitiesMap.get(
1219 offsets2CharsMap.get(tmpOffset));
1220 partText.append(docContStrBuff.substring(
1221 tmpLastOffset.intValue(), tmpOffset.intValue()));
1222 partText.append(replacement);
1223 tmpLastOffset = tmpOffset + 1;
1224 offsetsInRange.remove(tmpOffset);
1225 }
1226 partText.append(docContStrBuff.substring(
1227 tmpLastOffset.intValue(), offset.intValue()));
1228 resultStrBuff.append(partText);
1229 // Insert tmpBuff to the result string
1230 resultStrBuff.append(tmpBuff.toString());
1231 lastOffset = offset;
1232 }// End while(!offsets.isEmpty())
1233 // get text to the end of content
1234 // extract text from content and replace spec chars
1235 StringBuffer partText = new StringBuffer();
1236 SortedMap offsetsInRange = offsets2CharsMap.subMap(
1237 lastOffset, (long) docContStrBuff.length());
1238 Long tmpOffset;
1239 Long tmpLastOffset = lastOffset;
1240 String replacement;
1241 // Need to replace the entities in the remaining text, if there is any text
1242 // So, if there are any more items in offsets2CharsMap for remaining text
1243 // they need to be replaced
1244 while(!offsetsInRange.isEmpty()) {
1245 tmpOffset = (Long)offsetsInRange.firstKey();
1246 replacement = (String)DocumentXmlUtils.entitiesMap.get(
1247 offsets2CharsMap.get(tmpOffset));
1248 partText.append(docContStrBuff.substring(
1249 tmpLastOffset.intValue(), tmpOffset.intValue()));
1250 partText.append(replacement);
1251 tmpLastOffset = tmpOffset + 1;
1252 offsetsInRange.remove(tmpOffset);
1253 }
1254 partText.append(docContStrBuff.substring(
1255 tmpLastOffset.intValue(), docContStrBuff.length()));
1256 resultStrBuff.append(partText);
1257 return resultStrBuff.toString();
1258 }// saveAnnotationSetAsXml()
1259
1260 /*
1261 * Old method created by Cristian. Create content backward.
1262 *
1263 * private String saveAnnotationSetAsXml(List aDumpAnnotList, boolean
1264 * includeFeatures){ String content = null; if (this.getContent()== null)
1265 * content = new String(""); else content = this.getContent().toString();
1266 * StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1267 * if (aDumpAnnotList == null) return docContStrBuff.toString();
1268 *
1269 * TreeMap offsets2CharsMap = new TreeMap(); HashMap annotsForOffset = new
1270 * HashMap(100); if (this.getContent().size().longValue() != 0){ // Fill the
1271 * offsets2CharsMap with all the indices where // special chars appear
1272 * buildEntityMapFromString(content,offsets2CharsMap); }//End if // The saving
1273 * alghorithm is as follows: /////////////////////////////////////////// //
1274 * Construct a set of annot with all IDs in asc order. // All annotations that
1275 * end at that offset swap their place in descending // order. For each node
1276 * write all the tags from left to right. // Construct the node set TreeSet
1277 * offsets = new TreeSet(); Iterator iter = aDumpAnnotList.iterator(); while
1278 * (iter.hasNext()){ Annotation annot = (Annotation) iter.next();
1279 * offsets.add(annot.getStartNode().getOffset());
1280 * offsets.add(annot.getEndNode().getOffset()); if
1281 * (annotsForOffset.containsKey(annot.getStartNode().getOffset())) { ((List)
1282 * annotsForOffset.get(annot.getStartNode().getOffset())).add(annot); } else {
1283 * List newList = new ArrayList(10); newList.add(annot);
1284 * annotsForOffset.put(annot.getStartNode().getOffset(), newList); } if
1285 * (annotsForOffset.containsKey(annot.getEndNode().getOffset())) { ((List)
1286 * annotsForOffset.get(annot.getEndNode().getOffset())).add(annot); } else {
1287 * List newList = new ArrayList(10); newList.add(annot);
1288 * annotsForOffset.put(annot.getEndNode().getOffset(), newList); } }// End
1289 * while // ofsets is sorted in ascending order. // Iterate this set in
1290 * descending order and remove an offset at each // iteration while
1291 * (!offsets.isEmpty()){ Long offset = (Long)offsets.last(); // Remove the
1292 * offset from the set offsets.remove(offset); // Now, use it. // Returns a
1293 * list with annotations that needs to be serialized in that // offset. //
1294 * List annotations = getAnnotationsForOffset(aDumpAnnotList,offset); List
1295 * annotations = (List) annotsForOffset.get(offset); annotations =
1296 * getAnnotationsForOffset(annotations,offset); // Attention: the annotation
1297 * are serialized from left to right // StringBuffer tmpBuff = new
1298 * StringBuffer(""); StringBuffer tmpBuff = new StringBuffer(
1299 * DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1300 * Stack stack = new Stack(); // Iterate through all these annotations and
1301 * serialize them Iterator it = annotations.iterator(); while(it.hasNext()){
1302 * Annotation a = (Annotation) it.next(); it.remove(); // Test if a Ends at
1303 * offset if ( offset.equals(a.getEndNode().getOffset()) ){ // Test if a
1304 * Starts at offset if ( offset.equals(a.getStartNode().getOffset()) ){ //
1305 * Here, the annotation a Starts and Ends at the offset if ( null !=
1306 * a.getFeatures().get("isEmptyAndSpan") &&
1307 * "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ // Assert:
1308 * annotation a with start == end and isEmptyAndSpan
1309 * tmpBuff.append(writeStartTag(a, includeFeatures)); stack.push(a); }else{ //
1310 * Assert annotation a with start == end and an empty tag
1311 * tmpBuff.append(writeEmptyTag(a)); // The annotation is removed from dumped
1312 * set aDumpAnnotList.remove(a); }// End if }else{ // Here the annotation a
1313 * Ends at the offset. // In this case empty the stack and write the end tag
1314 * if (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
1315 * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
1316 * End if tmpBuff.append(writeEndTag(a)); }// End if }else{ // The annotation
1317 * a does NOT end at the offset. Let's see if it starts // at the offset if (
1318 * offset.equals(a.getStartNode().getOffset()) ){ // The annotation a starts
1319 * at the offset. // In this case empty the stack and write the end tag if
1320 * (!stack.isEmpty()){ while(!stack.isEmpty()){ Annotation a1 =
1321 * (Annotation)stack.pop(); tmpBuff.append(writeEndTag(a1)); }// End while }//
1322 * End if tmpBuff.append(writeStartTag(a, includeFeatures)); // The annotation
1323 * is removed from dumped set aDumpAnnotList.remove(a); }// End if (
1324 * offset.equals(a.getStartNode().getOffset()) ) }// End if (
1325 * offset.equals(a.getEndNode().getOffset()) ) }// End while(it.hasNext()){ //
1326 * In this case empty the stack and write the end tag if (!stack.isEmpty()){
1327 * while(!stack.isEmpty()){ Annotation a1 = (Annotation)stack.pop();
1328 * tmpBuff.append(writeEndTag(a1)); }// End while }// End if // Before
1329 * inserting tmpBuff into docContStrBuff we need to check // if there are
1330 * chars to be replaced and if there are, they would be // replaced. if
1331 * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
1332 * offsets2CharsMap.lastKey(); while( !offsets2CharsMap.isEmpty() &&
1333 * offsChar.intValue() >= offset.intValue()){ // Replace the char at offsChar
1334 * with its corresponding entity form // the entitiesMap.
1335 * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1336 * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
1337 * Discard the offsChar after it was used. offsets2CharsMap.remove(offsChar); //
1338 * Investigate next offsChar if (!offsets2CharsMap.isEmpty()) offsChar =
1339 * (Long) offsets2CharsMap.lastKey(); }// End while }// End if // Insert
1340 * tmpBuff to the location where it belongs in docContStrBuff
1341 * docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); }// End
1342 * while(!offsets.isEmpty()) // Need to replace the entities in the remaining
1343 * text, if there is any text // So, if there are any more items in
1344 * offsets2CharsMap they need to be // replaced while
1345 * (!offsets2CharsMap.isEmpty()){ Long offsChar = (Long)
1346 * offsets2CharsMap.lastKey(); // Replace the char with its entity
1347 * docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1348 * (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); //
1349 * remove the offset from the map offsets2CharsMap.remove(offsChar); }// End
1350 * while return docContStrBuff.toString(); }// saveAnnotationSetAsXml()
1351 */
1352 /**
1353 * Return true only if the document has features for original content and
1354 * repositioning information.
1355 */
1356 private boolean hasOriginalContentFeatures() {
1357 FeatureMap features = getFeatures();
1358 boolean result = false;
1359 result = (features
1360 .get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1361 && (features
1362 .get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) != null);
1363 return result;
1364 } // hasOriginalContentFeatures
1365
1366 /**
1367 * This method saves all the annotations from aDumpAnnotSet and combines them
1368 * with the original document content, if preserved as feature.
1369 *
1370 * @param aSourceAnnotationSet
1371 * is a GATE annotation set prepared to be used on the raw text from
1372 * document content. If aDumpAnnotSet is <b>null<b> then an empty
1373 * string will be returned.
1374 * @param includeFeatures
1375 * is a boolean, which controls whether the annotation features and
1376 * gate ID are included or not.
1377 * @return The XML document obtained from raw text + the information from the
1378 * dump annotation set.
1379 */
1380 private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1381 boolean includeFeatures) {
1382 StringBuffer docContStrBuff;
1383 String origContent;
1384 origContent = (String)features
1385 .get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1386 if(origContent == null) {
1387 origContent = "";
1388 } // if
1389 long originalContentSize = origContent.length();
1390 RepositioningInfo repositioning = (RepositioningInfo)getFeatures().get(
1391 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1392 docContStrBuff = new StringBuffer(origContent);
1393 if(aSourceAnnotationSet == null) return docContStrBuff.toString();
1394 StatusListener sListener = (StatusListener)gate.Gate
1395 .getListeners().get("gate.event.StatusListener");
1396 AnnotationSet originalMarkupsAnnotSet = this
1397 .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1398 // Create a dumping annotation set on the document. It will be used for
1399 // dumping annotations...
1400 AnnotationSet dumpingSet = new AnnotationSetImpl((Document)this);
1401 if(sListener != null)
1402 sListener.statusChanged("Constructing the dumping annotation set.");
1403 // Then take all the annotations from aSourceAnnotationSet and verify if
1404 // they can be inserted safely into the dumpingSet. Where not possible,
1405 // report.
1406 if(aSourceAnnotationSet != null) {
1407 Iterator iter = aSourceAnnotationSet.iterator();
1408 Annotation currentAnnot;
1409 while(iter.hasNext()) {
1410 currentAnnot = (Annotation)iter.next();
1411 if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1412 && insertsSafety(dumpingSet, currentAnnot)) {
1413 dumpingSet.add(currentAnnot);
1414 } else {
1415 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId()
1416 + ", startOffset=" + currentAnnot.getStartNode().getOffset()
1417 + ", endOffset=" + currentAnnot.getEndNode().getOffset()
1418 + ", type=" + currentAnnot.getType()
1419 + " was found to violate the"
1420 + " crossed over condition. It will be discarded");
1421 }// End if
1422 }// End while
1423 }// End if
1424 // The dumpingSet is ready to be exported as XML
1425 // Here we go.
1426 if(sListener != null)
1427 sListener.statusChanged("Dumping annotations as XML");
1428 // /////////////////////////////////////////
1429 // Construct a set of annot with all IDs in asc order.
1430 // All annotations that end at that offset swap their place in descending
1431 // order. For each node write all the tags from left to right.
1432 // Construct the node set
1433 TreeSet offsets = new TreeSet();
1434 Iterator iter = aSourceAnnotationSet.iterator();
1435 while(iter.hasNext()) {
1436 Annotation annot = (Annotation)iter.next();
1437 offsets.add(annot.getStartNode().getOffset());
1438 offsets.add(annot.getEndNode().getOffset());
1439 }// End while
1440 // ofsets is sorted in ascending order.
1441 // Iterate this set in descending order and remove an offset at each
1442 // iteration
1443 while(!offsets.isEmpty()) {
1444 Long offset = (Long)offsets.last();
1445 // Remove the offset from the set
1446 offsets.remove(offset);
1447 // Now, use it.
1448 // Returns a list with annotations that needs to be serialized in that
1449 // offset.
1450 List annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
1451 // Attention: the annotation are serialized from left to right
1452 StringBuffer tmpBuff = new StringBuffer("");
1453 Stack stack = new Stack();
1454 // Iterate through all these annotations and serialize them
1455 Iterator it = annotations.iterator();
1456 Annotation a = null;
1457 while(it.hasNext()) {
1458 a = (Annotation)it.next();
1459 it.remove();
1460 // Test if a Ends at offset
1461 if(offset.equals(a.getEndNode().getOffset())) {
1462 // Test if a Starts at offset
1463 if(offset.equals(a.getStartNode().getOffset())) {
1464 // Here, the annotation a Starts and Ends at the offset
1465 if(null != a.getFeatures().get("isEmptyAndSpan")
1466 && "true".equals((String)a.getFeatures().get(
1467 "isEmptyAndSpan"))) {
1468 // Assert: annotation a with start == end and isEmptyAndSpan
1469 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1470 stack.push(a);
1471 } else {
1472 // Assert annotation a with start == end and an empty tag
1473 tmpBuff.append(writeEmptyTag(a, false));
1474 // The annotation is removed from dumped set
1475 aSourceAnnotationSet.remove(a);
1476 }// End if
1477 } else {
1478 // Here the annotation a Ends at the offset.
1479 // In this case empty the stack and write the end tag
1480 while(!stack.isEmpty()) {
1481 Annotation a1 = (Annotation)stack.pop();
1482 tmpBuff.append(writeEndTag(a1));
1483 }// End while
1484 tmpBuff.append(writeEndTag(a));
1485 }// End if
1486 } else {
1487 // The annotation a does NOT end at the offset. Let's see if it starts
1488 // at the offset
1489 if(offset.equals(a.getStartNode().getOffset())) {
1490 // The annotation a starts at the offset.
1491 // In this case empty the stack and write the end tag
1492 while(!stack.isEmpty()) {
1493 Annotation a1 = (Annotation)stack.pop();
1494 tmpBuff.append(writeEndTag(a1));
1495 }// End while
1496 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1497 // The annotation is removed from dumped set
1498 aSourceAnnotationSet.remove(a);
1499 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1500 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1501 }// End while(it.hasNext()){
1502 // In this case empty the stack and write the end tag
1503 while(!stack.isEmpty()) {
1504 Annotation a1 = (Annotation)stack.pop();
1505 tmpBuff.append(writeEndTag(a1));
1506 }// End while
1507 long originalPosition = -1;
1508 boolean backPositioning = a != null
1509 && offset.equals(a.getEndNode().getOffset());
1510 if(backPositioning) {
1511 // end of the annotation correction
1512 originalPosition = repositioning
1513 .getOriginalPos(offset.intValue(), true);
1514 } // if
1515 if(originalPosition == -1) {
1516 originalPosition = repositioning.getOriginalPos(offset.intValue());
1517 } // if
1518 // Insert tmpBuff to the location where it belongs in docContStrBuff
1519 if(originalPosition != -1 && originalPosition <= originalContentSize) {
1520 docContStrBuff.insert((int)originalPosition, tmpBuff.toString());
1521 } else {
1522 Out.prln("Error in the repositioning. The offset (" + offset.intValue()
1523 + ") could not be positioned in the original document. \n"
1524 + "Calculated position is: " + originalPosition
1525 + " placed back: " + backPositioning);
1526 } // if
1527 }// End while(!offsets.isEmpty())
1528 if(theRootAnnotation != null)
1529 docContStrBuff.append(writeEndTag(theRootAnnotation));
1530 return docContStrBuff.toString();
1531 } // saveAnnotationSetAsXmlInOrig()
1532
1533 /**
1534 * This method returns a list with annotations ordered that way that they can
1535 * be serialized from left to right, at the offset. If one of the params is
1536 * null then an empty list will be returned.
1537 *
1538 * @param aDumpAnnotSet
1539 * is a set containing all annotations that will be dumped.
1540 * @param offset
1541 * represent the offset at witch the annotation must start AND/OR
1542 * end.
1543 * @return a list with those annotations that need to be serialized.
1544 */
1545 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset) {
1546 List annotationList = new LinkedList();
1547 if(aDumpAnnotSet == null || offset == null) return annotationList;
1548 Set annotThatStartAtOffset = new TreeSet(new AnnotationComparator(
1549 ORDER_ON_END_OFFSET, DESC));
1550 Set annotThatEndAtOffset = new TreeSet(new AnnotationComparator(
1551 ORDER_ON_START_OFFSET, DESC));
1552 Set annotThatStartAndEndAtOffset = new TreeSet(new AnnotationComparator(
1553 ORDER_ON_ANNOT_ID, ASC));
1554 // Fill these tree lists with annotation tat start, end or start and
1555 // end at the offset.
1556 Iterator iter = aDumpAnnotSet.iterator();
1557 while(iter.hasNext()) {
1558 Annotation ann = (Annotation)iter.next();
1559 if(offset.equals(ann.getStartNode().getOffset())) {
1560 if(offset.equals(ann.getEndNode().getOffset()))
1561 annotThatStartAndEndAtOffset.add(ann);
1562 else annotThatStartAtOffset.add(ann);
1563 } else {
1564 if(offset.equals(ann.getEndNode().getOffset()))
1565 annotThatEndAtOffset.add(ann);
1566 }// End if
1567 }// End while
1568 annotationList.addAll(annotThatEndAtOffset);
1569 annotThatEndAtOffset = null;
1570 annotationList.addAll(annotThatStartAtOffset);
1571 annotThatStartAtOffset = null;
1572 iter = annotThatStartAndEndAtOffset.iterator();
1573 while(iter.hasNext()) {
1574 Annotation ann = (Annotation)iter.next();
1575 Iterator it = annotationList.iterator();
1576 boolean breaked = false;
1577 while(it.hasNext()) {
1578 Annotation annFromList = (Annotation)it.next();
1579 if(annFromList.getId().intValue() > ann.getId().intValue()) {
1580 annotationList.add(annotationList.indexOf(annFromList), ann);
1581 breaked = true;
1582 break;
1583 }// End if
1584 }// End while
1585 if(!breaked) annotationList.add(ann);
1586 iter.remove();
1587 }// End while
1588 return annotationList;
1589 }// getAnnotationsForOffset()
1590
1591 private List getAnnotationsForOffset(List aDumpAnnotList, Long offset) {
1592 List annotationList = new ArrayList();
1593 if(aDumpAnnotList == null || offset == null) return annotationList;
1594 Set annotThatStartAtOffset;
1595 Set annotThatEndAtOffset;
1596 Set annotThatStartAndEndAtOffset;
1597 annotThatStartAtOffset = new TreeSet(new AnnotationComparator(
1598 ORDER_ON_END_OFFSET, DESC));
1599 annotThatEndAtOffset = new TreeSet(new AnnotationComparator(
1600 ORDER_ON_START_OFFSET, DESC));
1601 annotThatStartAndEndAtOffset = new TreeSet(new AnnotationComparator(
1602 ORDER_ON_ANNOT_ID, ASC));
1603 // Fill these tree lists with annotation tat start, end or start and
1604 // end at the offset.
1605 Iterator iter = aDumpAnnotList.iterator();
1606 while(iter.hasNext()) {
1607 Annotation ann = (Annotation)iter.next();
1608 if(offset.equals(ann.getStartNode().getOffset())) {
1609 if(offset.equals(ann.getEndNode().getOffset()))
1610 annotThatStartAndEndAtOffset.add(ann);
1611 else annotThatStartAtOffset.add(ann);
1612 } else {
1613 if(offset.equals(ann.getEndNode().getOffset()))
1614 annotThatEndAtOffset.add(ann);
1615 }// End if
1616 }// End while
1617 annotationList.addAll(annotThatEndAtOffset);
1618 annotationList.addAll(annotThatStartAtOffset);
1619 annotThatEndAtOffset = null;
1620 annotThatStartAtOffset = null;
1621 iter = annotThatStartAndEndAtOffset.iterator();
1622 while(iter.hasNext()) {
1623 Annotation ann = (Annotation)iter.next();
1624 Iterator it = annotationList.iterator();
1625 boolean breaked = false;
1626 while(it.hasNext()) {
1627 Annotation annFromList = (Annotation)it.next();
1628 if(annFromList.getId().intValue() > ann.getId().intValue()) {
1629 annotationList.add(annotationList.indexOf(annFromList), ann);
1630 breaked = true;
1631 break;
1632 }// End if
1633 }// End while
1634 if(!breaked) annotationList.add(ann);
1635 iter.remove();
1636 }// End while
1637 return annotationList;
1638 }// getAnnotationsForOffset()
1639
1640 private String writeStartTag(Annotation annot, boolean includeFeatures) {
1641 return writeStartTag(annot, includeFeatures, true);
1642 } // writeStartTag
1643
1644 /** Returns a string representing a start tag based on the input annot */
1645 private String writeStartTag(Annotation annot, boolean includeFeatures,
1646 boolean includeNamespace) {
1647
1648 // Get the annot feature used to store the namespace prefix, if it
1649 // has been defined
1650 String nsPrefix = null;
1651
1652 if (serializeNamespaceInfo)
1653 nsPrefix = (String)annot.getFeatures().get(namespacePrefixFeature);
1654
1655 AnnotationSet originalMarkupsAnnotSet = this
1656 .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1657 StringBuffer strBuff = new StringBuffer("");
1658 if(annot == null) return strBuff.toString();
1659 // if (!addGatePreserveFormatTag && isRootTag){
1660 if(theRootAnnotation != null
1661 && annot.getId().equals(theRootAnnotation.getId())) {
1662 // the features are included either if desired or if that's an annotation
1663 // from the original markup of the document. We don't want for example to
1664 // spoil all links in an HTML file!
1665 if(includeFeatures) {
1666 strBuff.append("<");
1667 if (nsPrefix != null && !nsPrefix.isEmpty())
1668 strBuff.append(nsPrefix + ":");
1669 strBuff.append(annot.getType());
1670 strBuff.append(" ");
1671 if(includeNamespace) {
1672 // but don't add the gate ns declaration if it's already there!
1673 if (annot.getFeatures().get("xmlns:gate") == null)
1674 strBuff.append("xmlns:gate=\"http://www.gate.ac.uk\"");
1675 strBuff.append(" gate:");
1676 }
1677 strBuff.append("gateId=\"");
1678 strBuff.append(annot.getId());
1679 strBuff.append("\"");
1680 strBuff.append(" ");
1681 if(includeNamespace) {
1682 strBuff.append("gate:");
1683 }
1684 strBuff.append("annotMaxId=\"");
1685 strBuff.append(nextAnnotationId);
1686 strBuff.append("\"");
1687 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1688 strBuff.append(">");
1689 } else if(originalMarkupsAnnotSet.contains(annot)) {
1690 strBuff.append("<");
1691 if (nsPrefix != null && !nsPrefix.isEmpty())
1692 strBuff.append(nsPrefix + ":");
1693 strBuff.append(annot.getType());
1694 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1695 strBuff.append(">");
1696 } else {
1697 strBuff.append("<");
1698 if (nsPrefix != null && !nsPrefix.isEmpty())
1699 strBuff.append(nsPrefix + ":");
1700 strBuff.append(annot.getType());
1701 strBuff.append(">");
1702 }
1703 } else {
1704 // the features are included either if desired or if that's an annotation
1705 // from the original markup of the document. We don't want for example to
1706 // spoil all links in an HTML file!
1707 if(includeFeatures) {
1708 strBuff.append("<");
1709 if (nsPrefix != null && !nsPrefix.isEmpty())
1710 strBuff.append(nsPrefix + ":");
1711 strBuff.append(annot.getType());
1712 strBuff.append(" ");
1713 if(includeNamespace) {
1714 strBuff.append("gate:");
1715 } // if includeNamespaces
1716 strBuff.append("gateId=\"");
1717 strBuff.append(annot.getId());
1718 strBuff.append("\"");
1719 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1720 strBuff.append(">");
1721 } else if(originalMarkupsAnnotSet.contains(annot)) {
1722 strBuff.append("<");
1723 if (nsPrefix != null && !nsPrefix.isEmpty())
1724 strBuff.append(nsPrefix + ":");
1725 strBuff.append(annot.getType());
1726 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1727 strBuff.append(">");
1728 } else {
1729 strBuff.append("<");
1730 if (nsPrefix != null && !nsPrefix.isEmpty())
1731 strBuff.append(nsPrefix + ":");
1732 strBuff.append(annot.getType());
1733 strBuff.append(">");
1734 }
1735 }// End if
1736 return strBuff.toString();
1737 }// writeStartTag()
1738
1739 /**
1740 * Identifies the root annotations inside an annotation set. The root
1741 * annotation is the one that starts at offset 0, and has the greatest span.
1742 * If there are more than one with this function, then the annotation with the
1743 * smalled ID wil be selected as root. If none is identified it will return
1744 * null.
1745 *
1746 * @param anAnnotationSet
1747 * The annotation set possibly containing the root annotation.
1748 * @return The root annotation or null is it fails
1749 */
1750 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet) {
1751 if(anAnnotationSet == null) return null;
1752 // If the starting node of this annotation is not null, then the annotation
1753 // set will not have a root annotation.
1754 Node startNode = anAnnotationSet.firstNode();
1755 Node endNode = anAnnotationSet.lastNode();
1756 // This is placed here just to speed things up. The alghorithm bellow can
1757 // can identity the annotation that span over the entire set and with the
1758 // smallest ID. However the root annotation will have to have the start
1759 // offset equal to 0.
1760 if(startNode.getOffset().longValue() != 0) return null;
1761 // Go anf find the annotation.
1762 Annotation theRootAnnotation = null;
1763 // Check if there are annotations starting at offset 0. If there are, then
1764 // check all of them to see which one has the greatest span. Basically its
1765 // END offset should be the bigest offset from the input annotation set.
1766 long start = startNode.getOffset().longValue();
1767 long end = endNode.getOffset().longValue();
1768 for(Iterator<Annotation> it = anAnnotationSet.iterator(); it.hasNext();) {
1769 Annotation currentAnnot = it.next();
1770 // If the currentAnnot has both its Start and End equals to the Start and
1771 // end of the AnnotationSet then check to see if its ID is the smallest.
1772 if((start == currentAnnot.getStartNode().getOffset().longValue())
1773 && (end == currentAnnot.getEndNode().getOffset().longValue())) {
1774 // The currentAnnotation has is a potencial root one.
1775 if(theRootAnnotation == null)
1776 theRootAnnotation = currentAnnot;
1777 else {
1778 // If its ID is greater that the currentAnnot then update the root
1779 if(theRootAnnotation.getId().intValue() > currentAnnot.getId()
1780 .intValue()) theRootAnnotation = currentAnnot;
1781 }// End if
1782 }// End if
1783 }// End for
1784 return theRootAnnotation;
1785 }// End identifyTheRootAnnotation()
1786
1787 private Annotation identifyTheRootAnnotation(List anAnnotationList) {
1788 if(anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1789 // If the first annotation in the list (which is sorted by start offset)
1790 // does not have an offset = 0, then there's no root tag.
1791 if(((Annotation)anAnnotationList.get(0)).getStartNode().getOffset()
1792 .longValue() > 0) return null;
1793 // If there's a single annotation and it starts at the start (which we
1794 // already know it does), make sure it ends at the end.
1795 if(anAnnotationList.size() == 1) {
1796 Annotation onlyAnn = (Annotation)anAnnotationList.get(0);
1797 if(onlyAnn.getEndNode().getOffset().equals(content.size()))
1798 return onlyAnn;
1799 return null;
1800 }
1801 // find the limits
1802 long start = 0; // we know this already
1803 long end = 0; // end = 0 will be improved by the next loop
1804 for(int i = 0; i < anAnnotationList.size(); i++) {
1805 Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1806 long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1807 if(localEnd > end) end = localEnd;
1808 }
1809 // Go and find the annotation.
1810 // look at all annotations that start at 0 and end at end
1811 // if there are several, choose the one with the smallest ID
1812 Annotation theRootAnnotation = null;
1813 for(int i = 0; i < anAnnotationList.size(); i++) {
1814 Annotation currentAnnot = (Annotation)anAnnotationList.get(i);
1815 long localStart = currentAnnot.getStartNode().getOffset().longValue();
1816 long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1817 // If the currentAnnot has both its Start and End equals to the Start and
1818 // end of the AnnotationSet then check to see if its ID is the smallest.
1819 if((start == localStart) && (end == localEnd)) {
1820 // The currentAnnotation has is a potential root one.
1821 if(theRootAnnotation == null)
1822 theRootAnnotation = currentAnnot;
1823 else {
1824 // If root's ID is greater that the currentAnnot then update the root
1825 if(theRootAnnotation.getId().intValue() > currentAnnot.getId()
1826 .intValue()) theRootAnnotation = currentAnnot;
1827 }// End if
1828 }// End if
1829 }// End for
1830 return theRootAnnotation;
1831 }// End identifyTheRootAnnotation()
1832
1833 /**
1834 * This method takes aScanString and searches for those chars from entitiesMap
1835 * that appear in the string. A tree map(offset2Char) is filled using as key
1836 * the offsets where those Chars appear and the Char. If one of the params is
1837 * null the method simply returns.
1838 */
1839 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill) {
1840 if(aScanString == null || aMapToFill == null) return;
1841 if(DocumentXmlUtils.entitiesMap == null || DocumentXmlUtils.entitiesMap.isEmpty()) {
1842 Err.prln("WARNING: Entities map was not initialised !");
1843 return;
1844 }// End if
1845 // Fill the Map with the offsets of the special chars
1846 Iterator entitiesMapIterator = DocumentXmlUtils.entitiesMap.keySet().iterator();
1847 Character c;
1848 int fromIndex;
1849 while(entitiesMapIterator.hasNext()) {
1850 c = (Character)entitiesMapIterator.next();
1851 fromIndex = 0;
1852 while(-1 != fromIndex) {
1853 fromIndex = aScanString.indexOf(c.charValue(), fromIndex);
1854 if(-1 != fromIndex) {
1855 aMapToFill.put(new Long(fromIndex), c);
1856 fromIndex++;
1857 }// End if
1858 }// End while
1859 }// End while
1860 }// buildEntityMapFromString();
1861
1862 private String writeEmptyTag(Annotation annot) {
1863 return writeEmptyTag(annot, true);
1864 } // writeEmptyTag
1865
1866
1867 /** Returns a string representing an empty tag based on the input annot */
1868 private String writeEmptyTag(Annotation annot, boolean includeNamespace) {
1869 // Get the annot feature used to store the namespace prefix, if it
1870 // has been defined
1871 String nsPrefix = null;
1872 if (serializeNamespaceInfo)
1873 nsPrefix = (String)annot.getFeatures().get(namespacePrefixFeature);
1874
1875 StringBuffer strBuff = new StringBuffer("");
1876 if(annot == null) return strBuff.toString();
1877 strBuff.append("<");
1878 if (nsPrefix != null && !nsPrefix.isEmpty())
1879 strBuff.append(nsPrefix + ":");
1880 strBuff.append(annot.getType());
1881 AnnotationSet originalMarkupsAnnotSet = this
1882 .getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1883 if(!originalMarkupsAnnotSet.contains(annot)) {
1884 strBuff.append(" gateId=\"");
1885 strBuff.append(annot.getId());
1886 strBuff.append("\"");
1887 }
1888 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1889 strBuff.append("/>");
1890 return strBuff.toString();
1891 }// writeEmptyTag()
1892
1893 /** Returns a string representing an end tag based on the input annot */
1894 private String writeEndTag(Annotation annot) {
1895 // Get the annot feature used to store the namespace prefix, if it
1896 // has been defined
1897 String nsPrefix = null;
1898 if (serializeNamespaceInfo)
1899 nsPrefix = (String)annot.getFeatures().get(namespacePrefixFeature);
1900
1901 StringBuffer strBuff = new StringBuffer("");
1902 if(annot == null) return strBuff.toString();
1903 /*
1904 * if (annot.getType().indexOf(" ") != -1) Out.prln("Warning: Truncating end
1905 * tag to first word for annot type \"" +annot.getType()+ "\". ");
1906 */
1907 strBuff.append("</");
1908 if (nsPrefix != null && !nsPrefix.isEmpty())
1909 strBuff.append(nsPrefix + ":");
1910 strBuff.append(annot.getType() + ">");
1911 return strBuff.toString();
1912 }// writeEndTag()
1913
1914 /** Returns a string representing a FeatureMap serialized as XML attributes */
1915 private String writeFeatures(FeatureMap feat, boolean includeNamespace) {
1916 StringBuffer strBuff = new StringBuffer("");
1917 if(feat == null) return strBuff.toString();
1918 Iterator it = feat.keySet().iterator();
1919 while(it.hasNext()) {
1920 Object key = it.next();
1921 Object value = feat.get(key);
1922 if((key != null) && (value != null)) {
1923 /**
1924 * Eliminate namespace prefix feature and rename namespace uri feature
1925 * to xmlns:prefix=uri
1926 * if these have been specified in the markup and in the config
1927 */
1928 if (serializeNamespaceInfo) {
1929 String nsPrefix = "xmlns:" + (String)feat.get(namespacePrefixFeature);
1930
1931 if (nsPrefix.equals(key.toString())) continue;
1932 if (namespacePrefixFeature.equals(key.toString())) continue;
1933
1934 if (namespaceURIFeature.equals(key.toString())) {
1935 strBuff.append(" ");
1936 strBuff.append(nsPrefix + "=\"" + value.toString() + "\"");
1937 return strBuff.toString();
1938 }
1939 }
1940 // Eliminate a feature inserted at reading time and which help to
1941 // take some decissions at saving time
1942 if("isEmptyAndSpan".equals(key.toString())) continue;
1943 if(!(String.class.isAssignableFrom(key.getClass()) || Number.class
1944 .isAssignableFrom(key.getClass()))) {
1945 Out.prln("Warning:Found a feature NAME(" + key
1946 + ") that doesn't came"
1947 + " from String or Number.(feature discarded)");
1948 continue;
1949 }// End if
1950 if(!(String.class.isAssignableFrom(value.getClass())
1951 || Number.class.isAssignableFrom(value.getClass()) || java.util.Collection.class
1952 .isAssignableFrom(value.getClass()))) {
1953 Out.prln("Warning:Found a feature VALUE(" + value
1954 + ") that doesn't came"
1955 + " from String, Number or Collection.(feature discarded)");
1956 continue;
1957 }// End if
1958 if("matches".equals(key)) {
1959 strBuff.append(" ");
1960 if(includeNamespace) {
1961 strBuff.append("gate:");
1962 }
1963 // strBuff.append(key);
1964 // replace non XML chars in attribute name
1965 strBuff.append(DocumentXmlUtils.combinedNormalisation(key
1966 .toString()));
1967 strBuff.append("=\"");
1968 } else {
1969 strBuff.append(" ");
1970 // strBuff.append(key);
1971 // replace non XML chars in attribute name
1972 strBuff.append(DocumentXmlUtils.combinedNormalisation(key
1973 .toString()));
1974 strBuff.append("=\"");
1975 }
1976 if(java.util.Collection.class.isAssignableFrom(value.getClass())) {
1977 Iterator valueIter = ((Collection)value).iterator();
1978 while(valueIter.hasNext()) {
1979 Object item = valueIter.next();
1980 if(!(String.class.isAssignableFrom(item.getClass()) || Number.class
1981 .isAssignableFrom(item.getClass()))) continue;
1982 // strBuff.append(item);
1983 // replace non XML chars in collection item
1984 strBuff.append(DocumentXmlUtils.combinedNormalisation(item
1985 .toString()));
1986 strBuff.append(";");
1987 }// End while
1988 if(strBuff.charAt(strBuff.length() - 1) == ';')
1989 strBuff.deleteCharAt(strBuff.length() - 1);
1990 } else {
1991 // strBuff.append(value);
1992 // replace non XML chars in attribute value
1993 strBuff.append(DocumentXmlUtils.combinedNormalisation(value
1994 .toString()));
1995 }// End if
1996 strBuff.append("\"");
1997 }// End if
1998 }// End while
1999 return strBuff.toString();
2000 }// writeFeatures()
2001
2002 /**
2003 * Returns a GateXml document that is a custom XML format for wich there is a
2004 * reader inside GATE called gate.xml.GateFormatXmlHandler. What it does is to
2005 * serialize a GATE document in an XML format.
2006 *
2007 * Implementation note: this method simply delegates to the static {@link
2008 * DocumentStaxUtils#toXml(gate.Document)} method
2009 *
2010 * @return a string representing a Gate Xml document.
2011 */
2012 public String toXml() {
2013 return DocumentStaxUtils.toXml(this);
2014 //return DocumentXmlUtils.toXml(this);
2015 }// toXml
2016
2017 /**
2018 * This method saves an AnnotationSet as XML.
2019 * May be removed once GLEAM doc-service is updated.
2020 *
2021 * @param anAnnotationSet
2022 * The annotation set that has to be saved as XML.
2023 * @return a String like this: <AnnotationSet> <Annotation>....
2024 * </AnnotationSet>
2025 */
2026 private void annotationSetToXml(AnnotationSet anAnnotationSet,
2027 StringBuffer buffer) {
2028 DocumentXmlUtils.annotationSetToXml(anAnnotationSet, buffer);
2029 }// annotationSetToXml
2030
2031 /**
2032 * Returns a map with the named annotation sets. It returns <code>null</code>
2033 * if no named annotaton set exists.
2034 */
2035 public Map<String, AnnotationSet> getNamedAnnotationSets() {
2036 return namedAnnotSets;
2037 } // getNamedAnnotationSets
2038
2039 public Set<String> getAnnotationSetNames() {
2040 return (namedAnnotSets == null) ? null : namedAnnotSets.keySet();
2041 }
2042
2043 /**
2044 * Removes one of the named annotation sets. Note that the default annotation
2045 * set cannot be removed.
2046 *
2047 * @param name
2048 * the name of the annotation set to be removed
2049 */
2050 public void removeAnnotationSet(String name) {
2051 if(namedAnnotSets != null) {
2052 AnnotationSet removed = namedAnnotSets.remove(name);
2053 if(removed != null) {
2054 fireAnnotationSetRemoved(new DocumentEvent(this,
2055 DocumentEvent.ANNOTATION_SET_REMOVED, name));
2056 }
2057 }
2058 }
2059
2060 /** Propagate edit changes to the document content and annotations. */
2061 public void edit(Long start, Long end, DocumentContent replacement)
2062 throws InvalidOffsetException {
2063 if(!isValidOffsetRange(start, end)) throw new InvalidOffsetException("Offsets: "+start+"/"+end);
2064 if(content != null)
2065 ((DocumentContentImpl)content).edit(start, end, replacement);
2066 if(defaultAnnots != null)
2067 ((AnnotationSetImpl)defaultAnnots).edit(start, end, replacement);
2068 if(namedAnnotSets != null) {
2069 Iterator iter = namedAnnotSets.values().iterator();
2070 while(iter.hasNext())
2071 ((AnnotationSetImpl)iter.next()).edit(start, end, replacement);
2072 }
2073 // let the listeners know
2074 fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2075 start, end));
2076 } // edit(start,end,replacement)
2077
2078 /**
2079 * Check that an offset is valid, i.e. it is non-null, greater than or equal
2080 * to 0 and less than the size of the document content.
2081 */
2082 public boolean isValidOffset(Long offset) {
2083 if(offset == null) return false;
2084 long o = offset.longValue();
2085 if(o > getContent().size().longValue() || o < 0) return false;
2086 return true;
2087 } // isValidOffset
2088
2089 /**
2090 * Check that both start and end are valid offsets and that they constitute a
2091 * valid offset range, i.e. start is greater than or equal to long.
2092 */
2093 public boolean isValidOffsetRange(Long start, Long end) {
2094 return isValidOffset(start) && isValidOffset(end)
2095 && start.longValue() <= end.longValue();
2096 } // isValidOffsetRange(start,end)
2097
2098 /** Sets the nextAnnotationId */
2099 public void setNextAnnotationId(int aNextAnnotationId) {
2100 nextAnnotationId = aNextAnnotationId;
2101 }// setNextAnnotationId();
2102
2103 /** Generate and return the next annotation ID */
2104 public Integer getNextAnnotationId() {
2105 return new Integer(nextAnnotationId++);
2106 } // getNextAnnotationId
2107
2108 /** Generate and return the next node ID */
2109 public Integer getNextNodeId() {
2110 return new Integer(nextNodeId++);
2111 }
2112
2113 /** Ordering based on URL.toString() and the URL offsets (if any) */
2114 public int compareTo(Object o) throws ClassCastException {
2115 DocumentImpl other = (DocumentImpl)o;
2116 return getOrderingString().compareTo(other.getOrderingString());
2117 } // compareTo
2118
2119 /**
2120 * Utility method to produce a string for comparison in ordering. String is
2121 * based on the source URL and offsets.
2122 */
2123 protected String getOrderingString() {
2124 if(sourceUrl == null) return toString();
2125 StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2126 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2127 orderingString.append(sourceUrlStartOffset.toString());
2128 orderingString.append(sourceUrlEndOffset.toString());
2129 }
2130 return orderingString.toString();
2131 } // getOrderingString()
2132
/**
 * The id of the next new annotation. Handed out and then incremented by
 * getNextAnnotationId(); overwritten by setNextAnnotationId(int).
 */
protected int nextAnnotationId = 0;

/**
 * The id of the next new node. Handed out and then incremented by
 * getNextNodeId().
 */
protected int nextNodeId = 0;

/** The source URL; may be null, see getOrderingString(). */
protected URL sourceUrl;

/** The document's MIME type. Only relevant if the document is markup aware,
 * and if omitted, DocumentFormat will attempt to determine the format to use
 * heuristically.
 */
protected String mimeType;

/** The content of the document */
protected DocumentContent content;

/** The encoding of the source of the document content */
protected String encoding = null;
2154
// Data needed in toXml(AnnotationSet) methods
/**
 * This field indicates whether or not to add the tag called
 * GatePreserveFormat to the document. HTML, XML, SGML docs won't have this
 * tag added. (The field itself is commented out.)
 */
// private boolean addGatePreserveFormatTag = false;
/**
 * Used by the XML dump preserving format method. The start-tag writer
 * compares annotation IDs against this annotation to recognise the root
 * tag.
 */
private Annotation theRootAnnotation = null;

/**
 * This field is used when creating StringBuffers for saveAnnotationSetAsXML()
 * methods. The size of the StringBuffer will be docContent.size() multiplied
 * by this value. It is aimed to improve the performance of StringBuffer
 */
private static final int DOC_SIZE_MULTIPLICATION_FACTOR_AS = 3;

/**
 * Constant used in the inner class AnnotationComparator to order annotations
 * on their start offset
 */
private final int ORDER_ON_START_OFFSET = 0;

/**
 * Constant used in the inner class AnnotationComparator to order annotations
 * on their end offset
 */
private final int ORDER_ON_END_OFFSET = 1;

/**
 * Constant used in the inner class AnnotationComparator to order annotations
 * on their ID
 */
private final int ORDER_ON_ANNOT_ID = 2;

/**
 * Constant used in the inner class AnnotationComparator to order annotations
 * ascending
 */
private final int ASC = 3;

/**
 * Constant used in the inner class AnnotationComparator to order annotations
 * descending. The comparator treats every order type other than ASC as
 * descending.
 */
private final int DESC = -3;
2203
/**
 * The range that the content comes from at the source URL (or null if none).
 * (Commented out; the range is held in the two separate offset fields
 * below.)
 */
// protected Long[] sourceUrlOffsets;
/**
 * The start of the range that the content comes from at the source URL (or
 * null if none).
 */
protected Long sourceUrlStartOffset;

/**
 * The end of the range that the content comes from at the source URL (or null
 * if none).
 */
protected Long sourceUrlEndOffset;

/** The default annotation set; may be null, see edit(). */
protected AnnotationSet defaultAnnots;

/** Named sets of annotations, keyed by set name; may be null. */
protected Map<String, AnnotationSet> namedAnnotSets;

/**
 * A property of the document that will be set when the user wants to create
 * the document from a string, as opposed to from a URL.
 */
private String stringContent;
2231
2232 /**
2233 * The stringContent of a document is a property of the document that will be
2234 * set when the user wants to create the document from a string, as opposed to
2235 * from a URL. <B>Use the <TT>getContent</TT> method instead to get the
2236 * actual document content.</B>
2237 */
2238 public String getStringContent() {
2239 return stringContent;
2240 }
2241
2242 /**
2243 * The stringContent of a document is a property of the document that will be
2244 * set when the user wants to create the document from a string, as opposed to
2245 * from a URL. <B>Use the <TT>setContent</TT> method instead to update the
2246 * actual document content.</B>
2247 */
2248 @CreoleParameter(disjunction = "source", priority = 2,
2249 comment = "The content of the document")
2250 public void setStringContent(String stringContent) {
2251 this.stringContent = stringContent;
2252 } // set StringContent
2253
2254 /** Is the document markup-aware? */
2255 protected Boolean markupAware = new Boolean(false);
2256
2257 // /** Hash code */
2258 // public int hashCode() {
2259 // int code = getContent().hashCode();
2260 // int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2261 // code += memberCode;
2262 // memberCode = (encoding == null) ? 0 : encoding.hashCode();
2263 // code += memberCode;
2264 // memberCode = (features == null) ? 0 : features.hashCode();
2265 // code += memberCode;
2266 // code += (markupAware.booleanValue()) ? 0 : 1;
2267 // memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2268 // code += memberCode;
2269 // code += nextAnnotationId;
2270 // code += nextNodeId;
2271 // memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2272 // code += memberCode;
2273 // memberCode =
2274 // (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2275 // code += memberCode;
2276 // memberCode =
2277 // (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2278 // code += memberCode;
2279 // return code;
2280 // } // hashcode
2281 /** String respresentation */
2282 public String toString() {
2283 String n = Strings.getNl();
2284 StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2285 s.append(" content:" + content + n);
2286 s.append(" defaultAnnots:" + defaultAnnots + n);
2287 s.append(" encoding:" + encoding + n);
2288 s.append(" features:" + features + n);
2289 s.append(" markupAware:" + markupAware + n);
2290 s.append(" namedAnnotSets:" + namedAnnotSets + n);
2291 s.append(" nextAnnotationId:" + nextAnnotationId + n);
2292 s.append(" nextNodeId:" + nextNodeId + n);
2293 s.append(" sourceUrl:" + sourceUrl + n);
2294 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2295 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2296 s.append(n);
2297 return s.toString();
2298 } // toString
2299
/**
 * Freeze the serialization UID, so that it is fixed explicitly rather than
 * derived from the class.
 */
static final long serialVersionUID = -8456893608311510260L;
2302
2303 /** Inner class needed to compare annotations */
2304 class AnnotationComparator implements java.util.Comparator {
2305 int orderOn = -1;
2306
2307 int orderType = ASC;
2308
2309 /**
2310 * Constructs a comparator according to one of three sorter types:
2311 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2312 */
2313 public AnnotationComparator(int anOrderOn, int anOrderType) {
2314 orderOn = anOrderOn;
2315 orderType = anOrderType;
2316 }// AnnotationComparator()
2317
2318 /** This method must be implemented according to Comparator interface */
2319 public int compare(Object o1, Object o2) {
2320 Annotation a1 = (Annotation)o1;
2321 Annotation a2 = (Annotation)o2;
2322 // ORDER_ON_START_OFFSET ?
2323 if(orderOn == ORDER_ON_START_OFFSET) {
2324 int result = a1.getStartNode().getOffset().compareTo(
2325 a2.getStartNode().getOffset());
2326 if(orderType == ASC) {
2327 // ASC
2328 // If they are equal then their ID will decide.
2329 if(result == 0) return a1.getId().compareTo(a2.getId());
2330 return result;
2331 } else {
2332 // DESC
2333 if(result == 0) return -(a1.getId().compareTo(a2.getId()));
2334 return -result;
2335 }// End if (orderType == ASC)
2336 }// End if (orderOn == ORDER_ON_START_OFFSET)
2337 // ORDER_ON_END_OFFSET ?
2338 if(orderOn == ORDER_ON_END_OFFSET) {
2339 int result = a1.getEndNode().getOffset().compareTo(
2340 a2.getEndNode().getOffset());
2341 if(orderType == ASC) {
2342 // ASC
2343 // If they are equal then their ID will decide.
2344 if(result == 0) return -(a1.getId().compareTo(a2.getId()));
2345 return result;
2346 } else {
2347 // DESC
2348 // If they are equal then their ID will decide.
2349 if(result == 0) return a1.getId().compareTo(a2.getId());
2350 return -result;
2351 }// End if (orderType == ASC)
2352 }// End if (orderOn == ORDER_ON_END_OFFSET)
2353 // ORDER_ON_ANNOT_ID ?
2354 if(orderOn == ORDER_ON_ANNOT_ID) {
2355 if(orderType == ASC)
2356 return a1.getId().compareTo(a2.getId());
2357 else return -(a1.getId().compareTo(a2.getId()));
2358 }// End if
2359 return 0;
2360 }// compare()
2361 } // End inner class AnnotationComparator
2362
2363 private transient Vector documentListeners;
2364
2365 private transient Vector gateListeners;
2366
2367 public synchronized void removeDocumentListener(DocumentListener l) {
2368 if(documentListeners != null && documentListeners.contains(l)) {
2369 Vector v = (Vector)documentListeners.clone();
2370 v.removeElement(l);
2371 documentListeners = v;
2372 }
2373 }
2374
2375 public synchronized void addDocumentListener(DocumentListener l) {
2376 Vector v = documentListeners == null
2377 ? new Vector(2)
2378 : (Vector)documentListeners.clone();
2379 if(!v.contains(l)) {
2380 v.addElement(l);
2381 documentListeners = v;
2382 }
2383 }
2384
2385 protected void fireAnnotationSetAdded(DocumentEvent e) {
2386 if(documentListeners != null) {
2387 Vector listeners = documentListeners;
2388 int count = listeners.size();
2389 for(int i = 0; i < count; i++) {
2390 ((DocumentListener)listeners.elementAt(i)).annotationSetAdded(e);
2391 }
2392 }
2393 }
2394
2395 protected void fireAnnotationSetRemoved(DocumentEvent e) {
2396 if(documentListeners != null) {
2397 Vector listeners = documentListeners;
2398 int count = listeners.size();
2399 for(int i = 0; i < count; i++) {
2400 ((DocumentListener)listeners.elementAt(i)).annotationSetRemoved(e);
2401 }
2402 }
2403 }
2404
2405 protected void fireContentEdited(DocumentEvent e) {
2406 if(documentListeners != null) {
2407 Vector listeners = documentListeners;
2408 int count = listeners.size();
2409 for(int i = 0; i < count; i++) {
2410 ((DocumentListener)listeners.elementAt(i)).contentEdited(e);
2411 }
2412 }
2413 }
2414
2415 public void resourceLoaded(CreoleEvent e) {
2416 }
2417
2418 public void resourceUnloaded(CreoleEvent e) {
2419 }
2420
2421 public void datastoreOpened(CreoleEvent e) {
2422 }
2423
2424 public void datastoreCreated(CreoleEvent e) {
2425 }
2426
2427 public void resourceRenamed(Resource resource, String oldName, String newName) {
2428 }
2429
2430 public void datastoreClosed(CreoleEvent e) {
2431 if(!e.getDatastore().equals(this.getDataStore())) return;
2432 // close this lr, since it cannot stay open when the DS it comes from
2433 // is closed
2434 Factory.deleteResource(this);
2435 }
2436
2437 public void setLRPersistenceId(Object lrID) {
2438 super.setLRPersistenceId(lrID);
2439 // make persistent documents listen to the creole register
2440 // for events about their DS
2441 Gate.getCreoleRegister().addCreoleListener(this);
2442 }
2443
2444 public void resourceAdopted(DatastoreEvent evt) {
2445 }
2446
2447 public void resourceDeleted(DatastoreEvent evt) {
2448 if(!evt.getSource().equals(this.getDataStore())) return;
2449 // if an open document is deleted from a DS, then
2450 // it must close itself immediately, as is no longer valid
2451 if(evt.getResourceID().equals(this.getLRPersistenceId()))
2452 Factory.deleteResource(this);
2453 }
2454
2455 public void resourceWritten(DatastoreEvent evt) {
2456 }
2457
2458 public void setDataStore(DataStore dataStore)
2459 throws gate.persist.PersistenceException {
2460 super.setDataStore(dataStore);
2461 if(this.dataStore != null) this.dataStore.addDatastoreListener(this);
2462 }
2463
2464 /**
2465 * This method added by Shafirin Andrey, to allow access to protected member
2466 * {@link #defaultAnnots} Required for JAPE-Debugger.
2467 */
2468 public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2469 defaultAnnots = defaultAnnotations;
2470 }
2471 } // class DocumentImpl
|