001 /*
002 * EntityDescriptor.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Valentin Tablan, July/2000
013 *
014 * $Id: EntityDescriptor.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.creole.nerc;
018
019 import java.io.Serializable;
020
021 import gate.Annotation;
022 import gate.Document;
023 import gate.util.InvalidOffsetException;
024
025 /** Represents a single named entity */
026 public class EntityDescriptor implements Serializable{
027
028 /** Constructs a new entity descriptor */
029 public EntityDescriptor(String string, String category, int start, int end) {
030 this.string = normaliseString(string);
031 this.category = category;
032 offsets = new int[2];
033 offsets[0] = start;
034 offsets[1] = end;
035 }
036
037 /** Constructs a new entity descriptor starting from a Gate annotation */
038 public EntityDescriptor(Document document, Annotation annotation) {
039 offsets = new int[2];
040 offsets[0] = annotation.getStartNode().getOffset().intValue();
041 offsets[1] = annotation.getEndNode().getOffset().intValue();
042 try{
043 string = normaliseString(document.getContent().getContent(
044 annotation.getStartNode().getOffset(),
045 annotation.getEndNode().getOffset()).
046 toString());
047 } catch(InvalidOffsetException ioe){
048 ioe.printStackTrace();
049 }
050 category = annotation.getType();
051 }
052
053 /** Returns a normalised string for the entity. This is the string from the
054 * text document the entity was descovered in, with all whitespace sequences
055 * replaced by a single space character
056 */
057 public String getString(){
058 return string;
059 }
060
061 /** Returns the category of the entity*/
062 public String getCategory(){
063 return category;
064 }
065
066 /** Returns a pair of integers specifying the character offsets in the
067 * original file where the entity occured
068 */
069 public int[] getOffsets(){
070 return offsets;
071 }
072
073 /** Returns a string giving the category, offsets and normalised string for
074 * the entity, with no newlines.
075 */
076 public String toString(){
077 return category + " " + offsets[0] + " " + offsets[1] + " " + string;
078 }
079
080 String string;
081 String category;
082 int[] offsets;
083
084 /** Normalises a string. That is removes all the leading and trailing
085 * whitespace characters and replaces all inner whitespace sequences with a
086 * single space character
087 */
088 protected String normaliseString(String text){
089 /// String res = "";
090 StringBuffer res = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
091 if(text == null) return null;
092 int charIdx = 0;
093 boolean lastWasSpace = false;
094 //skip the leading spaces
095 while(charIdx < text.length() &&
096 Character.isWhitespace(text.charAt(charIdx))) charIdx++;
097 //parse the rest of the text
098 while(charIdx < text.length()){
099 if(Character.isWhitespace(text.charAt(charIdx))){
100 //reading spaces
101 lastWasSpace = true;
102 }else{
103 //reading non-spaces
104 if(lastWasSpace) ///res += " ";
105 res.append(" ");
106 /// res += text.charAt(charIdx);
107 res.append(text.charAt(charIdx));
108 lastWasSpace = false;
109 }
110 charIdx++;
111 }//while(charIdx < text.length())
112 return res.toString();
113 }
114
115 }
|