001 package gate.creole.orthomatcher;
002
003 import gate.Annotation;
004 import gate.AnnotationSet;
005 import gate.creole.ExecutionException;
006 import gate.util.InvalidOffsetException;
007
008 import java.util.Arrays;
009 import java.util.HashMap;
010 import java.util.HashSet;
011 import java.util.Iterator;
012 import java.util.Map;
013
014 import org.apache.log4j.Logger;
015
016 public class OrthoMatcherHelper {
017
018 protected static final Logger log = Logger.getLogger(OrthoMatcherHelper.class);
019
020 public static boolean straightCompare(String s1,
021 String s2,
022 boolean matchCase) {
023
024 boolean matched = false;
025 if (!matchCase)
026 matched = s1.equalsIgnoreCase(s2);
027 else matched = s1.equals(s2) ;
028 // kalina: do not remove, nice for debug
029 // if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
030 // Out.prln("Rule1: Matched " + s1 + "and " + s2);
031 return matched;
032 }
033
034 /**
035 * Returns true if only one of s1 and s2 is a single character and the two strings match on that
036 * initial
037 *
038 * @param s1
039 * @param s2
040 * @return
041 */
042 public static boolean initialMatch(String s1, String s2) {
043 return (((s1.length() == 1) ^ (s2.length() == 1) ) && (s1.charAt(0) == s2.charAt(0)));
044 }
045
046 /**
047 * Gets the content of an annotation
048 */
049 public static String getStringForSpan(Long start, Long end,gate.Document d) throws ExecutionException {
050 try {
051 return d.getContent().getContent(start, end).toString();
052 }
053 catch (InvalidOffsetException e) {
054 //log.error("Weird offset exception in getStringForSpan", e);
055 throw new ExecutionException(e);
056 }
057 }
058
059 public static boolean executeDisjunction(Map<Integer,OrthoMatcherRule> allrules, int[] executeRules,String longName,String shortName, boolean mr[]) {
060
061 boolean result=false;
062
063 for (int i = 0; i < executeRules.length; i = i + 1) {
064
065 boolean current=allrules.get(executeRules[i]).value(longName, shortName);
066 mr[executeRules[i]]=current;
067 result=result || current;
068 }
069
070 return result;
071 }
072
073 public static Double round2Places(Double input) {
074 return Math.round(input*100.0)/100.0;
075 }
076
077 /**
078 * It is used for test purposes.
079 * This table shows which rules have fired over a corpus.
080 */
081 public static boolean[] rulesUsedTable=null;
082
083 /**
084 * It is used for test purposes.
085 * It sets that a specific rule has returned 'true'.
086 */
087 public static void usedRule(int rule) {
088
089 if (rulesUsedTable==null) {
090 rulesUsedTable = new boolean[18];
091 for(int i=0;i<rulesUsedTable.length;i++) rulesUsedTable[i]=false;
092 }
093
094 rulesUsedTable[rule]=true;
095
096 }
097
098 /**
099 * It is used for test purposes.
100 * It saves which rules have fired(have returned 'true') while processing a corpus
101 * Must be enabled - uncommented
102 */
103 public static void saveUsedTable() {
104
105 //Iterator<Map.Entry<Integer, Boolean>> iter = rulesUsedTable.entrySet().iterator();
106 if (rulesUsedTable!=null) {
107 log.debug("Saving table of used orthomatcher rules:");
108
109 String table="";
110
111 for(int i=0;i<rulesUsedTable.length;i++) table+="Rule: "+i+" fired: "+rulesUsedTable[i]+"\r\n";
112
113 log.debug(table);
114 log.debug("End of table of used Orthomatcher rules");
115 }
116 else log.debug("Could not save the table of used orthomatcher rules. This also results when no Orthomatcher rule has returned 'true'.");
117 }
118
119 /*
120 * Converts a string array to an integer one.
121 */
122 public static int[] convertArrayToInteger(String[] input) {
123
124 int[] result=new int[input.length];
125
126 for(int i=0;i<input.length;i++) {
127 result[i] = Integer.parseInt(input[i].trim());
128 }
129
130 return result;
131 }
132 /*
133 * It sorts a list of pairs by the first number which is the start point of an annotation.
134 * It encodes the pair in a single number, sorts by this number and then decodes to the original pair.
135 */
136 public static String SortByStartPosition(String input) {
137
138 int ceil=100000;//a and b both must be less than ceil
139
140 String[] pairs = input.trim().split(",");
141
142 int[] temp=new int[pairs.length];
143
144 if (pairs.length>1) {
145
146 int i=0;
147 //encode in temp
148 for(String pair: pairs){
149
150 String[] s = pair.split(":");
151 int x=Integer.parseInt(s[0].trim())* ceil + Integer.parseInt(s[1].trim());
152 temp[i]=x;
153 i++;
154 }
155
156 Arrays.sort(temp);
157
158 //decode from temp
159 String result="";
160 for(int n: temp) {
161 int a = n / ceil;
162 int b = n % ceil;
163 result=result+a+":"+b+", ";
164 }
165
166 return result;
167 }
168 else return input;//we do not need to sort a single pair
169 }
170
171 /*
172 * The feature "matches" contains annotation IDs.
173 * This method adds a new feature called "matches_positions" that tells the exact position of each match annotation from "matches".
174 * "matches" contains annotations IDs which are in general different and can not be used for comparison in tools like the Corpus Quality Assurance tool
175 * "matches_positions" can be used to check if the matches really match in for example the Corpus Quality Assurance tool
176 */
177 protected static void setMatchesPositions(AnnotationSet nameAllAnnots) {
178
179 //get all annotations that have a matches feature
180 HashSet fNames = new HashSet();
181 fNames.add(gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME);
182 AnnotationSet allMatchesAnnots =
183 nameAllAnnots.get(null, fNames);
184
185 if (allMatchesAnnots == null || allMatchesAnnots.isEmpty())
186 return;
187
188 for (Annotation currentMatchAnnot : allMatchesAnnots) {
189
190 String matchValue=currentMatchAnnot.getFeatures().get(gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME).toString();
191
192 matchValue = matchValue.substring(1);
193 matchValue = matchValue.substring(0,matchValue.length()-1);
194
195 String[] annotationsIDs = matchValue.split(",");
196
197 String matchPositionsValue="";//with the annotations positions
198 String sentinel = ", ";
199
200 int[] integerIDs = OrthoMatcherHelper.convertArrayToInteger(annotationsIDs);
201 for (int i=0; i<integerIDs.length ; i++) {
202
203 int id=integerIDs[i];
204 Annotation ann=null;
205
206 Iterator<Annotation> iter = nameAllAnnots.iterator();
207
208 //find the current annotation with ID from the match list - in order to get its start and end point
209 if (currentMatchAnnot.getId()==id)
210 ann=currentMatchAnnot; else {
211 while (iter.hasNext()) {
212 Annotation a=iter.next();
213 if (a.getId()==id)
214 {
215 ann = a;
216 break;
217 }
218 }
219 }
220
221 //do the actual job of retrieving the start and end points
222 if (ann!=null) {
223 matchPositionsValue = matchPositionsValue + ann.getStartNode().getOffset()+":"+ann.getEndNode().getOffset()+sentinel;
224 }
225
226 }//end going through the match ids
227
228 //sort so that every time we have the "match_positions" generated the same way so that we can compare it
229 matchPositionsValue = OrthoMatcherHelper.SortByStartPosition(matchPositionsValue);
230
231 //formating
232 if (matchPositionsValue.endsWith(sentinel)) {
233 matchPositionsValue = matchPositionsValue.substring(0,matchPositionsValue.length()-sentinel.length());
234 }
235 matchPositionsValue = "["+matchPositionsValue+"]";
236 //finally insert the annotation
237 currentMatchAnnot.getFeatures().put("matches_positions", matchPositionsValue);
238
239 //}
240 } //while - going through all the matches annotations(that have a feature "match") and adding the new feature
241 }//matchesPositions
242
243
244 }
|