001 package gate.creole.annic.lucene;
002
003 import java.io.*;
004 import java.util.*;
005 import gate.creole.annic.Constants;
006 import gate.creole.annic.Hit;
007 import gate.creole.annic.Pattern;
008 import gate.creole.annic.PatternAnnotation;
009 import gate.creole.annic.SearchException;
010 import gate.creole.annic.apache.lucene.index.Term;
011 import gate.creole.annic.apache.lucene.search.*;
012
013 public class StatsCalculator {
014
015 /**
016 * Allows retriving frequencies for the given parameters. Please make
017 * sure that you close the searcher on your own. Failing to do so may
018 * result into many files being opened at the same time and that can
019 * cause the problem with your OS.
020 *
021 * @param searcher
022 * @param corpusToSearchIn
023 * @param annotationSetToSearchIn
024 * @param annotationType
025 * @param featureName
026 * @param value
027 * @return
028 * @throws SearchException
029 */
030 public static int freq(IndexSearcher searcher, String corpusToSearchIn,
031 String annotationSetToSearchIn, String annotationType,
032 String featureName, String value) throws SearchException {
033
034 try {
035 corpusToSearchIn = corpusToSearchIn == null
036 || corpusToSearchIn.trim().length() == 0
037 ? null
038 : corpusToSearchIn.trim();
039 annotationSetToSearchIn = annotationSetToSearchIn == null
040 || annotationSetToSearchIn.trim().length() == 0
041 ? null
042 : annotationSetToSearchIn.trim();
043 if(annotationType == null)
044 throw new SearchException("Annotation Type cannot be null");
045
046 // term that contains a value to be searched in the index
047 Term term = null;
048 if(featureName == null && value == null) {
049 term = new Term("contents", annotationType, "*");
050 }
051 else if(featureName != null && value == null) {
052 term = new Term("contents", annotationType + "." + featureName, "**");
053 }
054 else if(featureName == null) {
055 throw new SearchException("FeatureName cannot be null");
056 }
057 else {
058 term = new Term("contents", value, annotationType + "." + featureName);
059 }
060
061 // term query
062 TermQuery tq = new TermQuery(term);
063
064 // indicates whether we want to use booleanQuery
065 boolean useBooleanQuery = false;
066 BooleanQuery bq = new BooleanQuery();
067
068 if(corpusToSearchIn != null) {
069 PhraseQuery cq = new PhraseQuery();
070 cq.add(new Term(Constants.CORPUS_ID, corpusToSearchIn), new Integer(0),
071 true);
072 bq.add(cq, true, false);
073 useBooleanQuery = true;
074 }
075
076 if(annotationSetToSearchIn != null) {
077 PhraseQuery aq = new PhraseQuery();
078 aq.add(new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn),
079 new Integer(0), true);
080 bq.add(aq, true, false);
081 useBooleanQuery = true;
082 }
083
084 Hits corpusHits = null;
085 if(useBooleanQuery) {
086 bq.add(tq, true, false);
087 corpusHits = searcher.search(bq);
088 }
089 else {
090 corpusHits = searcher.search(tq);
091 }
092
093 ArrayList[] firstTermPositions = searcher.getFirstTermPositions();
094
095 // if no result available, set null to our scores
096 if(firstTermPositions[0].size() == 0) {
097 return 0;
098 }
099
100 int size = 0;
101 // iterate through each result and collect necessary
102 // information
103 for(int hitIndex = 0; hitIndex < corpusHits.length(); hitIndex++) {
104 int index = firstTermPositions[0].indexOf(new Integer(corpusHits
105 .id(hitIndex)));
106
107 // we fetch all the first term positions for the query
108 // issued
109 Integer freq = (Integer)firstTermPositions[4].get(index);
110 size += freq.intValue();
111 }
112 return size;
113 }
114 catch(IOException ioe) {
115 throw new SearchException(ioe);
116 }
117 finally {
118 searcher.initializeTermPositions();
119 }
120 }
121
122 /**
123 * @see #freq(IndexSearcher, String, String, String, String, String)
124 */
125 public static int freq(IndexSearcher searcher, String corpusToSearchIn,
126 String annotationSetToSearchIn, String annotationType)
127 throws SearchException {
128
129 return freq(searcher, corpusToSearchIn, annotationSetToSearchIn,
130 annotationType, null, null);
131 }
132
133 /**
134 * @see #freq(IndexSearcher, String, String, String, String, String)
135 */
136 public static int freq(IndexSearcher searcher, String corpusToSearchIn,
137 String annotationSetToSearchIn, String annotationType,
138 String featureName) throws SearchException {
139
140 return freq(searcher, corpusToSearchIn, annotationSetToSearchIn,
141 annotationType, featureName, null);
142 }
143
144 /**
145 * Allows retrieving frequencies for the given parameters.
146 * @param patternsToSearchIn
147 * @param annotationType
148 * @param feature
149 * @param value - set to null if only wants to retrieve frequencies for AT.feature
150 * @param inMatchedSpan - true if only interested in frequencies from the matched spans.
151 * @param inContext - true if only interested in frequencies from the contexts. Please note that both isMatchedSpan
152 * and inContext can be set to true if interested in frequencies from the entire patterns, but cannot be set false
153 * at the same time.
154 * @return
155 * @throws SearchException
156 */
157 public static int freq(List<Hit> patternsToSearchIn,
158 String annotationType, String feature, String value,
159 boolean inMatchedSpan, boolean inContext) throws SearchException {
160 if(patternsToSearchIn == null || patternsToSearchIn.isEmpty()) return 0;
161
162 if(!inMatchedSpan && !inContext)
163 throw new SearchException(
164 "Both inMatchedSpan and inContext cannot be set to false");
165
166 int count = 0;
167 for(Hit aResult1 : patternsToSearchIn) {
168 Pattern aResult = (Pattern) aResult1;
169
170 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
171 if(inMatchedSpan && !inContext) {
172 annots = aResult.getPatternAnnotations(aResult.getStartOffset(),
173 aResult.getEndOffset());
174 }
175 else if(!inMatchedSpan && inContext) {
176 annots = aResult.getPatternAnnotations(aResult
177 .getLeftContextStartOffset(), aResult.getStartOffset());
178 annots.addAll(aResult.getPatternAnnotations(aResult.getEndOffset(),
179 aResult.getRightContextEndOffset()));
180 }
181 else {
182 // both matchedSpan and context are set to true
183 annots = Arrays.asList(aResult.getPatternAnnotations());
184 }
185
186 if(annots.isEmpty()) continue;
187 List<PatternAnnotation> subAnnots = null;
188 if(value == null) {
189 subAnnots = getPatternAnnotations(annots, annotationType, feature);
190 }
191 else {
192 subAnnots = getPatternAnnotations(annots, annotationType, feature,
193 value);
194 }
195
196 count += subAnnots.size();
197 }
198 return count;
199 }
200
201
202 /**
203 * @see #freq(List<Hit>, String, String, String, boolean, boolean)
204 */
205 public static int freq(List<Hit> patternsToSearchIn,
206 String annotationType, boolean inMatchedSpan, boolean inContext) throws SearchException {
207 if(patternsToSearchIn == null || patternsToSearchIn.isEmpty()) return 0;
208
209 if(!inMatchedSpan && !inContext)
210 throw new SearchException(
211 "Both inMatchedSpan and inContext cannot be set to false");
212
213 int count = 0;
214 for(Hit aResult1 : patternsToSearchIn) {
215 Pattern aResult = (Pattern) aResult1;
216
217
218 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
219 if(inMatchedSpan && !inContext) {
220 annots = aResult.getPatternAnnotations(aResult.getStartOffset(),
221 aResult.getEndOffset());
222 }
223 else if(!inMatchedSpan && inContext) {
224 annots = aResult.getPatternAnnotations(aResult
225 .getLeftContextStartOffset(), aResult.getStartOffset());
226 annots.addAll(aResult.getPatternAnnotations(aResult.getEndOffset(),
227 aResult.getRightContextEndOffset()));
228 }
229 else {
230 // both matchedSpan and context are set to true
231 annots = Arrays.asList(aResult.getPatternAnnotations());
232 }
233
234 if(annots.isEmpty()) continue;
235 List<PatternAnnotation> subAnnots = getPatternAnnotations(annots, annotationType);
236
237 count += subAnnots.size();
238 }
239 return count;
240 }
241
242
243 /**
244 * Calculates frequencies for all possible values of the provided AT.feature
245 * @param patternsToSearchIn
246 * @param annotationType
247 * @param feature
248 * @param inMatchedSpan
249 * @param inContext
250 * @return returns a map where key is the unique value of AT.feature and value is the Integer object giving count for the value.
251 * @throws SearchException
252 */
253 public static Map<String, Integer> freqForAllValues(
254 List<Hit> patternsToSearchIn, String annotationType,
255 String feature, boolean inMatchedSpan, boolean inContext)
256 throws SearchException {
257 Map<String, Integer> toReturn = new HashMap<String, Integer>();
258 if(patternsToSearchIn == null || patternsToSearchIn.isEmpty())
259 return toReturn;
260
261
262 if(!inMatchedSpan && !inContext)
263 throw new SearchException(
264 "Both inMatchedSpan and inContext cannot be set to false");
265
266 for(Hit aResult1 : patternsToSearchIn) {
267 Pattern aResult = (Pattern) aResult1;
268
269
270 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
271 if(inMatchedSpan && !inContext) {
272 annots = aResult.getPatternAnnotations(aResult.getStartOffset(),
273 aResult.getEndOffset());
274 }
275 else if(!inMatchedSpan && inContext) {
276 annots = aResult.getPatternAnnotations(aResult
277 .getLeftContextStartOffset(), aResult.getStartOffset());
278 annots.addAll(aResult.getPatternAnnotations(aResult.getEndOffset(),
279 aResult.getRightContextEndOffset()));
280 }
281 else {
282 // both matchedSpan and context are set to true
283 annots = Arrays.asList(aResult.getPatternAnnotations());
284 }
285
286 if(annots.isEmpty()) continue;
287 List<PatternAnnotation> subAnnots = getPatternAnnotations(annots,
288 annotationType, feature);
289
290 for(PatternAnnotation pa : subAnnots) {
291 String uniqueKey = pa.getFeatures().get(feature);
292 Integer counter = toReturn.get(uniqueKey);
293 if(counter == null) {
294 counter = new Integer(1);
295 toReturn.put(uniqueKey, counter);
296 }
297 else {
298 counter = new Integer(counter.intValue() + 1);
299 toReturn.put(uniqueKey, counter);
300 }
301 }
302 }
303 return toReturn;
304 }
305
306 private static List<PatternAnnotation> getPatternAnnotations(
307 List<PatternAnnotation> annotations, String type, String feature,
308 String value) {
309 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
310 for(int i = 0; i < annotations.size(); i++) {
311 PatternAnnotation ga1 = annotations.get(i);
312 if(ga1.getType().equals(type)) {
313 Map<String, String> features = ga1.getFeatures();
314 if(features != null && features.keySet().contains(feature)) {
315 if(features.get(feature).equals(value)) annots.add(ga1);
316 }
317 }
318 }
319 return annots;
320 }
321
322 private static List<PatternAnnotation> getPatternAnnotations(
323 List<PatternAnnotation> annotations, String type, String feature) {
324 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
325 for(int i = 0; i < annotations.size(); i++) {
326 PatternAnnotation ga1 = annotations.get(i);
327 if(ga1.getType().equals(type)) {
328 Map<String, String> features = ga1.getFeatures();
329 if(features != null && features.keySet().contains(feature)) {
330 annots.add(ga1);
331 }
332 }
333 }
334 return annots;
335 }
336
337 private static List<PatternAnnotation> getPatternAnnotations(
338 List<PatternAnnotation> annotations, String type) {
339 List<PatternAnnotation> annots = new ArrayList<PatternAnnotation>();
340 for(int i = 0; i < annotations.size(); i++) {
341 PatternAnnotation ga1 = annotations.get(i);
342 if(ga1.getType().equals(type)) {
343 annots.add(ga1);
344 }
345 }
346 return annots;
347 }
348
349
350 }
|