001 /**
002 * Copyright (c) 1995-2010, The University of Sheffield. See the file
003 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
004 *
005 * This file is part of GATE (see http://gate.ac.uk/), and is free
006 * software, licenced under the GNU Library General Public License,
007 * Version 2, June 1991 (in the distribution as file licence.html,
008 * and also available at http://gate.ac.uk/gate/licence.html).
009 *
010 * Thomas Heitz - 09/06/2010
011 *
012 * $Id$
013 */
014
015 package gate.util;
016
017 import gate.Annotation;
018
019 import java.io.BufferedReader;
020 import java.io.FileInputStream;
021 import java.io.IOException;
022 import java.io.InputStreamReader;
023 import java.net.URL;
024 import java.text.NumberFormat;
025 import java.util.*;
026
027 /**
028 * Modified version of Precision and Recall called BDM that takes into
029 * account the distance of two concepts in an ontology.
030 */
031 public class OntologyMeasures {
032
033 public OntologyMeasures() {
034 // empty constructor
035 }
036
037 /**
038 * Constructor to be used when you have a collection of OntologyMeasures
039 * and want to consider it as only one OntologyMeasures.
040 * Then you can only use the methods getPrecision/Recall/FMeasure...().
041 * @param measures collection to be regrouped in one OntologyMeasures
042 */
043 public OntologyMeasures(Collection<OntologyMeasures> measures) {
044 Map<String, List<AnnotationDiffer>> differsByTypeMap =
045 new HashMap<String, List<AnnotationDiffer>>();
046 for (OntologyMeasures measure : measures) {
047 for (Map.Entry<String, Float> entry : measure.bdmByTypeMap.entrySet()) {
048 float previousBdm = 0;
049 if (bdmByTypeMap.containsKey(entry.getKey())) {
050 previousBdm = bdmByTypeMap.get(entry.getKey());
051 }
052 // set the bdmByTypeMap to be the sum of those in the collection
053 bdmByTypeMap.put(entry.getKey(), previousBdm + entry.getValue());
054 }
055 for (Map.Entry<String, AnnotationDiffer> entry :
056 measure.differByTypeMap.entrySet()) {
057 List<AnnotationDiffer> differs = differsByTypeMap.get(entry.getKey());
058 if (differs == null) {
059 differs = new ArrayList<AnnotationDiffer>();
060 }
061 differs.add(entry.getValue());
062 differsByTypeMap.put(entry.getKey(), differs);
063 }
064 }
065 // combine the list of AnnotationDiffer for each type
066 for (Map.Entry<String, List<AnnotationDiffer>> entry :
067 differsByTypeMap.entrySet()) {
068 differByTypeMap.put(entry.getKey(),
069 new AnnotationDiffer(entry.getValue()));
070 }
071 }
072
073 /**
074 * For a document get the annotation differs that contain the type to compare
075 * and the annotation differs that may have miscategorized annotations
076 * for this type. Then we try to find miscategorized types that are close
077 * enough from the main type and use their BDM value to get an augmented
078 * precision, recall and fscore.
079 *
080 * @param differs annotation differ for the type and for possible
081 * miscategorized types.
082 */
083 public void calculateBdm(Collection<AnnotationDiffer> differs) {
084
085 if (bdmByConceptsMap == null) {
086 // load BDM file with scores for each concept/annotation type pair
087 bdmByConceptsMap = read(bdmFileUrl); // read the bdm scores
088 }
089
090 // calculate BDM from the spurious and missing annotations
091 Set<Annotation> unpairedResponseAnnotations = new HashSet<Annotation>();
092 Set<Annotation> unpairedKeyAnnotations;
093
094 // will use the whole spurious annotations as the second set to compare
095 for (AnnotationDiffer differ : differs) {
096 unpairedResponseAnnotations.addAll(
097 differ.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE));
098 }
099
100 bdmByTypeMap.clear();
101
102 for (AnnotationDiffer differ : differs) {
103 unpairedKeyAnnotations = differ.getAnnotationsOfType(
104 AnnotationDiffer.MISSING_TYPE);
105 if (!bdmByTypeMap.containsKey(differ.getAnnotationType())) {
106 bdmByTypeMap.put(differ.getAnnotationType(), 0f);
107 }
108
109 // use the missing annotations as the first set to compare
110 for (Annotation unpairedKeyAnnotation : unpairedKeyAnnotations) {
111 String type = unpairedKeyAnnotation.getType();
112 // Out.prln("unpairedKeyAnnotation: " + unpairedKeyAnnotation.toString());
113 Iterator<Annotation> iterator = unpairedResponseAnnotations.iterator();
114
115 // use the spurious annotations as the second set to compare
116 while (iterator.hasNext()) {
117 Annotation unpairedResponseAnnotation = iterator.next();
118 // Out.prln("unpairedResponsAnnotation: "
119 // + unpairedResponseAnnotation.toString());
120 float bdm = 0;
121
122 // annotations have the same start and end offsets
123 if (unpairedKeyAnnotation.coextensive(unpairedResponseAnnotation)) {
124
125 // compare both features values with BDM pairs
126 if (differ.getSignificantFeaturesSet() != null) {
127 if (!type.equals(unpairedResponseAnnotation.getType())) {
128 continue; // types must be the same
129 }
130 for (Object feature : differ.getSignificantFeaturesSet()) {
131 if (unpairedKeyAnnotation.getFeatures() == null
132 || unpairedResponseAnnotation.getFeatures() == null) {
133 continue;
134 }
135 // Out.prln("Feature: " + feature);
136 String keyLabel = (String)
137 unpairedKeyAnnotation.getFeatures().get(feature);
138 // Out.prln("KeyLabel: " + keyLabel);
139 String responseLabel = (String)
140 unpairedResponseAnnotation.getFeatures().get(feature);
141 // Out.prln("ResponseLabel: " + responseLabel);
142 if (keyLabel == null || responseLabel == null) {
143 // do nothing
144 } else if (bdmByConceptsMap.containsKey(
145 keyLabel + ", " + responseLabel)) {
146 bdm += bdmByConceptsMap.get(keyLabel + ", " + responseLabel);
147 } else if (bdmByConceptsMap.containsKey(
148 responseLabel + ", " + keyLabel)) {
149 bdm += bdmByConceptsMap.get(responseLabel + ", " + keyLabel);
150 }
151 }
152 bdm = bdm / differ.getSignificantFeaturesSet().size();
153
154 } else { // compare both types with BDM pairs
155 if (bdmByConceptsMap.containsKey(
156 type + ',' + unpairedResponseAnnotation.getType())) {
157 bdm = bdmByConceptsMap.get(
158 type + ',' + unpairedResponseAnnotation.getType());
159 } else if (bdmByConceptsMap.containsKey(
160 unpairedResponseAnnotation.getType() + ", " + type)) {
161 bdm = bdmByConceptsMap.get(
162 unpairedResponseAnnotation.getType() + ", " + type);
163 }
164 }
165 if (bdm > 0) {
166 bdmByTypeMap.put(type, bdmByTypeMap.get(type) + bdm);
167 iterator.remove();
168 // Out.prln("BDM: " + bdmByTypeMap.get(type));
169 }
170 }
171 }
172 }
173 }
174
175 differByTypeMap.clear();
176 Map<String, List<AnnotationDiffer>> differsByTypeMap =
177 new HashMap<String, List<AnnotationDiffer>>();
178
179 for (AnnotationDiffer differ : differs) {
180 // we consider that all annotations in AnnotationDiffer are the same type
181 String type = differ.getAnnotationType();
182 List<AnnotationDiffer> differsType = differsByTypeMap.get(type);
183 if (differsType == null) {
184 differsType = new ArrayList<AnnotationDiffer>();
185 }
186 differsType.add(differ);
187 differsByTypeMap.put(type, differsType);
188 }
189
190 // combine the list of AnnotationDiffer for each type
191 for (Map.Entry<String, List<AnnotationDiffer>> entry :
192 differsByTypeMap.entrySet()) {
193 differByTypeMap.put(entry.getKey(),
194 new AnnotationDiffer(entry.getValue()));
195 }
196 }
197
198 /**
199 * AP = (sum of BDMs for BDM-matching pair spurious/missing + Correct)
200 * / (Correct + Spurious)
201 * @param type annotation type
202 * @return strict precision with BDM correction
203 */
204 public double getPrecisionStrictBdm(String type) {
205 AnnotationDiffer differ = differByTypeMap.get(type);
206 if (differ.getCorrectMatches() + differ.getSpurious() == 0) {
207 return 1.0;
208 }
209 return (bdmByTypeMap.get(type) + differ.getCorrectMatches())
210 / (differ.getCorrectMatches() + differ.getSpurious());
211 }
212
213 public double getPrecisionStrictBdm() {
214 double result = 0;
215 for (String type : differByTypeMap.keySet()) {
216 result += getPrecisionStrictBdm(type);
217 }
218 return result / differByTypeMap.size();
219 }
220
221 public double getRecallStrictBdm(String type) {
222 AnnotationDiffer differ = differByTypeMap.get(type);
223 if (differ.getCorrectMatches() + differ.getMissing() == 0) {
224 return 1.0;
225 }
226 return (bdmByTypeMap.get(type) + differ.getCorrectMatches())
227 / (differ.getCorrectMatches() + differ.getMissing());
228 }
229
230 public double getRecallStrictBdm() {
231 double result = 0;
232 for (String type : differByTypeMap.keySet()) {
233 result += getRecallStrictBdm(type);
234 }
235 return result / differByTypeMap.size();
236 }
237
238 public double getFMeasureStrictBdm(String type, double beta) {
239 double precision = getPrecisionStrictBdm(type);
240 double recall = getRecallStrictBdm(type);
241 double betaSq = beta * beta;
242 double answer = ((betaSq + 1) * precision * recall)
243 / (betaSq * precision + recall);
244 if(Double.isNaN(answer)) answer = 0.0;
245 return answer;
246 }
247
248 public double getFMeasureStrictBdm(double beta) {
249 double result = 0;
250 for (String type : differByTypeMap.keySet()) {
251 result += getFMeasureStrictBdm(type, beta);
252 }
253 return result / differByTypeMap.size();
254 }
255
256 public double getPrecisionLenientBdm(String type) {
257 AnnotationDiffer differ = differByTypeMap.get(type);
258 if (differ.getCorrectMatches() + differ.getSpurious() == 0) {
259 return 1.0;
260 }
261 return (bdmByTypeMap.get(type) + differ.getCorrectMatches()
262 + differ.getPartiallyCorrectMatches())
263 / (differ.getCorrectMatches() + differ.getSpurious());
264 }
265
266 public double getPrecisionLenientBdm() {
267 double result = 0;
268 for (String type : differByTypeMap.keySet()) {
269 result += getPrecisionLenientBdm(type);
270 }
271 return result / differByTypeMap.size();
272 }
273
274 public double getRecallLenientBdm(String type) {
275 AnnotationDiffer differ = differByTypeMap.get(type);
276 if (differ.getCorrectMatches() + differ.getMissing() == 0) {
277 return 1.0;
278 }
279 return (bdmByTypeMap.get(type) + differ.getCorrectMatches()
280 + differ.getPartiallyCorrectMatches())
281 / (differ.getCorrectMatches() + differ.getMissing());
282 }
283
284 public double getRecallLenientBdm() {
285 double result = 0;
286 for (String type : differByTypeMap.keySet()) {
287 result += getRecallLenientBdm(type);
288 }
289 return result / differByTypeMap.size();
290 }
291
292 public double getFMeasureLenientBdm(String type, double beta) {
293 double precision = getPrecisionLenientBdm(type);
294 double recall = getRecallLenientBdm(type);
295 double betaSq = beta * beta;
296 double answer = ((betaSq + 1) * precision * recall)
297 / (betaSq * precision + recall);
298 if(Double.isNaN(answer)) answer = 0.0;
299 return answer;
300 }
301
302 public double getFMeasureLenientBdm(double beta) {
303 double result = 0;
304 for (String type : differByTypeMap.keySet()) {
305 result += getFMeasureLenientBdm(type, beta);
306 }
307 return result / differByTypeMap.size();
308 }
309
310 public double getPrecisionAverageBdm(String type) {
311 return (getPrecisionLenientBdm(type) + getPrecisionStrictBdm(type)) / 2.0;
312 }
313
314 /**
315 * Gets the average of the strict and lenient precision values.
316 * @return a <tt>double</tt> value.
317 */
318 public double getPrecisionAverageBdm() {
319 return (getPrecisionLenientBdm() + getPrecisionStrictBdm()) / 2.0;
320 }
321
322 public double getRecallAverageBdm(String type) {
323 return (getRecallLenientBdm(type) + getRecallStrictBdm(type)) / 2.0;
324 }
325
326 /**
327 * Gets the average of the strict and lenient recall values.
328 * @return a <tt>double</tt> value.
329 */
330 public double getRecallAverageBdm() {
331 return (getRecallLenientBdm() + getRecallStrictBdm()) / 2.0;
332 }
333
334 public double getFMeasureAverageBdm(String type, double beta) {
335 return (getFMeasureLenientBdm(type, beta)
336 + getFMeasureStrictBdm(type, beta))
337 / 2.0;
338 }
339
340 /**
341 * Gets the average of strict and lenient F-Measure values.
342 * @param beta The relative weight of precision and recall. A value of 1
343 * gives equal weights to precision and recall. A value of 0 takes the recall
344 * value completely out of the equation.
345 * @return a <tt>double</tt>value.
346 */
347 public double getFMeasureAverageBdm(double beta) {
348 return (getFMeasureLenientBdm(beta) + getFMeasureStrictBdm(beta)) / 2.0;
349 }
350
351 public void setBdmFile(URL url) {
352 bdmFileUrl = url;
353 bdmByConceptsMap = null;
354 }
355
356 /**
357 * Read the BDM scores from a file.
358 * @param bdmFile URL of the BDM file
359 * @return map from a pair of concepts to their BDM score
360 */
361 public Map<String, Float> read(URL bdmFile) {
362 Map<String, Float> bdmByConceptsMap = new HashMap<String, Float>();
363 if (bdmFile == null) {
364 Out.prln("There is no BDM file specified.");
365 return bdmByConceptsMap;
366 }
367 BufferedReader bdmResultsReader = null;
368 try {
369 bdmResultsReader = new BomStrippingInputStreamReader(
370 new FileInputStream(Files.fileFromURL(bdmFile)), "UTF-8");
371 bdmResultsReader.readLine(); // skip the first line as the header
372 String line = bdmResultsReader.readLine();
373 while (line != null) {
374 String[] terms = line.split(", ");
375 if (terms.length > 3) {
376 String oneCon = terms[0].substring(4);
377 String anoCon = terms[1].substring(9);
378 String bdmS = terms[2].substring(4);
379 bdmByConceptsMap.put(oneCon + ", " + anoCon, new Float(bdmS));
380 } else {
381 Out.prln("File " + bdmFile.toString() + " has incorrect format" +
382 "for the line [" + line + "].");
383 }
384 line = bdmResultsReader.readLine();
385 }
386
387 } catch(Exception e) {
388 Out.prln("There is something wrong with the BDM file.");
389 e.printStackTrace();
390
391 } finally {
392 if (bdmResultsReader != null) {
393 try {
394 bdmResultsReader.close();
395 } catch (IOException e) {
396 e.printStackTrace();
397 }
398 }
399 }
400 return bdmByConceptsMap;
401 }
402
403 public List<String> getMeasuresRow(Object[] measures, String title) {
404 List<AnnotationDiffer> differs = new ArrayList<AnnotationDiffer>(
405 getDifferByTypeMap().values());
406 AnnotationDiffer differ = new AnnotationDiffer(differs);
407 NumberFormat f = NumberFormat.getInstance(Locale.ENGLISH);
408 f.setMaximumFractionDigits(2);
409 f.setMinimumFractionDigits(2);
410 List<String> row = new ArrayList<String>();
411 row.add(title);
412 row.add(Integer.toString(differ.getCorrectMatches()));
413 row.add(Integer.toString(differ.getMissing()));
414 row.add(Integer.toString(differ.getSpurious()));
415 row.add(Integer.toString(differ.getPartiallyCorrectMatches()));
416 for (Object object : measures) {
417 String measure = (String) object;
418 double beta = Double.valueOf(
419 measure.substring(1,measure.indexOf('-')));
420 if (measure.endsWith("strict")) {
421 row.add(f.format(differ.getRecallStrict()));
422 row.add(f.format(differ.getPrecisionStrict()));
423 row.add(f.format(differ.getFMeasureStrict(beta)));
424 } else if (measure.endsWith("strict BDM")) {
425 row.add(f.format(getRecallStrictBdm()));
426 row.add(f.format(getPrecisionStrictBdm()));
427 row.add(f.format(getFMeasureStrictBdm(beta)));
428 } else if (measure.endsWith("lenient")) {
429 row.add(f.format(differ.getRecallLenient()));
430 row.add(f.format(differ.getPrecisionLenient()));
431 row.add(f.format(differ.getFMeasureLenient(beta)));
432 } else if (measure.endsWith("lenient BDM")) {
433 row.add(f.format(getRecallLenientBdm()));
434 row.add(f.format(getPrecisionLenientBdm()));
435 row.add(f.format(getFMeasureLenientBdm(beta)));
436 } else if (measure.endsWith("average")) {
437 row.add(f.format(differ.getRecallAverage()));
438 row.add(f.format(differ.getPrecisionAverage()));
439 row.add(f.format(differ.getFMeasureAverage(beta)));
440 } else if (measure.endsWith("average BDM")) {
441 row.add(f.format(getRecallAverageBdm()));
442 row.add(f.format(getPrecisionAverageBdm()));
443 row.add(f.format(getFMeasureAverageBdm(beta)));
444 }
445 }
446 return row;
447 }
448
449 /**
450 * Be careful, don't modify it.
451 * That's not a copy because it would take too much memory.
452 * @return differ by type map
453 */
454 public Map<String, AnnotationDiffer> getDifferByTypeMap() {
455 return differByTypeMap;
456 }
457
458 protected Map<String, Float> bdmByTypeMap = new HashMap<String, Float>();
459 protected URL bdmFileUrl;
460 protected Map<String, AnnotationDiffer> differByTypeMap =
461 new HashMap<String, AnnotationDiffer>();
462 protected Map<String, Float> bdmByConceptsMap;
463 }
|