0001 /*
0002 * CorpusBenchmarkTool.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Kalina Bontcheva, 24/Oct/2001
0013 *
0014 * $Id: CorpusBenchmarkTool.java 12006 2009-12-01 17:24:28Z thomas_heitz $
0015 */
0016
0017 package gate.util;
0018
0019 import java.io.*;
0020 import java.util.*;
0021
0022 import gate.*;
0023 import gate.util.AnnotationDiffer;
0024 import gate.creole.*;
0025 import gate.persist.PersistenceException;
0026 import gate.persist.SerialDataStore;
0027
0028 public class CorpusBenchmarkTool {
0029 private static final String MARKED_DIR_NAME = "marked";
0030 private static final String CLEAN_DIR_NAME = "clean";
0031 private static final String CVS_DIR_NAME = "Cvs";
0032 private static final String PROCESSED_DIR_NAME = "processed";
0033 private static final String ERROR_DIR_NAME = "err";
0034
0035 private static final boolean DEBUG = true;
0036
0037 public CorpusBenchmarkTool() {}
0038
0039 public void initPRs() {
0040 try {
0041 if (applicationFile == null)
0042 Out.prln("Application not set!");
0043 Out.prln("App file is: " + applicationFile.getAbsolutePath());
0044 application = (Controller) gate.util.persistence.PersistenceManager
0045 .loadObjectFromFile(applicationFile);
0046 }
0047 catch (Exception ex) {
0048 throw (GateRuntimeException)
0049 new GateRuntimeException("Corpus Benchmark Tool:" + ex.getMessage())
0050 .initCause(ex);
0051 }
0052 } //initPRs
0053
0054 public void unloadPRs() {
0055 //we have nothing to unload if no PRs are loaded
0056 if (isMarkedStored)
0057 return;
0058
0059 }
0060
0061 public void execute() {
0062 execute(startDir);
0063 if (application != null) {
0064 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0065 public void run() {
0066
0067 Iterator iter = new ArrayList(application.getPRs()).iterator();
0068 while (iter.hasNext())
0069 Factory.deleteResource( (Resource) iter.next());
0070
0071 Factory.deleteResource(application);
0072 }
0073 });
0074 }
0075 }
0076
0077 public void init() {
0078 //first read the corpus_tool.properties file
0079 File propFile = new File("corpus_tool.properties");
0080 Out.prln(propFile.getAbsolutePath());
0081 if (propFile.exists()) {
0082 try {
0083 InputStream inputStream = new FileInputStream(propFile);
0084 this.configs.load(inputStream);
0085 String thresholdString = this.configs.getProperty("threshold");
0086 if (thresholdString != null && !thresholdString.equals("")) {
0087 thresholdString=thresholdString.trim();
0088 this.threshold = (new Double(thresholdString)).doubleValue();
0089 Out.prln("New threshold is: " + this.threshold + "<P>\n");
0090 }
0091 String setName = this.configs.getProperty("annotSetName");
0092 if (setName != null && !setName.equals("")) {
0093 setName=setName.trim();
0094 Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
0095 this.annotSetName = setName;
0096 }
0097 setName = this.configs.getProperty("outputSetName");
0098 if (setName != null && !setName.equals("")) {
0099 setName=setName.trim();
0100 Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
0101 this.outputSetName = setName;
0102 }
0103 String encodingString = this.configs.getProperty("encoding");
0104 if (encodingString != null && !encodingString.equals("")) {
0105 encodingString=encodingString.trim();
0106 this.documentEncoding = encodingString;
0107 Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
0108 }
0109 String types = this.configs.getProperty("annotTypes");
0110 if (types != null && !types.equals("")) {
0111 types=types.trim();
0112 Out.prln("Using annotation types from the properties file. <P>\n");
0113 StringTokenizer strTok = new StringTokenizer(types, ";");
0114 annotTypes = new ArrayList();
0115 while (strTok.hasMoreTokens())
0116 annotTypes.add(strTok.nextToken());
0117 }
0118 else {
0119 annotTypes = new ArrayList();
0120 annotTypes.add("Organization");
0121 annotTypes.add("Person");
0122 annotTypes.add("Date");
0123 annotTypes.add("Location");
0124 annotTypes.add("Address");
0125 annotTypes.add("Money");
0126 annotTypes.add("Percent");
0127 annotTypes.add("GPE");
0128 annotTypes.add("Facility");
0129 }
0130 String features = this.configs.getProperty("annotFeatures");
0131 HashSet result = new HashSet();
0132 if (features != null && !features.equals("")) {
0133 features=features.trim();
0134 Out.pr("Using annotation features from the properties file. \n");
0135 java.util.StringTokenizer tok =
0136 new java.util.StringTokenizer(features, ";");
0137 String current;
0138 while (tok.hasMoreTokens()) {
0139 current = tok.nextToken();
0140 result.add(current);
0141 } // while
0142 }
0143 diffFeaturesSet = result;
0144 Out.prln("Features: " + diffFeaturesSet + " <P>\n");
0145
0146 }
0147 catch (IOException ex) {
0148 //just ignore the file and go on with the defaults
0149 this.configs = new Properties();
0150 }
0151 }
0152 else
0153 this.configs = new Properties();
0154
0155 //we only initialise the PRs if they are going to be used
0156 //for processing unprocessed documents
0157 if (!this.isMarkedStored)
0158 initPRs();
0159
0160 }
0161
0162 public void execute(File dir) {
0163 if (dir == null)
0164 return;
0165 //first set the current directory to be the given one
0166 currDir = dir;
0167
0168 File processedDir = null;
0169 File cleanDir = null;
0170 File markedDir = null;
0171 File errorDir = null;
0172
0173 ArrayList subDirs = new ArrayList();
0174 File[] dirArray = currDir.listFiles();
0175 if (dirArray == null)return;
0176 for (int i = 0; i < dirArray.length; i++) {
0177 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
0178 continue;
0179 if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
0180 cleanDir = dirArray[i];
0181 else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
0182 markedDir = dirArray[i];
0183 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
0184 processedDir = dirArray[i];
0185 else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
0186 errorDir = dirArray[i];
0187 else
0188 subDirs.add(dirArray[i]);
0189 }
0190
0191 if (cleanDir == null)return;
0192 Out.prln("Processing directory: " + currDir + "<P>");
0193
0194 if (this.isGenerateMode)
0195 generateCorpus(cleanDir, processedDir);
0196 else
0197 evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
0198
0199 //if no more subdirs left, return
0200 if (subDirs.isEmpty())
0201 return;
0202
0203 //there are more subdirectories to traverse, so iterate through
0204 for (int j = 0; j < subDirs.size(); j++)
0205 execute( (File) subDirs.get(j));
0206
0207 } //execute(dir)
0208
0209 public static void main(String[] args) throws GateException {
0210 Out.prln("<HTML>");
0211 Out.prln("<HEAD>");
0212 Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
0213 for (int argC = 0; argC < args.length; ++argC)
0214 Out.pr(args[argC] + " ");
0215 Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
0216 Out.prln("<BODY>");
0217 Out.prln("Please wait while GATE tools are initialised. <P>");
0218 // initialise GATE
0219 Gate.init();
0220
0221 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
0222
0223 List inputFiles = null;
0224 if (args.length < 1)throw new GateException(usage);
0225 int i = 0;
0226 while (i < args.length && args[i].startsWith("-")) {
0227 if (args[i].equals("-generate")) {
0228 Out.prln("Generating the corpus... <P>");
0229 corpusTool.setGenerateMode(true);
0230 }
0231 else if (args[i].equals("-marked_clean")) {
0232 Out.prln("Evaluating current grammars against human-annotated...<P>");
0233 corpusTool.setMarkedClean(true);
0234 }
0235 else if (args[i].equals("-marked_stored")) {
0236 Out.prln("Evaluating stored documents against human-annotated...<P>");
0237 corpusTool.setMarkedStored(true);
0238 }
0239 else if (args[i].equals("-marked_ds")) {
0240 Out.prln("Looking for marked docs in a datastore...<P>");
0241 corpusTool.setMarkedDS(true);
0242 }
0243 else if (args[i].equals("-verbose")) {
0244 Out.prln("Running in verbose mode. Will generate annotation " +
0245 "information when precision/recall are lower than " +
0246 corpusTool.getThreshold() + "<P>");
0247 corpusTool.setVerboseMode(true);
0248 }
0249 else if (args[i].equals("-moreinfo")) {
0250 Out.prln("Show more details in document table...<P>");
0251 corpusTool.setMoreInfo(true);
0252 }
0253 i++; //just ignore the option, which we do not recognise
0254 } //while
0255
0256 String dirName = args[i];
0257 File dir = new File(dirName);
0258 if (!dir.isDirectory())
0259 throw new GateException(usage);
0260
0261 //get the last argument which is the application
0262 i++;
0263 String appName = args[i];
0264 File appFile = new File(appName);
0265 if (!appFile.isFile())
0266 throw new GateException(usage);
0267 else
0268 corpusTool.setApplicationFile(appFile);
0269
0270 corpusTool.init();
0271 corpusWordCount = 0;
0272
0273 Out.prln("Measuring annotaitions of types: " +
0274 CorpusBenchmarkTool.annotTypes + "<P>");
0275
0276 corpusTool.setStartDirectory(dir);
0277 corpusTool.execute();
0278 //if we're not generating the corpus, then print the precision and recall
0279 //statistics for the processed corpus
0280 if (!corpusTool.getGenerateMode())
0281 corpusTool.printStatistics();
0282
0283 Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
0284 Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
0285 Out.prln("<BR>Overall average fMeasure: " + corpusTool.getFMeasureAverage());
0286 if (corpusWordCount == 0)
0287 Out.prln("<BR>No Token annotations to count words in the corpus.");
0288 else
0289 Out.prln("<BR>Overall word count: " + corpusWordCount);
0290
0291 if (hasProcessed) {
0292 Out.prln("<P>Old Processed: ");
0293 Out.prln("<BR>Overall average precision: "
0294 + corpusTool.getPrecisionAverageProc());
0295 Out.prln("<BR>Overall average recall: "
0296 + corpusTool.getRecallAverageProc());
0297 Out.prln("<BR>Overall average fMeasure: "
0298 + corpusTool.getFMeasureAverageProc());
0299 }
0300 Out.prln("<BR>Finished! <P>");
0301 Out.prln("</BODY>");
0302 Out.prln("</HTML>");
0303
0304 System.exit(0);
0305
0306 } //main
0307
0308 public void setGenerateMode(boolean mode) {
0309 isGenerateMode = mode;
0310 } //setGenerateMode
0311
0312 public boolean getGenerateMode() {
0313 return isGenerateMode;
0314 } //getGenerateMode
0315
0316 public boolean getVerboseMode() {
0317 return isVerboseMode;
0318 } //getVerboseMode
0319
0320 public void setVerboseMode(boolean mode) {
0321 isVerboseMode = mode;
0322 } //setVerboseMode
0323
0324 public void setMoreInfo(boolean mode) {
0325 isMoreInfoMode = mode;
0326 } // setMoreInfo
0327
0328 public boolean getMoreInfo() {
0329 return isMoreInfoMode;
0330 } // getMoreInfo
0331
0332 public void setDiffFeaturesList(Set features) {
0333 diffFeaturesSet = features;
0334 } // setDiffFeaturesList
0335
0336 public Set getDiffFeaturesList() {
0337 return diffFeaturesSet;
0338 } // getDiffFeaturesList
0339
0340 public void setMarkedStored(boolean mode) {
0341 isMarkedStored = mode;
0342 } // setMarkedStored
0343
0344 public boolean getMarkedStored() {
0345 return isMarkedStored;
0346 } // getMarkedStored
0347
0348 public void setMarkedClean(boolean mode) {
0349 isMarkedClean = mode;
0350 } //
0351
0352 public boolean getMarkedClean() {
0353 return isMarkedClean;
0354 } //
0355
0356 public void setMarkedDS(boolean mode) {
0357 isMarkedDS = mode;
0358 } //
0359
0360 public boolean getMarkedDS() {
0361 return isMarkedDS;
0362 } //
0363
0364 public void setApplicationFile(File newAppFile) {
0365 applicationFile = newAppFile;
0366 }
0367
0368 /**
0369 * Returns the average precision over the entire set of processed documents.
0370 * <P>
0371 * If the tool has been evaluating the original documents against the
0372 * previously-stored automatically annotated ones, then the precision
0373 * will be the average precision on those two sets. <P>
0374 * If the tool was run in -marked mode, i.e., was evaluating the stored
0375 * automatically processed ones against the human-annotated ones, then
0376 * the precision will be the average precision on those two sets of documents.
0377 */
0378 public double getPrecisionAverage() {
0379 return (double) precisionSum / docNumber;
0380 }
0381
0382 /**
0383 * Returns the average recall over the entire set of processed documents.
0384 * <P>
0385 * If the tool has been evaluating the original documents against the
0386 * previously-stored automatically annotated ones, then the recall
0387 * will be the average recall on those two sets. <P>
0388 * If the tool was run in -marked mode, i.e., was evaluating the stored
0389 * automatically processed ones against the human-annotated ones, then
0390 * the recall will be the average recall on those two sets of documents.
0391 */
0392 public double getRecallAverage() {
0393 return (double) recallSum / docNumber;
0394 }
0395
0396 public double getFMeasureAverage() {
0397 return (double) fMeasureSum / docNumber;
0398 }
0399
0400 /** For processed documents */
0401 public double getPrecisionAverageProc() {
0402 return (double) proc_precisionSum / docNumber;
0403 }
0404
0405 public double getRecallAverageProc() {
0406 return (double) proc_recallSum / docNumber;
0407 }
0408
0409 public double getFMeasureAverageProc() {
0410 return (double) proc_fMeasureSum / docNumber;
0411 }
0412
0413 public boolean isGenerateMode() {
0414 return isGenerateMode == true;
0415 } //isGenerateMode
0416
0417 public double getThreshold() {
0418 return threshold;
0419 }
0420
0421 public void setThreshold(double newValue) {
0422 threshold = newValue;
0423 }
0424
0425 public File getStartDirectory() {
0426 return startDir;
0427 } //getStartDirectory
0428
0429 public void setStartDirectory(File dir) {
0430 startDir = dir;
0431 } //setStartDirectory
0432
0433 protected void generateCorpus(File fileDir, File outputDir) {
0434 //1. check if we have input files
0435 if (fileDir == null)
0436 return;
0437 //2. create the output directory or clean it up if needed
0438 File outDir = outputDir;
0439 if (outputDir == null) {
0440 outDir = new File(currDir, PROCESSED_DIR_NAME);
0441 }
0442 else {
0443 // get rid of the directory, coz datastore wants it clean
0444 if (!Files.rmdir(outDir))
0445 Out.prln("cannot delete old output directory: " + outDir);
0446 }
0447 outDir.mkdir();
0448
0449 //create the datastore and process each document
0450 try {
0451 SerialDataStore sds = new SerialDataStore(outDir.toURI().toURL().toString());
0452 sds.create();
0453 sds.open();
0454
0455 File[] files = fileDir.listFiles();
0456 for (int i = 0; i < files.length; i++) {
0457 if (!files[i].isFile())
0458 continue;
0459 // create a document
0460 Out.prln("Processing and storing document: " + files[i].toURI().toURL() + "<P>");
0461
0462 FeatureMap params = Factory.newFeatureMap();
0463 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURI().toURL());
0464 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0465
0466 FeatureMap features = Factory.newFeatureMap();
0467 // Gate.setHiddenAttribute(features, true);
0468
0469 // create the document
0470 final Document doc = (Document) Factory.createResource(
0471 "gate.corpora.DocumentImpl", params, features
0472 );
0473
0474 doc.setName(files[i].getName());
0475 if (doc == null)
0476 continue;
0477 processDocument(doc);
0478 final LanguageResource lr = sds.adopt(doc, null);
0479 sds.sync(lr);
0480 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0481 public void run() {
0482 Factory.deleteResource(doc);
0483 Factory.deleteResource(lr);
0484 }
0485 });
0486 } //for
0487 sds.close();
0488 }
0489 catch (java.net.MalformedURLException ex) {
0490 throw (GateRuntimeException)
0491 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0492 .initCause(ex);
0493 }
0494 catch (PersistenceException ex1) {
0495 throw (GateRuntimeException)
0496 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0497 .initCause(ex1);
0498 }
0499 catch (ResourceInstantiationException ex2) {
0500 throw (GateRuntimeException)
0501 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0502 .initCause(ex2);
0503 }
0504 catch (gate.security.SecurityException ex3) {
0505 throw (GateRuntimeException)
0506 new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage())
0507 .initCause(ex3);
0508 }
0509 } //generateCorpus
0510
0511 protected void evaluateCorpus(File fileDir,
0512 File processedDir, File markedDir,
0513 File errorDir) {
0514 //1. check if we have input files and the processed Dir
0515 if (fileDir == null || !fileDir.exists())
0516 return;
0517 if (processedDir == null || !processedDir.exists())
0518
0519 //if the user wants evaluation of marked and stored that's not possible
0520 if (isMarkedStored) {
0521 Out.prln("Cannot evaluate because no processed documents exist.");
0522 return;
0523 }
0524 else
0525 isMarkedClean = true;
0526
0527 // create the error directory or clean it up if needed
0528 File errDir = null;
0529 if (isMoreInfoMode) {
0530 errDir = errorDir;
0531 if (errDir == null) {
0532 errDir = new File(currDir, ERROR_DIR_NAME);
0533 }
0534 else {
0535 // get rid of the directory, coz we wants it clean
0536 if (!Files.rmdir(errDir))
0537 Out.prln("cannot delete old error directory: " + errDir);
0538 }
0539 Out.prln("Create error directory: " + errDir + "<BR><BR>");
0540 errDir.mkdir();
0541 }
0542
0543 //looked for marked texts only if the directory exists
0544 boolean processMarked = markedDir != null && markedDir.exists();
0545 if (!processMarked && (isMarkedStored || isMarkedClean)) {
0546 Out.prln("Cannot evaluate because no human-annotated documents exist.");
0547 return;
0548 }
0549
0550 if (isMarkedStored) {
0551 evaluateMarkedStored(markedDir, processedDir, errDir);
0552 return;
0553 }
0554 else if (isMarkedClean) {
0555 evaluateMarkedClean(markedDir, fileDir, errDir);
0556 return;
0557 }
0558
0559 Document persDoc = null;
0560 Document cleanDoc = null;
0561 Document markedDoc = null;
0562
0563 //open the datastore and process each document
0564 try {
0565 //open the data store
0566 DataStore sds = Factory.openDataStore
0567 ("gate.persist.SerialDataStore",
0568 processedDir.toURI().toURL().toExternalForm());
0569
0570 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0571 for (int i = 0; i < lrIDs.size(); i++) {
0572 String docID = (String) lrIDs.get(i);
0573
0574 //read the stored document
0575 FeatureMap features = Factory.newFeatureMap();
0576 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0577 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0578 FeatureMap hparams = Factory.newFeatureMap();
0579 // Gate.setHiddenAttribute(hparams, true);
0580
0581 persDoc = (Document) Factory.createResource(
0582 "gate.corpora.DocumentImpl",
0583 features, hparams);
0584
0585 if (isMoreInfoMode) {
0586 StringBuffer errName = new StringBuffer(persDoc.getName());
0587 errName.replace(
0588 persDoc.getName().lastIndexOf("."),
0589 persDoc.getName().length(),
0590 ".err");
0591 Out.prln("<H2>" +
0592 "<a href=\"err/" + errName.toString() + "\">"
0593 + persDoc.getName() + "</a>" + "</H2>");
0594 }
0595 else
0596 Out.prln("<H2>" + persDoc.getName() + "</H2>");
0597
0598 File cleanDocFile = new File(fileDir, persDoc.getName());
0599 //try reading the original document from clean
0600 if (!cleanDocFile.exists()) {
0601 Out.prln("Warning: Cannot find original document " +
0602 persDoc.getName() + " in " + fileDir);
0603 }
0604 else {
0605 FeatureMap params = Factory.newFeatureMap();
0606 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURI().toURL());
0607 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0608 documentEncoding);
0609
0610 // create the document
0611 cleanDoc = (Document) Factory.createResource(
0612 "gate.corpora.DocumentImpl", params, hparams);
0613 cleanDoc.setName(persDoc.getName());
0614 }
0615
0616 //try finding the marked document
0617 StringBuffer docName = new StringBuffer(persDoc.getName());
0618 if (!isMarkedDS) {
0619 docName.replace(
0620 persDoc.getName().lastIndexOf("."),
0621 docName.length(),
0622 ".xml");
0623 File markedDocFile = new File(markedDir, docName.toString());
0624 if (!processMarked || !markedDocFile.exists()) {
0625 Out.prln("Warning: Cannot find human-annotated document " +
0626 markedDocFile + " in " + markedDir);
0627 }
0628 else {
0629 FeatureMap params = Factory.newFeatureMap();
0630 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0631 markedDocFile.toURI().toURL());
0632 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0633 documentEncoding);
0634
0635 // create the document
0636 markedDoc = (Document) Factory.createResource(
0637 "gate.corpora.DocumentImpl", params, hparams);
0638 markedDoc.setName(persDoc.getName());
0639 }
0640 }
0641 else {
0642 //open marked from a DS
0643 //open the data store
0644 DataStore sds1 = Factory.openDataStore
0645 ("gate.persist.SerialDataStore",
0646 markedDir.toURI().toURL().toExternalForm());
0647
0648 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0649 boolean found = false;
0650 int k = 0;
0651 //search for the marked doc with the same name
0652 while (k < lrIDs1.size() && !found) {
0653 String docID1 = (String) lrIDs1.get(k);
0654
0655 //read the stored document
0656 FeatureMap features1 = Factory.newFeatureMap();
0657 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0658 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0659 Document tempDoc = (Document) Factory.createResource(
0660 "gate.corpora.DocumentImpl",
0661 features1, hparams);
0662 //check whether this is our doc
0663 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
0664 endsWith(persDoc.getName())) {
0665 found = true;
0666 markedDoc = tempDoc;
0667 }
0668 else k++;
0669 }
0670 }
0671
0672 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0673
0674 if (persDoc != null) {
0675 final gate.Document pd = persDoc;
0676 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0677 public void run() {
0678 Factory.deleteResource(pd);
0679 }
0680 });
0681 }
0682 if (cleanDoc != null) {
0683 final gate.Document cd = cleanDoc;
0684 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0685 public void run() {
0686 Factory.deleteResource(cd);
0687 }
0688 });
0689 }
0690 if (markedDoc != null) {
0691 final gate.Document md = markedDoc;
0692 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0693 public void run() {
0694 Factory.deleteResource(md);
0695 }
0696 });
0697 }
0698
0699 } //for loop through saved docs
0700 sds.close();
0701 }
0702 catch (java.net.MalformedURLException ex) {
0703 throw (GateRuntimeException)
0704 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0705 .initCause(ex);
0706 }
0707 catch (PersistenceException ex1) {
0708 throw (GateRuntimeException)
0709 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0710 .initCause(ex1);
0711 }
0712 catch (ResourceInstantiationException ex2) {
0713 throw (GateRuntimeException)
0714 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0715 .initCause(ex2);
0716 }
0717
0718 } //evaluateCorpus
0719
0720 protected void evaluateMarkedStored(File markedDir, File storedDir,
0721 File errDir) {
0722 Document persDoc = null;
0723 Document cleanDoc = null;
0724 Document markedDoc = null;
0725
0726 //open the datastore and process each document
0727 try {
0728 //open the data store
0729 DataStore sds = Factory.openDataStore
0730 ("gate.persist.SerialDataStore",
0731 storedDir.toURI().toURL().toExternalForm());
0732
0733 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0734 for (int i = 0; i < lrIDs.size(); i++) {
0735 String docID = (String) lrIDs.get(i);
0736
0737 //read the stored document
0738 FeatureMap features = Factory.newFeatureMap();
0739 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0740 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0741
0742 FeatureMap hparams = Factory.newFeatureMap();
0743 // Gate.setHiddenAttribute(hparams, true);
0744
0745 persDoc = (Document) Factory.createResource(
0746 "gate.corpora.DocumentImpl",
0747 features, hparams);
0748
0749 if (isMoreInfoMode) {
0750 StringBuffer errName = new StringBuffer(persDoc.getName());
0751 errName.replace(
0752 persDoc.getName().lastIndexOf("."),
0753 persDoc.getName().length(),
0754 ".err");
0755 Out.prln("<H2>" +
0756 "<a href=\"err/" + errName.toString() + "\">"
0757 + persDoc.getName() + "</a>" + "</H2>");
0758 }
0759 else
0760 Out.prln("<H2>" + persDoc.getName() + "</H2>");
0761
0762 if (!this.isMarkedDS) { //try finding the marked document as file
0763 StringBuffer docName = new StringBuffer(persDoc.getName());
0764 docName.replace(
0765 persDoc.getName().lastIndexOf("."),
0766 docName.length(),
0767 ".xml");
0768 File markedDocFile = new File(markedDir, docName.toString());
0769 if (!markedDocFile.exists()) {
0770 Out.prln("Warning: Cannot find human-annotated document " +
0771 markedDocFile + " in " + markedDir);
0772 }
0773 else {
0774 FeatureMap params = Factory.newFeatureMap();
0775 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0776 markedDocFile.toURI().toURL());
0777 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0778 documentEncoding);
0779
0780 // create the document
0781 markedDoc = (Document) Factory.createResource(
0782 "gate.corpora.DocumentImpl", params, hparams);
0783 markedDoc.setName(persDoc.getName());
0784 } //find marked as file
0785 }
0786 else {
0787 try {
0788 //open marked from a DS
0789 //open the data store
0790 DataStore sds1 = Factory.openDataStore
0791 ("gate.persist.SerialDataStore",
0792 markedDir.toURI().toURL().toExternalForm());
0793
0794 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0795 boolean found = false;
0796 int k = 0;
0797 //search for the marked doc with the same name
0798 while (k < lrIDs1.size() && !found) {
0799 String docID1 = (String) lrIDs1.get(k);
0800
0801 //read the stored document
0802 FeatureMap features1 = Factory.newFeatureMap();
0803 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0804 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0805 Document tempDoc = (Document) Factory.createResource(
0806 "gate.corpora.DocumentImpl",
0807 features1, hparams);
0808 //check whether this is our doc
0809 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
0810 endsWith(persDoc.getName())) {
0811 found = true;
0812 markedDoc = tempDoc;
0813 }
0814 else k++;
0815 }
0816 }
0817 catch (java.net.MalformedURLException ex) {
0818 Out.prln("Error finding marked directory " +
0819 markedDir.getAbsolutePath());
0820 }
0821 catch (gate.persist.PersistenceException ex1) {
0822 Out.prln(
0823 "Error opening marked as a datastore (-marked_ds specified)");
0824 }
0825 catch (gate.creole.ResourceInstantiationException ex2) {
0826 Out.prln(
0827 "Error opening marked as a datastore (-marked_ds specified)");
0828 }
0829 }
0830
0831 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0832 if (persDoc != null) {
0833 final gate.Document pd = persDoc;
0834 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0835 public void run() {
0836 Factory.deleteResource(pd);
0837 }
0838 });
0839 }
0840 if (markedDoc != null) {
0841 final gate.Document md = markedDoc;
0842 javax.swing.SwingUtilities.invokeLater(new Runnable() {
0843 public void run() {
0844 Factory.deleteResource(md);
0845 }
0846 });
0847 }
0848
0849 } //for loop through saved docs
0850 sds.close();
0851
0852 }
0853 catch (java.net.MalformedURLException ex) {
0854 throw (GateRuntimeException)
0855 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0856 .initCause(ex);
0857 }
0858 catch (PersistenceException ex1) {
0859 throw (GateRuntimeException)
0860 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0861 .initCause(ex1);
0862 }
0863 catch (ResourceInstantiationException ex2) {
0864 throw (GateRuntimeException)
0865 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0866 .initCause(ex2);
0867 }
0868
0869 } //evaluateMarkedStored
0870
0871 protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
0872 Document persDoc = null;
0873 Document cleanDoc = null;
0874 Document markedDoc = null;
0875
0876 File[] cleanDocs = cleanDir.listFiles();
0877 for (int i = 0; i < cleanDocs.length; i++) {
0878 if (!cleanDocs[i].isFile())
0879 continue;
0880
0881 //try reading the original document from clean
0882 FeatureMap params = Factory.newFeatureMap();
0883 try {
0884 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURI().toURL());
0885 }
0886 catch (java.net.MalformedURLException ex) {
0887 Out.prln("Cannot create document from file: " +
0888 cleanDocs[i].getAbsolutePath());
0889 continue;
0890 }
0891 //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0892 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0893
0894 FeatureMap hparams = Factory.newFeatureMap();
0895 // Gate.setHiddenAttribute(hparams, true);
0896
0897 // create the document
0898 try {
0899 cleanDoc = (Document) Factory.createResource(
0900 "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
0901 }
0902 catch (gate.creole.ResourceInstantiationException ex) {
0903 Out.prln("Cannot create document from file: " +
0904 cleanDocs[i].getAbsolutePath());
0905 continue;
0906 }
0907
0908 if (isMoreInfoMode) {
0909 StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
0910 errName.replace(
0911 cleanDocs[i].getName().lastIndexOf("."),
0912 cleanDocs[i].getName().length(),
0913 ".err");
0914 Out.prln("<H2>" +
0915 "<a href=\"err/" + errName.toString() + "\">"
0916 + cleanDocs[i].getName() + "</a>" + "</H2>");
0917 }
0918 else
0919 Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
0920
0921 //try finding the marked document
0922 if (!isMarkedDS) {
0923 StringBuffer docName = new StringBuffer(cleanDoc.getName());
0924 docName.replace(
0925 cleanDoc.getName().lastIndexOf("."),
0926 docName.length(),
0927 ".xml");
0928 File markedDocFile = new File(markedDir, docName.toString());
0929 if (!markedDocFile.exists()) {
0930 Out.prln("Warning: Cannot find human-annotated document " +
0931 markedDocFile + " in " + markedDir);
0932 continue;
0933 }
0934 else {
0935 params = Factory.newFeatureMap();
0936 try {
0937 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0938 markedDocFile.toURI().toURL());
0939 }
0940 catch (java.net.MalformedURLException ex) {
0941 Out.prln("Cannot create document from file: " +
0942 markedDocFile.getAbsolutePath());
0943 continue;
0944 }
0945 //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0946 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0947
0948 // create the document
0949 try {
0950 markedDoc = (Document) Factory.createResource(
0951 "gate.corpora.DocumentImpl", params,
0952 hparams, cleanDoc.getName());
0953 }
0954 catch (gate.creole.ResourceInstantiationException ex) {
0955 Out.prln("Cannot create document from file: " +
0956 markedDocFile.getAbsolutePath());
0957 continue;
0958 }
0959
0960 } //if markedDoc exists
0961 }
0962 else {
0963 try {
0964 //open marked from a DS
0965 //open the data store
0966 DataStore sds1 = Factory.openDataStore
0967 ("gate.persist.SerialDataStore",
0968 markedDir.toURI().toURL().toExternalForm());
0969
0970 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0971 boolean found = false;
0972 int k = 0;
0973 //search for the marked doc with the same name
0974 while (k < lrIDs1.size() && !found) {
0975 String docID1 = (String) lrIDs1.get(k);
0976
0977 //read the stored document
0978 FeatureMap features1 = Factory.newFeatureMap();
0979 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0980 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0981 Document tempDoc = (Document) Factory.createResource(
0982 "gate.corpora.DocumentImpl",
0983 features1, hparams);
0984 //check whether this is our doc
0985 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
0986 endsWith(cleanDoc.getName())) {
0987 found = true;
0988 markedDoc = tempDoc;
0989 }
0990 else k++;
0991 }
0992 }
0993 catch (java.net.MalformedURLException ex) {
0994 Out.prln("Error finding marked directory " +
0995 markedDir.getAbsolutePath());
0996 }
0997 catch (gate.persist.PersistenceException ex1) {
0998 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
0999 }
1000 catch (gate.creole.ResourceInstantiationException ex2) {
1001 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1002 }
1003 } //if using a DS for marked
1004
1005 try {
1006 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
1007 }
1008 catch (gate.creole.ResourceInstantiationException ex) {
1009 ex.printStackTrace();
1010 Out.prln("Evaluate failed on document: " + cleanDoc.getName());
1011 }
1012 if (persDoc != null) {
1013 final gate.Document pd = persDoc;
1014 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1015 public void run() {
1016 Factory.deleteResource(pd);
1017 }
1018 });
1019 }
1020 if (cleanDoc != null) {
1021 final gate.Document cd = cleanDoc;
1022 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1023 public void run() {
1024 Factory.deleteResource(cd);
1025 }
1026 });
1027 }
1028 if (markedDoc != null) {
1029 final gate.Document md = markedDoc;
1030 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1031 public void run() {
1032 Factory.deleteResource(md);
1033 }
1034 });
1035 }
1036
1037 } //for loop through clean docs
1038
1039 } //evaluateMarkedClean
1040
1041 protected void processDocument(Document doc) {
1042 try {
1043 if (application instanceof CorpusController) {
1044 Corpus tempCorpus = Factory.newCorpus("temp");
1045 tempCorpus.add(doc);
1046 ( (CorpusController) application).setCorpus(tempCorpus);
1047 application.execute();
1048 Factory.deleteResource(tempCorpus);
1049 tempCorpus = null;
1050 }
1051 else {
1052 Iterator iter = application.getPRs().iterator();
1053 while (iter.hasNext())
1054 ( (ProcessingResource) iter.next()).setParameterValue("document", doc);
1055 application.execute();
1056 }
1057 }
1058 catch (ResourceInstantiationException ex) {
1059 throw (RuntimeException)
1060 new RuntimeException("Error executing application: "
1061 + ex.getMessage())
1062 .initCause(ex);
1063 }
1064 catch (ExecutionException ex) {
1065 throw (RuntimeException)
1066 new RuntimeException("Error executing application: "
1067 + ex.getMessage())
1068 .initCause(ex);
1069 }
1070 }
1071
1072 protected void evaluateDocuments(Document persDoc,
1073 Document cleanDoc, Document markedDoc,
1074 File errDir) throws
1075 ResourceInstantiationException {
1076 if (cleanDoc == null && markedDoc == null)
1077 return;
1078
1079 //we've got no types to compare
1080 if (annotTypes == null || annotTypes.isEmpty())
1081 return;
1082
1083 if (cleanDoc != null && !isMarkedStored) {
1084
1085 processDocument(cleanDoc);
1086
1087 int wordCount = countWords(cleanDoc);
1088 if (wordCount == 0)
1089 Out.prln("<BR>No Token annotations to count words in the document.");
1090 else
1091 Out.prln("<BR>Word count: " + wordCount);
1092 corpusWordCount += wordCount;
1093
1094 if (!isMarkedClean)
1095 evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
1096 else
1097 evaluateTwoDocs(markedDoc, cleanDoc, errDir);
1098
1099 }
1100 else
1101 evaluateTwoDocs(markedDoc, persDoc, errDir);
1102
1103 }
1104
1105 /**
1106 * Count all Token.kind=word annotations in the document
1107 */
1108 protected int countWords(Document annotDoc) {
1109 int count = 0;
1110
1111 if (annotDoc == null)return 0;
1112 // check for Token in outputSetName
1113 AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
1114 if (tokens == null)return 0;
1115
1116 Iterator<Annotation> it = tokens.iterator();
1117 Annotation currAnnotation;
1118 while (it.hasNext()) {
1119 currAnnotation = it.next();
1120 Object feature = currAnnotation.getFeatures().get("kind");
1121 if (feature != null && "word".equalsIgnoreCase( (String) feature))++count;
1122 } // while
1123
1124 return count;
1125 }
1126
1127 protected void evaluateAllThree(Document persDoc,
1128 Document cleanDoc, Document markedDoc,
1129 File errDir) throws
1130 ResourceInstantiationException {
1131 //first start the table and its header
1132 printTableHeader();
1133
1134 // store annotation diff in .err file
1135 Writer errWriter = null;
1136 if (isMoreInfoMode && errDir != null) {
1137 StringBuffer docName = new StringBuffer(cleanDoc.getName());
1138 docName.replace(
1139 cleanDoc.getName().lastIndexOf("."),
1140 docName.length(),
1141 ".err");
1142 File errFile = new File(errDir, docName.toString());
1143 String encoding = ( (gate.corpora.DocumentImpl) cleanDoc).getEncoding();
1144 try {
1145 errWriter = new FileWriter(errFile, false);
1146 /*
1147 if(encoding == null) {
1148 errWriter = new OutputStreamWriter(
1149 new FileOutputStream(errFile, false));
1150 } else {
1151 errWriter = new OutputStreamWriter(
1152 new FileOutputStream(errFile, false), encoding);
1153 }*/
1154 }
1155 catch (Exception ex) {
1156 Out.prln("Exception when creating the error file " + errFile + ": "
1157 + ex.getMessage());
1158 errWriter = null;
1159 }
1160 }
1161
1162 for (int jj = 0; jj < annotTypes.size(); jj++) {
1163 String annotType = (String) annotTypes.get(jj);
1164
1165 AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1166 //we don't have this annotation type in this document
1167 if (annotDiffer == null)
1168 continue;
1169
1170 //increase the number of processed documents
1171 docNumber++;
1172 //add precison and recall to the sums
1173 updateStatistics(annotDiffer, annotType);
1174
1175 AnnotationDiffer annotDiffer1 =
1176 measureDocs(markedDoc, persDoc, annotType);
1177
1178 Out.prln("<TR>");
1179
1180 if (isMoreInfoMode && annotDiffer1 != null
1181 &&
1182 (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1183 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1184 )
1185 Out.prln("<TD> " + annotType + "_new" + "</TD>");
1186 else
1187 Out.prln("<TD> " + annotType + "</TD>");
1188
1189 if (isMoreInfoMode) {
1190 if (annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1191
1192 Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1193 Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1194 Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1195 Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1196 }
1197
1198 Out.prln("<TD>");
1199
1200 //check the precision first
1201 if (annotDiffer1 != null) {
1202
1203 if (annotDiffer1.getPrecisionAverage()
1204 < annotDiffer.getPrecisionAverage()) {
1205 Out.prln("<P><Font color=blue> ");
1206 Out.prln(annotDiffer.getPrecisionAverage());
1207
1208 if (!isMoreInfoMode) {
1209 Out.pr("<BR>Precision increase on human-marked from ");
1210 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1211 Out.prln(annotDiffer.getPrecisionAverage());
1212 }
1213 Out.prln(" </Font></P>");
1214 }
1215 else if (annotDiffer1.getPrecisionAverage()
1216 > annotDiffer.getPrecisionAverage()) {
1217 Out.prln("<P><Font color=red> ");
1218 Out.prln(annotDiffer.getPrecisionAverage());
1219
1220 if (!isMoreInfoMode) {
1221 Out.pr("<BR>Precision decrease on human-marked from ");
1222 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1223 Out.prln(annotDiffer.getPrecisionAverage());
1224 }
1225 Out.prln(" </Font></P>");
1226 }
1227 else
1228 Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() +
1229 " </P>");
1230 }
1231 else
1232 Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1233
1234 Out.prln("</TD>");
1235
1236 Out.prln("<TD>");
1237
1238 //check the recall now
1239 if (annotDiffer1 != null) {
1240
1241 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1242 Out.prln("<P><Font color=blue> ");
1243 Out.prln(annotDiffer.getRecallAverage());
1244
1245 if (!isMoreInfoMode) {
1246 Out.pr("<BR>Recall increase on human-marked from ");
1247 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1248 Out.prln(annotDiffer.getRecallAverage());
1249 }
1250 Out.prln(" </Font></P>");
1251 }
1252 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1253 Out.prln("<P><Font color=red> ");
1254 Out.prln(annotDiffer.getRecallAverage());
1255
1256 if (!isMoreInfoMode) {
1257 Out.pr("<BR>Recall decrease on human-marked from ");
1258 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1259 Out.prln(annotDiffer.getRecallAverage());
1260 }
1261 Out.prln(" </Font></P>");
1262 }
1263 else
1264 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1265 }
1266 else
1267 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1268
1269 Out.prln("</TD>");
1270
1271 //check the recall now
1272 if (isVerboseMode) {
1273 Out.prln("<TD>");
1274 if (annotDiffer.getRecallAverage() < threshold
1275 || annotDiffer.getPrecisionAverage() < threshold) {
1276 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1277 }
1278 else {
1279 Out.prln(" ");
1280 }
1281 Out.prln("</TD>");
1282 }
1283
1284 Out.prln("</TR>");
1285
1286 // show one more table line for processed document
1287 if (isMoreInfoMode && annotDiffer1 != null
1288 &&
1289 (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1290 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1291 ) {
1292
1293 Out.prln("<TR>");
1294 Out.prln("<TD> " + annotType + "_old" + "</TD>");
1295
1296 Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1297 Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1298 Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1299 Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1300
1301 Out.prln("<TD>");
1302 if (annotDiffer1.getPrecisionAverage() <
1303 annotDiffer.getPrecisionAverage())
1304
1305 Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1306 + "</Font></P>");
1307 else if (annotDiffer1.getPrecisionAverage() >
1308 annotDiffer.getPrecisionAverage())
1309 Out.prln(
1310 "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1311 + " </Font></P>");
1312 else
1313 Out.prln(annotDiffer1.getPrecisionAverage());
1314
1315 Out.prln("</TD>");
1316
1317 Out.prln("<TD>");
1318 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1319 Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1320 + " </Font></P>");
1321 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1322 Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1323 + " </Font></P>");
1324 else
1325 Out.prln(annotDiffer1.getRecallAverage());
1326
1327 Out.prln("</TD>");
1328
1329 //check the recall now
1330 if (isVerboseMode) {
1331 // create error file and start writing
1332
1333 Out.prln("<TD>");
1334 if (annotDiffer.getRecallAverage() < threshold
1335 || annotDiffer.getPrecisionAverage() < threshold) {
1336 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1337 }
1338 else {
1339 Out.prln(" ");
1340 }
1341 Out.prln("</TD>");
1342 }
1343 Out.prln("</TR>");
1344 } // if(isMoreInfoMode && annotDiff1 != null)
1345
1346 if (isMoreInfoMode && errDir != null)
1347 storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1348 } //for loop through annotation types
1349 Out.prln("</TABLE>");
1350
1351 try {
1352 if (errWriter != null)
1353 errWriter.close();
1354 }
1355 catch (Exception ex) {
1356 Out.prln("Exception on close of error file " + errWriter + ": "
1357 + ex.getMessage());
1358 }
1359 } //evaluateAllThree
1360
1361 protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1362 File errDir) throws
1363 ResourceInstantiationException {
1364
1365 //first start the table and its header
1366 printTableHeader();
1367
1368 // store annotation diff in .err file
1369 Writer errWriter = null;
1370 if (isMoreInfoMode && errDir != null) {
1371 StringBuffer docName = new StringBuffer(keyDoc.getName());
1372 docName.replace(
1373 keyDoc.getName().lastIndexOf("."),
1374 docName.length(),
1375 ".err");
1376 File errFile = new File(errDir, docName.toString());
1377 String encoding = ( (gate.corpora.DocumentImpl) keyDoc).getEncoding();
1378 try {
1379 errWriter = new FileWriter(errFile, false);
1380 /*
1381 if(encoding == null) {
1382 errWriter = new OutputStreamWriter(
1383 new FileOutputStream(errFile, false));
1384 } else {
1385 errWriter = new OutputStreamWriter(
1386 new FileOutputStream(errFile, false), encoding);
1387 }*/
1388 }
1389 catch (Exception ex) {
1390 Out.prln("Exception when creating the error file " + errFile + ": "
1391 + ex.getMessage());
1392 errWriter = null;
1393 }
1394 }
1395
1396 for (int jj = 0; jj < annotTypes.size(); jj++) {
1397 String annotType = (String) annotTypes.get(jj);
1398
1399 AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1400 //we don't have this annotation type in this document
1401 if (annotDiff == null)
1402 continue;
1403
1404 //increase the number of processed documents
1405 docNumber++;
1406 //add precison and recall to the sums
1407 updateStatistics(annotDiff, annotType);
1408
1409 Out.prln("<TR>");
1410 Out.prln("<TD>" + annotType + "</TD>");
1411
1412 if (isMoreInfoMode) {
1413 Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1414 Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1415 Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1416 Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1417 }
1418
1419 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1420 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1421 //check the recall now
1422 if (isVerboseMode) {
1423 Out.prln("<TD>");
1424 if (annotDiff.getRecallAverage() < threshold
1425 || annotDiff.getPrecisionAverage() < threshold) {
1426 printAnnotations(annotDiff, keyDoc, respDoc);
1427 }
1428 else {
1429 Out.prln(" ");
1430 }
1431 Out.prln("</TD>");
1432 }
1433 Out.prln("</TR>");
1434
1435 if (isMoreInfoMode && errDir != null)
1436 storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1437 } //for loop through annotation types
1438 Out.prln("</TABLE>");
1439
1440 try {
1441 if (errWriter != null)
1442 errWriter.close();
1443 }
1444 catch (Exception ex) {
1445 Out.prln("Exception on close of error file " + errWriter + ": "
1446 + ex.getMessage());
1447 }
1448 } //evaluateTwoDocs
1449
1450 protected void printTableHeader() {
1451 Out.prln("<TABLE BORDER=1");
1452 Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1453
1454 if (isMoreInfoMode)
1455 Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1456 + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1457
1458 Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1459
1460 if (isVerboseMode)
1461 Out.pr("<TD><B>Annotations</B></TD>");
1462
1463 Out.prln("</TR>");
1464 }
1465
1466 protected void updateStatistics(AnnotationDiffer annotDiffer,
1467 String annotType) {
1468 double precisionAverage = ( (double) ( (double) annotDiffer.
1469 getPrecisionLenient() +
1470 annotDiffer.getPrecisionStrict()) /
1471 (double) (2.0));
1472 if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1473 precisionSum += precisionAverage;
1474
1475 double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1476 annotDiffer.getRecallStrict()) /
1477 (double) (2.0));
1478 if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1479 recallSum += recallAverage;
1480
1481 double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1482 annotDiffer.getFMeasureStrict(1.0)) /
1483 (double) (2.0));
1484 if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1485 fMeasureSum += fMeasureAverage;
1486
1487 Double oldPrecision = (Double) precisionByType.get(annotType);
1488 if (oldPrecision == null)
1489 precisionByType.put(annotType, new Double(precisionAverage));
1490 else
1491 precisionByType.put(annotType,
1492 new Double(oldPrecision.doubleValue() + precisionAverage));
1493
1494 Integer precCount = (Integer) prCountByType.get(annotType);
1495 if (precCount == null)
1496 prCountByType.put(annotType, new Integer(1));
1497 else
1498 prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1499
1500 Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1501 if (oldFMeasure == null)
1502 fMeasureByType.put(annotType, new Double(fMeasureAverage));
1503 else
1504 fMeasureByType.put(annotType,
1505 new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1506
1507 Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1508 if (fCount == null)
1509 fMeasureCountByType.put(annotType, new Integer(1));
1510 else
1511 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1512
1513 Double oldRecall = (Double) recallByType.get(annotType);
1514 if (oldRecall == null)
1515 recallByType.put(annotType, new Double(recallAverage));
1516 else
1517 recallByType.put(annotType,
1518 new Double(oldRecall.doubleValue() + recallAverage));
1519
1520 Integer recCount = (Integer) recCountByType.get(annotType);
1521 if (recCount == null)
1522 recCountByType.put(annotType, new Integer(1));
1523 else
1524 recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1525
1526 //Update the missing, spurious, correct, and partial counts
1527 Long oldMissingNo = (Long) missingByType.get(annotType);
1528 if (oldMissingNo == null)
1529 missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1530 else
1531 missingByType.put(annotType,
1532 new Long(oldMissingNo.longValue() +
1533 annotDiffer.getMissing()));
1534
1535 Long oldCorrectNo = (Long) correctByType.get(annotType);
1536 if (oldCorrectNo == null)
1537 correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1538 else
1539 correctByType.put(annotType,
1540 new Long(oldCorrectNo.longValue() +
1541 annotDiffer.getCorrectMatches()));
1542
1543 Long oldPartialNo = (Long) partialByType.get(annotType);
1544 if (oldPartialNo == null)
1545 partialByType.put(annotType,
1546 new Long(annotDiffer.getPartiallyCorrectMatches()));
1547 else
1548 partialByType.put(annotType,
1549 new Long(oldPartialNo.longValue() +
1550 annotDiffer.getPartiallyCorrectMatches()));
1551
1552 Long oldSpuriousNo = (Long) spurByType.get(annotType);
1553 if (oldSpuriousNo == null)
1554 spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1555 else
1556 spurByType.put(annotType,
1557 new Long(oldSpuriousNo.longValue() +
1558 annotDiffer.getSpurious()));
1559 }
1560
1561 /**
1562 * Update statistics for processed documents
1563 * The same procedure as updateStatistics with different hashTables
1564 */
1565 protected void updateStatisticsProc(AnnotationDiffer annotDiffer,
1566 String annotType) {
1567 hasProcessed = true;
1568 double precisionAverage = ( (double) (annotDiffer.getPrecisionLenient() +
1569 annotDiffer.getPrecisionStrict()) /
1570 (double) (2.0));
1571 if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1572 proc_precisionSum += precisionAverage;
1573
1574 double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1575 annotDiffer.getRecallStrict()) /
1576 (double) (2.0));
1577 if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1578 proc_recallSum += recallAverage;
1579
1580 double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1581 annotDiffer.getFMeasureStrict(1.0)) /
1582 (double) (2.0));
1583 if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1584 proc_fMeasureSum += fMeasureAverage;
1585
1586 Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1587 if (oldPrecision == null)
1588 proc_precisionByType.put(annotType, new Double(precisionAverage));
1589 else
1590 proc_precisionByType.put(annotType,
1591 new Double(oldPrecision.doubleValue() +
1592 precisionAverage));
1593 Integer precCount = (Integer) proc_prCountByType.get(annotType);
1594 if (precCount == null)
1595 proc_prCountByType.put(annotType, new Integer(1));
1596 else
1597 proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1598
1599 Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1600 if (oldFMeasure == null)
1601 proc_fMeasureByType.put(annotType,
1602 new Double(fMeasureAverage));
1603 else
1604 proc_fMeasureByType.put(annotType,
1605 new Double(oldFMeasure.doubleValue() +
1606 fMeasureAverage));
1607 Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1608 if (fCount == null)
1609 proc_fMeasureCountByType.put(annotType, new Integer(1));
1610 else
1611 proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1612
1613 Double oldRecall = (Double) proc_recallByType.get(annotType);
1614 if (oldRecall == null)
1615 proc_recallByType.put(annotType,
1616 new Double(recallAverage));
1617 else
1618 proc_recallByType.put(annotType,
1619 new Double(oldRecall.doubleValue() +
1620 recallAverage));
1621 Integer recCount = (Integer) proc_recCountByType.get(annotType);
1622 if (recCount == null)
1623 proc_recCountByType.put(annotType, new Integer(1));
1624 else
1625 proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1626
1627 //Update the missing, spurious, correct, and partial counts
1628 Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1629 if (oldMissingNo == null)
1630 proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1631 else
1632 proc_missingByType.put(annotType,
1633 new Long(oldMissingNo.longValue() +
1634 annotDiffer.getMissing()));
1635
1636 Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1637 if (oldCorrectNo == null)
1638 proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1639 else
1640 proc_correctByType.put(annotType,
1641 new Long(oldCorrectNo.longValue() +
1642 annotDiffer.getCorrectMatches()));
1643
1644 Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1645 if (oldPartialNo == null)
1646 proc_partialByType.put(annotType,
1647 new Long(annotDiffer.getPartiallyCorrectMatches()));
1648 else
1649 proc_partialByType.put(annotType,
1650 new Long(oldPartialNo.longValue() +
1651 annotDiffer.getPartiallyCorrectMatches()));
1652
1653 Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1654 if (oldSpuriousNo == null)
1655 proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1656 else
1657 proc_spurByType.put(annotType,
1658 new Long(oldSpuriousNo.longValue() +
1659 annotDiffer.getSpurious()));
1660 }
1661
1662 public void printStatistics() {
1663
1664 Out.prln("<H2> Statistics </H2>");
1665
1666 /*
1667 Out.prln("<H3> Precision </H3>");
1668 if (precisionByType != null && !precisionByType.isEmpty()) {
1669 Iterator iter = precisionByType.keySet().iterator();
1670 while (iter.hasNext()) {
1671 String annotType = (String) iter.next();
1672 Out.prln(annotType + ": "
1673 + ((Double)precisionByType.get(annotType)).doubleValue()
1674 /
1675 ((Integer)prCountByType.get(annotType)).intValue()
1676 + "<P>");
1677 }//while
1678 }
1679 Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1680
1681 Out.prln("<H3> Recall </H3>");
1682 if (recallByType != null && !recallByType.isEmpty()) {
1683 Iterator iter = recallByType.keySet().iterator();
1684 while (iter.hasNext()) {
1685 String annotType = (String) iter.next();
1686 Out.prln(annotType + ": "
1687 + ((Double)recallByType.get(annotType)).doubleValue()
1688 /
1689 ((Integer)recCountByType.get(annotType)).intValue()
1690 + "<P>");
1691 }//while
1692 }
1693
1694 Out.prln("Overall recall: " + getRecallAverage()
1695 + "<P>");
1696 */
1697 if (annotTypes == null) {
1698 Out.prln("No types given for evaluation, cannot obtain precision/recall");
1699 return;
1700 }
1701 Out.prln("<table border=1>");
1702 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1703 "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1704 "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1705 "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1706 String annotType;
1707 for (int i = 0; i < annotTypes.size(); i++) {
1708 annotType = (String) annotTypes.get(i);
1709 printStatsForType(annotType);
1710 } //for
1711 Out.prln("</table>");
1712 } // updateStatisticsProc
1713
1714 protected void printStatsForType(String annotType) {
1715 long correct = (correctByType.get(annotType) == null) ? 0 :
1716 ( (Long) correctByType.get(annotType)).longValue();
1717 long partial = (partialByType.get(annotType) == null) ? 0 :
1718 ( (Long) partialByType.get(annotType)).longValue();
1719 long spurious = (spurByType.get(annotType) == null) ? 0 :
1720 ( (Long) spurByType.get(annotType)).longValue();
1721 long missing = (missingByType.get(annotType) == null) ? 0 :
1722 ( (Long) missingByType.get(annotType)).longValue();
1723 long actual = correct + partial + spurious;
1724 long possible = correct + partial + missing;
1725 //precision strict is correct/actual
1726 //precision is (correct + 0.5 * partially correct)/actual
1727 double precision = 0d;
1728 if (actual!=0)
1729 precision = (correct + 0.5 * partial) / actual;
1730
1731 //recall strict is correct/possible
1732 double recall = 0d;
1733 if (possible!=0)
1734 recall = (correct + 0.5 * partial) / possible;
1735
1736 //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1737 double fmeasure = 0d;
1738 if ((beta * beta * precision) + recall !=0){
1739 fmeasure =
1740 ( (beta * beta + 1) * precision * recall)
1741 /
1742 ( (beta * beta * precision) + recall);
1743 }
1744
1745 long proc_correct = 0;
1746 long proc_partial = 0;
1747 long proc_spurious = 0;
1748 long proc_missing = 0;
1749 long proc_actual = 0;
1750 long proc_possible = 0;
1751 double proc_precision = 0;
1752 double proc_recall = 0;
1753 double proc_fmeasure = 0;
1754
1755 if (hasProcessed) {
1756 // calculate values for processed
1757 proc_correct = (proc_correctByType.get(annotType) == null) ? 0 :
1758 ( (Long) proc_correctByType.get(annotType)).longValue();
1759 proc_partial = (proc_partialByType.get(annotType) == null) ? 0 :
1760 ( (Long) proc_partialByType.get(annotType)).longValue();
1761 proc_spurious = (proc_spurByType.get(annotType) == null) ? 0 :
1762 ( (Long) proc_spurByType.get(annotType)).longValue();
1763 proc_missing = (proc_missingByType.get(annotType) == null) ? 0 :
1764 ( (Long) proc_missingByType.get(annotType)).longValue();
1765 proc_actual = proc_correct + proc_partial + proc_spurious;
1766 proc_possible = proc_correct + proc_partial + proc_missing;
1767 //precision strict is correct/actual
1768 //precision is (correct + 0.5 * partially correct)/actual
1769 proc_precision = (proc_correct + 0.5 * proc_partial) / proc_actual;
1770 //recall strict is correct/possible
1771 proc_recall = (proc_correct + 0.5 * proc_partial) / proc_possible;
1772 //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1773 proc_fmeasure =
1774 ( (beta * beta + 1) * proc_precision * proc_recall)
1775 /
1776 ( (beta * beta * proc_precision) + proc_recall);
1777
1778 }
1779
1780 // output data
1781 Out.prln("<TR>");
1782 if (hasProcessed)
1783 Out.prln("<TD>" + annotType + "_new" + "</TD>");
1784 else
1785 Out.prln("<TD>" + annotType + "</TD>");
1786
1787 Out.prln("<TD>" + correct + "</TD>");
1788 Out.prln("<TD>" + partial + "</TD>");
1789 Out.prln("<TD>" + missing + "</TD>");
1790 Out.prln("<TD>" + spurious + "</TD>");
1791
1792 String strPrec = (isMoreInfoMode) ?
1793 avgPrint(precision, 4)
1794 : Double.toString(precision);
1795 String strRec = (isMoreInfoMode) ?
1796 avgPrint(recall, 4)
1797 : Double.toString(recall);
1798 String strFmes = (isMoreInfoMode) ?
1799 avgPrint(fmeasure, 4)
1800 : Double.toString(fmeasure);
1801
1802 if (hasProcessed && (precision < proc_precision))
1803 Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1804 else if (hasProcessed && (precision > proc_precision))
1805 Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1806 else
1807 Out.prln("<TD>" + strPrec + "</TD>");
1808 if (hasProcessed && (recall < proc_recall))
1809 Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1810 else if (hasProcessed && (recall > proc_recall))
1811 Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1812 else
1813 Out.prln("<TD>" + strRec + "</TD>");
1814 Out.prln("<TD>" + strFmes + "</TD>");
1815 Out.prln("</TR>");
1816
1817 if (hasProcessed) {
1818 // output data
1819 Out.prln("<TR>");
1820 Out.prln("<TD>" + annotType + "_old" + "</TD>");
1821
1822 Out.prln("<TD>" + proc_correct + "</TD>");
1823 Out.prln("<TD>" + proc_partial + "</TD>");
1824 Out.prln("<TD>" + proc_missing + "</TD>");
1825 Out.prln("<TD>" + proc_spurious + "</TD>");
1826
1827 String strProcPrec = (isMoreInfoMode) ?
1828 avgPrint(proc_precision, 4)
1829 : Double.toString(proc_precision);
1830 String strProcRec = (isMoreInfoMode) ?
1831 avgPrint(proc_recall, 4)
1832 : Double.toString(proc_recall);
1833 String strProcFmes = (isMoreInfoMode) ?
1834 avgPrint(proc_fmeasure, 4)
1835 : Double.toString(proc_fmeasure);
1836
1837 if (precision < proc_precision)
1838 Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1839 else if (precision > proc_precision)
1840 Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1841 else
1842 Out.prln("<TD>" + strProcPrec + "</TD>");
1843 if (recall < proc_recall)
1844 Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1845 else if (recall > proc_recall)
1846 Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1847 else
1848 Out.prln("<TD>" + strProcRec + "</TD>");
1849 Out.prln("<TD>" + strProcFmes + "</TD>");
1850 Out.prln("</TR>");
1851 }
1852 } //printStatsForType
1853
1854 //** Print @param value with @param count digits after decimal point */
1855 protected String avgPrint(double value, int count) {
1856 double newvalue;
1857 double power = Math.pow(10, count);
1858 newvalue = Math.round(value * power) / power;
1859 return Double.toString(newvalue);
1860 }
1861
1862 private double precisionSumCalc = 0;
1863 private double recallSumCalc = 0;
1864 private double fMeasureSumCalc = 0;
1865
1866 public double getPrecisionAverageCalc() {
1867 return precisionSumCalc;
1868 }
1869
1870 public double getRecallAverageCalc() {
1871 return recallSumCalc;
1872 }
1873
1874 public double getFmeasureAverageCalc() {
1875 return fMeasureSumCalc;
1876 }
1877
1878 protected void calculateAvgTotal() {
1879 long correct, partial, spurious, missing;
1880 long correctSum, partialSum, spuriousSum, missingSum;
1881
1882 if (annotTypes == null) {
1883 return;
1884 }
1885 correctSum = partialSum = spuriousSum = missingSum = 0;
1886
1887 String annotType;
1888 for (int i = 0; i < annotTypes.size(); i++) {
1889 annotType = (String) annotTypes.get(i);
1890 correct = (correctByType.get(annotType) == null) ? 0 :
1891 ( (Long) correctByType.get(annotType)).longValue();
1892 partial = (partialByType.get(annotType) == null) ? 0 :
1893 ( (Long) partialByType.get(annotType)).longValue();
1894 spurious = (spurByType.get(annotType) == null) ? 0 :
1895 ( (Long) spurByType.get(annotType)).longValue();
1896 missing = (missingByType.get(annotType) == null) ? 0 :
1897 ( (Long) missingByType.get(annotType)).longValue();
1898 correctSum += correct;
1899 partialSum += partial;
1900 spuriousSum += spurious;
1901 missingSum += missing;
1902 } //for
1903
1904 long actual = correctSum + partialSum + spuriousSum;
1905 long possible = correctSum + partialSum + missingSum;
1906
1907 if (actual == 0) {
1908 precisionSumCalc = 0;
1909 }
1910 else {
1911 precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1912 }
1913
1914 if (possible == 0) {
1915 recallSumCalc = 0;
1916 }
1917 else {
1918 recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1919 }
1920
1921 if (precisionSumCalc == 0 && recallSumCalc == 0) {
1922 fMeasureSumCalc = 0;
1923 }
1924 else {
1925 fMeasureSumCalc =
1926 ( (beta * beta + 1) * precisionSumCalc * recallSumCalc)
1927 /
1928 ( (beta * beta * precisionSumCalc) + recallSumCalc);
1929
1930 }
1931 } // calculateAvgTotal
1932
1933 protected AnnotationDiffer measureDocs(
1934 Document keyDoc, Document respDoc, String annotType) throws
1935 ResourceInstantiationException {
1936
1937 if (keyDoc == null || respDoc == null)
1938 return null;
1939
1940 if (annotSetName != null
1941 && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1942 return null;
1943 else if ( (annotSetName == null || annotSetName.equals(""))
1944 && keyDoc.getAnnotations().get(annotType) == null)
1945 return null;
1946
1947 // create an annotation diff
1948 AnnotationDiffer annotDiffer = new AnnotationDiffer();
1949 // set the feature names set for annotation differ
1950 annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1951 // we need to find the sets
1952 AnnotationSet keys, responses;
1953 if (annotSetName == null || annotSetName.equals("")) {
1954 keys = keyDoc.getAnnotations().get(annotType);
1955 responses = respDoc.getAnnotations().get(annotType);
1956 }
1957 else {
1958 keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1959 responses = respDoc.getAnnotations(outputSetName).get(annotType);
1960 }
1961
1962 // we have annotation sets so call the annotationDiffer
1963 List pairings = annotDiffer.calculateDiff(keys, responses);
1964 return annotDiffer;
1965 } // measureDocs
1966
1967 protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1968 Document keyDoc, Document respDoc,
1969 Writer errFileWriter) {
1970 if (errFileWriter == null)return; // exit on "no file"
1971
1972 try {
1973 // extract and store annotations
1974 Comparator comp = new OffsetComparator();
1975 TreeSet sortedSet = new TreeSet(comp);
1976 Set missingSet =
1977 annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1978 sortedSet.clear();
1979 sortedSet.addAll(missingSet);
1980 storeAnnotations(type + ".miss", sortedSet, keyDoc, errFileWriter);
1981 Set spuriousSet =
1982 annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1983 sortedSet.clear();
1984 sortedSet.addAll(spuriousSet);
1985 storeAnnotations(type + ".spur", sortedSet, respDoc, errFileWriter);
1986 Set partialSet =
1987 annotDiffer.getAnnotationsOfType(AnnotationDiffer.
1988 PARTIALLY_CORRECT_TYPE);
1989 sortedSet.clear();
1990 sortedSet.addAll(partialSet);
1991 storeAnnotations(type + ".part", sortedSet, respDoc, errFileWriter);
1992 }
1993 catch (Exception ex) {
1994 Out.prln("Exception on close of error file " + errFileWriter + ": "
1995 + ex.getMessage());
1996 }
1997 } // storeAnnotations
1998
1999 protected void storeAnnotations(String type, Set set, Document doc,
2000 Writer file) throws IOException {
2001
2002 if (set == null || set.isEmpty())
2003 return;
2004
2005 Iterator iter = set.iterator();
2006 Annotation ann;
2007 while (iter.hasNext()) {
2008 ann = (Annotation) iter.next();
2009 file.write(type);
2010 file.write(".");
2011 file.write(doc.getContent().toString().substring(
2012 ann.getStartNode().getOffset().intValue(),
2013 ann.getEndNode().getOffset().intValue()));
2014 file.write(".");
2015 file.write(ann.getStartNode().getOffset().toString());
2016 file.write(".");
2017 file.write(ann.getEndNode().getOffset().toString());
2018 file.write("\n");
2019 } //while
2020 } // storeAnnotations
2021
2022 protected void printAnnotations(AnnotationDiffer annotDiff,
2023 Document keyDoc, Document respDoc) {
2024 Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
2025 Set missingSet =
2026 annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2027 printAnnotations(missingSet, keyDoc);
2028 Out.prln("<BR>");
2029
2030 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
2031 Set spuriousSet =
2032 annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2033 printAnnotations(spuriousSet, respDoc);
2034 Out.prln("</BR>");
2035
2036 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
2037 Set partialSet =
2038 annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
2039 printAnnotations(partialSet, respDoc);
2040 }
2041
2042 protected void printAnnotations(Set set, Document doc) {
2043 if (set == null || set.isEmpty())
2044 return;
2045
2046 Iterator iter = set.iterator();
2047 while (iter.hasNext()) {
2048 Annotation ann = (Annotation) iter.next();
2049 Out.prln(
2050 "<B>" +
2051 doc.getContent().toString().substring(
2052 ann.getStartNode().getOffset().intValue(),
2053 ann.getEndNode().getOffset().intValue()) +
2054 "</B>: <I>[" + ann.getStartNode().getOffset() +
2055 "," + ann.getEndNode().getOffset() + "]</I>"
2056 // + "; features" + ann.getFeatures()
2057 );
2058 } //while
2059 } //printAnnotations
2060
2061 /**
2062 * The directory from which we should generate/evaluate the corpus
2063 */
2064 private File startDir;
2065 private File currDir;
2066 private static List annotTypes;
2067
2068 private Controller application = null;
2069 private File applicationFile = null;
2070
2071 //collect the sum of all precisions and recalls of all docs
2072 //and the number of docs, so I can calculate the average for
2073 //the corpus at the end
2074 private double precisionSum = 0.0;
2075 private double recallSum = 0.0;
2076 private double fMeasureSum = 0.0;
2077 private HashMap precisionByType = new HashMap();
2078 private HashMap prCountByType = new HashMap();
2079 private HashMap recallByType = new HashMap();
2080 private HashMap recCountByType = new HashMap();
2081 private HashMap fMeasureByType = new HashMap();
2082 private HashMap fMeasureCountByType = new HashMap();
2083
2084 private HashMap missingByType = new HashMap();
2085 private HashMap spurByType = new HashMap();
2086 private HashMap correctByType = new HashMap();
2087 private HashMap partialByType = new HashMap();
2088
2089 // statistic for processed
2090 static boolean hasProcessed = false;
2091 private double proc_precisionSum = 0;
2092 private double proc_recallSum = 0;
2093 private double proc_fMeasureSum = 0;
2094 private HashMap proc_precisionByType = new HashMap();
2095 private HashMap proc_prCountByType = new HashMap();
2096 private HashMap proc_recallByType = new HashMap();
2097 private HashMap proc_recCountByType = new HashMap();
2098 private HashMap proc_fMeasureByType = new HashMap();
2099 private HashMap proc_fMeasureCountByType = new HashMap();
2100
2101 private HashMap proc_missingByType = new HashMap();
2102 private HashMap proc_spurByType = new HashMap();
2103 private HashMap proc_correctByType = new HashMap();
2104 private HashMap proc_partialByType = new HashMap();
2105
2106 double beta = 1;
2107
2108 private int docNumber = 0;
2109
2110 /**
2111 * If true, the corpus tool will generate the corpus, otherwise it'll
2112 * run in evaluate mode
2113 */
2114 private boolean isGenerateMode = false;
2115
2116 /**
2117 * If true - show annotations for docs below threshold
2118 */
2119 private boolean isVerboseMode = false;
2120
2121 /**
2122 * If true - show more info in document table
2123 */
2124 private boolean isMoreInfoMode = false;
2125
2126 /**
2127 * The list of features used in the AnnotationDiff separated by comma
2128 * Example: "class;inst"
2129 */
2130 private Set diffFeaturesSet;
2131
2132 /**
2133 * If true, the corpus tool will evaluate stored against the human-marked
2134 * documents
2135 */
2136 private boolean isMarkedStored = false;
2137 private boolean isMarkedClean = false;
2138
2139 //whether marked are in a DS, not xml
2140 private boolean isMarkedDS = false;
2141
2142 private String annotSetName = "Key";
2143 private String outputSetName = null;
2144
2145 private double threshold = 0.5;
2146 private Properties configs = new Properties();
2147 private static int corpusWordCount = 0;
2148
2149 private String documentEncoding = "";
2150
2151 /** String to print when wrong command-line args */
2152 private static String usage =
2153 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
2154 + "[-verbose] [-moreinfo] directory-name application";
2155
2156 }
|