/*
 *  CorpusBenchmarkTool.java
 *
 *  Copyright (c) 1995-2010, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 24/Oct/2001
 *
 *  $Id: CorpusBenchmarkTool.java 12006 2009-12-01 17:24:28Z thomas_heitz $
 */
0016 
0017 package gate.util;
0018 
0019 import java.io.*;
0020 import java.util.*;
0021 
0022 import gate.*;
0023 import gate.util.AnnotationDiffer;
0024 import gate.creole.*;
0025 import gate.persist.PersistenceException;
0026 import gate.persist.SerialDataStore;
0027 
0028 public class CorpusBenchmarkTool {
0029   private static final String MARKED_DIR_NAME = "marked";
0030   private static final String CLEAN_DIR_NAME = "clean";
0031   private static final String CVS_DIR_NAME = "Cvs";
0032   private static final String PROCESSED_DIR_NAME = "processed";
0033   private static final String ERROR_DIR_NAME = "err";
0034 
0035   private static final boolean DEBUG = true;
0036 
0037   public CorpusBenchmarkTool() {}
0038 
0039   public void initPRs() {
0040     try {
0041       if (applicationFile == null)
0042         Out.prln("Application not set!");
0043       Out.prln("App file is: " + applicationFile.getAbsolutePath());
0044       application = (Controllergate.util.persistence.PersistenceManager
0045                     .loadObjectFromFile(applicationFile);
0046     }
0047     catch (Exception ex) {
0048       throw (GateRuntimeException)
0049         new GateRuntimeException("Corpus Benchmark Tool:" + ex.getMessage())
0050         .initCause(ex);
0051     }
0052   //initPRs
0053 
0054   public void unloadPRs() {
0055     //we have nothing to unload if no PRs are loaded
0056     if (isMarkedStored)
0057       return;
0058 
0059   }
0060 
0061   public void execute() {
0062     execute(startDir);
0063     if (application != null) {
0064       javax.swing.SwingUtilities.invokeLater(new Runnable() {
0065         public void run() {
0066 
0067           Iterator iter = new ArrayList(application.getPRs()).iterator();
0068           while (iter.hasNext())
0069             Factory.deleteResource( (Resourceiter.next());
0070 
0071           Factory.deleteResource(application);
0072         }
0073       });
0074     }
0075   }
0076 
0077   public void init() {
0078     //first read the corpus_tool.properties file
0079     File propFile = new File("corpus_tool.properties");
0080     Out.prln(propFile.getAbsolutePath());
0081     if (propFile.exists()) {
0082       try {
0083         InputStream inputStream = new FileInputStream(propFile);
0084         this.configs.load(inputStream);
0085         String thresholdString = this.configs.getProperty("threshold");
0086         if (thresholdString != null && !thresholdString.equals("")) {
0087           thresholdString=thresholdString.trim();
0088           this.threshold = (new Double(thresholdString)).doubleValue();
0089           Out.prln("New threshold is: " this.threshold + "<P>\n");
0090         }
0091         String setName = this.configs.getProperty("annotSetName");
0092         if (setName != null && !setName.equals("")) {
0093           setName=setName.trim();
0094           Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
0095           this.annotSetName = setName;
0096         }
0097         setName = this.configs.getProperty("outputSetName");
0098         if (setName != null && !setName.equals("")) {
0099           setName=setName.trim();
0100           Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
0101           this.outputSetName = setName;
0102         }
0103         String encodingString = this.configs.getProperty("encoding");
0104         if (encodingString != null && !encodingString.equals("")) {
0105           encodingString=encodingString.trim();
0106           this.documentEncoding = encodingString;
0107           Out.prln("New encoding is: " this.documentEncoding + "<P>\n");
0108         }
0109         String types = this.configs.getProperty("annotTypes");
0110         if (types != null && !types.equals("")) {
0111           types=types.trim();
0112           Out.prln("Using annotation types from the properties file. <P>\n");
0113           StringTokenizer strTok = new StringTokenizer(types, ";");
0114           annotTypes = new ArrayList();
0115           while (strTok.hasMoreTokens())
0116             annotTypes.add(strTok.nextToken());
0117         }
0118         else {
0119           annotTypes = new ArrayList();
0120           annotTypes.add("Organization");
0121           annotTypes.add("Person");
0122           annotTypes.add("Date");
0123           annotTypes.add("Location");
0124           annotTypes.add("Address");
0125           annotTypes.add("Money");
0126           annotTypes.add("Percent");
0127           annotTypes.add("GPE");
0128           annotTypes.add("Facility");
0129         }
0130         String features = this.configs.getProperty("annotFeatures");
0131         HashSet result = new HashSet();
0132         if (features != null && !features.equals("")) {
0133           features=features.trim();
0134           Out.pr("Using annotation features from the properties file. \n");
0135           java.util.StringTokenizer tok =
0136               new java.util.StringTokenizer(features, ";");
0137           String current;
0138           while (tok.hasMoreTokens()) {
0139             current = tok.nextToken();
0140             result.add(current);
0141           // while
0142         }
0143         diffFeaturesSet = result;
0144         Out.prln("Features: " + diffFeaturesSet + " <P>\n");
0145 
0146       }
0147       catch (IOException ex) {
0148         //just ignore the file and go on with the defaults
0149         this.configs = new Properties();
0150       }
0151     }
0152     else
0153       this.configs = new Properties();
0154 
0155     //we only initialise the PRs if they are going to be used
0156     //for processing unprocessed documents
0157     if (!this.isMarkedStored)
0158       initPRs();
0159 
0160   }
0161 
0162   public void execute(File dir) {
0163     if (dir == null)
0164       return;
0165     //first set the current directory to be the given one
0166     currDir = dir;
0167 
0168     File processedDir = null;
0169     File cleanDir = null;
0170     File markedDir = null;
0171     File errorDir = null;
0172 
0173     ArrayList subDirs = new ArrayList();
0174     File[] dirArray = currDir.listFiles();
0175     if (dirArray == null)return;
0176     for (int i = 0; i < dirArray.length; i++) {
0177       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
0178         continue;
0179       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
0180         cleanDir = dirArray[i];
0181       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
0182         markedDir = dirArray[i];
0183       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
0184         processedDir = dirArray[i];
0185       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
0186         errorDir = dirArray[i];
0187       else
0188         subDirs.add(dirArray[i]);
0189     }
0190 
0191     if (cleanDir == null)return;
0192     Out.prln("Processing directory: " + currDir + "<P>");
0193 
0194     if (this.isGenerateMode)
0195       generateCorpus(cleanDir, processedDir);
0196     else
0197       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
0198 
0199       //if no more subdirs left, return
0200     if (subDirs.isEmpty())
0201       return;
0202 
0203     //there are more subdirectories to traverse, so iterate through
0204     for (int j = 0; j < subDirs.size(); j++)
0205       execute( (FilesubDirs.get(j));
0206 
0207   //execute(dir)
0208 
0209   public static void main(String[] argsthrows GateException {
0210     Out.prln("<HTML>");
0211     Out.prln("<HEAD>");
0212     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
0213     for (int argC = 0; argC < args.length; ++argC)
0214       Out.pr(args[argC" ");
0215     Out.pr(" on " new Date() "</TITLE> </HEAD>");
0216     Out.prln("<BODY>");
0217     Out.prln("Please wait while GATE tools are initialised. <P>");
0218     // initialise GATE
0219     Gate.init();
0220 
0221     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
0222 
0223     List inputFiles = null;
0224     if (args.length < 1)throw new GateException(usage);
0225     int i = 0;
0226     while (i < args.length && args[i].startsWith("-")) {
0227       if (args[i].equals("-generate")) {
0228         Out.prln("Generating the corpus... <P>");
0229         corpusTool.setGenerateMode(true);
0230       }
0231       else if (args[i].equals("-marked_clean")) {
0232         Out.prln("Evaluating current grammars against human-annotated...<P>");
0233         corpusTool.setMarkedClean(true);
0234       }
0235       else if (args[i].equals("-marked_stored")) {
0236         Out.prln("Evaluating stored documents against human-annotated...<P>");
0237         corpusTool.setMarkedStored(true);
0238       }
0239       else if (args[i].equals("-marked_ds")) {
0240         Out.prln("Looking for marked docs in a datastore...<P>");
0241         corpusTool.setMarkedDS(true);
0242       }
0243       else if (args[i].equals("-verbose")) {
0244         Out.prln("Running in verbose mode. Will generate annotation " +
0245                  "information when precision/recall are lower than " +
0246                  corpusTool.getThreshold() "<P>");
0247         corpusTool.setVerboseMode(true);
0248       }
0249       else if (args[i].equals("-moreinfo")) {
0250         Out.prln("Show more details in document table...<P>");
0251         corpusTool.setMoreInfo(true);
0252       }
0253       i++; //just ignore the option, which we do not recognise
0254     //while
0255 
0256     String dirName = args[i];
0257     File dir = new File(dirName);
0258     if (!dir.isDirectory())
0259       throw new GateException(usage);
0260 
0261     //get the last argument which is the application
0262     i++;
0263     String appName = args[i];
0264     File appFile = new File(appName);
0265     if (!appFile.isFile())
0266       throw new GateException(usage);
0267     else
0268       corpusTool.setApplicationFile(appFile);
0269 
0270     corpusTool.init();
0271     corpusWordCount = 0;
0272 
0273     Out.prln("Measuring annotaitions of types: " +
0274              CorpusBenchmarkTool.annotTypes + "<P>");
0275 
0276     corpusTool.setStartDirectory(dir);
0277     corpusTool.execute();
0278     //if we're not generating the corpus, then print the precision and recall
0279     //statistics for the processed corpus
0280     if (!corpusTool.getGenerateMode())
0281       corpusTool.printStatistics();
0282 
0283     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
0284     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
0285     Out.prln("<BR>Overall average fMeasure: " + corpusTool.getFMeasureAverage());
0286     if (corpusWordCount == 0)
0287       Out.prln("<BR>No Token annotations to count words in the corpus.");
0288     else
0289       Out.prln("<BR>Overall word count: " + corpusWordCount);
0290 
0291     if (hasProcessed) {
0292       Out.prln("<P>Old Processed: ");
0293       Out.prln("<BR>Overall average precision: "
0294                + corpusTool.getPrecisionAverageProc());
0295       Out.prln("<BR>Overall average recall: "
0296                + corpusTool.getRecallAverageProc());
0297       Out.prln("<BR>Overall average fMeasure: "
0298                + corpusTool.getFMeasureAverageProc());
0299     }
0300     Out.prln("<BR>Finished! <P>");
0301     Out.prln("</BODY>");
0302     Out.prln("</HTML>");
0303 
0304     System.exit(0);
0305 
0306   //main
0307 
0308   public void setGenerateMode(boolean mode) {
0309     isGenerateMode = mode;
0310   //setGenerateMode
0311 
0312   public boolean getGenerateMode() {
0313     return isGenerateMode;
0314   //getGenerateMode
0315 
0316   public boolean getVerboseMode() {
0317     return isVerboseMode;
0318   //getVerboseMode
0319 
0320   public void setVerboseMode(boolean mode) {
0321     isVerboseMode = mode;
0322   //setVerboseMode
0323 
0324   public void setMoreInfo(boolean mode) {
0325     isMoreInfoMode = mode;
0326   // setMoreInfo
0327 
0328   public boolean getMoreInfo() {
0329     return isMoreInfoMode;
0330   // getMoreInfo
0331 
0332   public void setDiffFeaturesList(Set features) {
0333     diffFeaturesSet = features;
0334   // setDiffFeaturesList
0335 
0336   public Set getDiffFeaturesList() {
0337     return diffFeaturesSet;
0338   // getDiffFeaturesList
0339 
0340   public void setMarkedStored(boolean mode) {
0341     isMarkedStored = mode;
0342   // setMarkedStored
0343 
0344   public boolean getMarkedStored() {
0345     return isMarkedStored;
0346   // getMarkedStored
0347 
0348   public void setMarkedClean(boolean mode) {
0349     isMarkedClean = mode;
0350   //
0351 
0352   public boolean getMarkedClean() {
0353     return isMarkedClean;
0354   //
0355 
0356   public void setMarkedDS(boolean mode) {
0357     isMarkedDS = mode;
0358   //
0359 
0360   public boolean getMarkedDS() {
0361     return isMarkedDS;
0362   //
0363 
0364   public void setApplicationFile(File newAppFile) {
0365     applicationFile = newAppFile;
0366   }
0367 
0368   /**
0369    * Returns the average precision over the entire set of processed documents.
0370    <P>
0371    * If the tool has been evaluating the original documents against the
0372    * previously-stored automatically annotated ones, then the precision
0373    * will be the average precision on those two sets. <P>
0374    * If the tool was run in -marked mode, i.e., was evaluating the stored
0375    * automatically processed ones against the human-annotated ones, then
0376    * the precision will be the average precision on those two sets of documents.
0377    */
0378   public double getPrecisionAverage() {
0379     return (doubleprecisionSum / docNumber;
0380   }
0381 
0382   /**
0383    * Returns the average recall over the entire set of processed documents.
0384    <P>
0385    * If the tool has been evaluating the original documents against the
0386    * previously-stored automatically annotated ones, then the recall
0387    * will be the average recall on those two sets. <P>
0388    * If the tool was run in -marked mode, i.e., was evaluating the stored
0389    * automatically processed ones against the human-annotated ones, then
0390    * the recall will be the average recall on those two sets of documents.
0391    */
0392   public double getRecallAverage() {
0393     return (doublerecallSum / docNumber;
0394   }
0395 
0396   public double getFMeasureAverage() {
0397     return (doublefMeasureSum / docNumber;
0398   }
0399 
0400   /** For processed documents */
0401   public double getPrecisionAverageProc() {
0402     return (doubleproc_precisionSum / docNumber;
0403   }
0404 
0405   public double getRecallAverageProc() {
0406     return (doubleproc_recallSum / docNumber;
0407   }
0408 
0409   public double getFMeasureAverageProc() {
0410     return (doubleproc_fMeasureSum / docNumber;
0411   }
0412 
0413   public boolean isGenerateMode() {
0414     return isGenerateMode == true;
0415   //isGenerateMode
0416 
0417   public double getThreshold() {
0418     return threshold;
0419   }
0420 
0421   public void setThreshold(double newValue) {
0422     threshold = newValue;
0423   }
0424 
0425   public File getStartDirectory() {
0426     return startDir;
0427   //getStartDirectory
0428 
0429   public void setStartDirectory(File dir) {
0430     startDir = dir;
0431   //setStartDirectory
0432 
0433   protected void generateCorpus(File fileDir, File outputDir) {
0434     //1. check if we have input files
0435     if (fileDir == null)
0436       return;
0437     //2. create the output directory or clean it up if needed
0438     File outDir = outputDir;
0439     if (outputDir == null) {
0440       outDir = new File(currDir, PROCESSED_DIR_NAME);
0441     }
0442     else {
0443       // get rid of the directory, coz datastore wants it clean
0444       if (!Files.rmdir(outDir))
0445         Out.prln("cannot delete old output directory: " + outDir);
0446     }
0447     outDir.mkdir();
0448 
0449     //create the datastore and process each document
0450     try {
0451       SerialDataStore sds = new SerialDataStore(outDir.toURI().toURL().toString());
0452       sds.create();
0453       sds.open();
0454 
0455       File[] files = fileDir.listFiles();
0456       for (int i = 0; i < files.length; i++) {
0457         if (!files[i].isFile())
0458           continue;
0459         // create a document
0460         Out.prln("Processing and storing document: " + files[i].toURI().toURL() "<P>");
0461 
0462         FeatureMap params = Factory.newFeatureMap();
0463         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURI().toURL());
0464         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0465 
0466         FeatureMap features = Factory.newFeatureMap();
0467 //        Gate.setHiddenAttribute(features, true);
0468 
0469         // create the document
0470         final Document doc = (DocumentFactory.createResource(
0471             "gate.corpora.DocumentImpl", params, features
0472             );
0473 
0474         doc.setName(files[i].getName());
0475         if (doc == null)
0476           continue;
0477         processDocument(doc);
0478         final LanguageResource lr = sds.adopt(doc, null);
0479         sds.sync(lr);
0480         javax.swing.SwingUtilities.invokeLater(new Runnable() {
0481           public void run() {
0482             Factory.deleteResource(doc);
0483             Factory.deleteResource(lr);
0484           }
0485         });
0486       //for
0487       sds.close();
0488     }
0489     catch (java.net.MalformedURLException ex) {
0490       throw (GateRuntimeException)
0491         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0492         .initCause(ex);
0493     }
0494     catch (PersistenceException ex1) {
0495       throw (GateRuntimeException)
0496         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0497         .initCause(ex1);
0498     }
0499     catch (ResourceInstantiationException ex2) {
0500       throw (GateRuntimeException)
0501         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0502         .initCause(ex2);
0503     }
0504     catch (gate.security.SecurityException ex3) {
0505       throw (GateRuntimeException)
0506         new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage())
0507         .initCause(ex3);
0508     }
0509   //generateCorpus
0510 
0511   protected void evaluateCorpus(File fileDir,
0512                                 File processedDir, File markedDir,
0513                                 File errorDir) {
0514     //1. check if we have input files and the processed Dir
0515     if (fileDir == null || !fileDir.exists())
0516       return;
0517     if (processedDir == null || !processedDir.exists())
0518 
0519       //if the user wants evaluation of marked and stored that's not possible
0520       if (isMarkedStored) {
0521         Out.prln("Cannot evaluate because no processed documents exist.");
0522         return;
0523       }
0524       else
0525         isMarkedClean = true;
0526 
0527         // create the error directory or clean it up if needed
0528     File errDir = null;
0529     if (isMoreInfoMode) {
0530       errDir = errorDir;
0531       if (errDir == null) {
0532         errDir = new File(currDir, ERROR_DIR_NAME);
0533       }
0534       else {
0535         // get rid of the directory, coz we wants it clean
0536         if (!Files.rmdir(errDir))
0537           Out.prln("cannot delete old error directory: " + errDir);
0538       }
0539       Out.prln("Create error directory: " + errDir + "<BR><BR>");
0540       errDir.mkdir();
0541     }
0542 
0543     //looked for marked texts only if the directory exists
0544     boolean processMarked = markedDir != null && markedDir.exists();
0545     if (!processMarked && (isMarkedStored || isMarkedClean)) {
0546       Out.prln("Cannot evaluate because no human-annotated documents exist.");
0547       return;
0548     }
0549 
0550     if (isMarkedStored) {
0551       evaluateMarkedStored(markedDir, processedDir, errDir);
0552       return;
0553     }
0554     else if (isMarkedClean) {
0555       evaluateMarkedClean(markedDir, fileDir, errDir);
0556       return;
0557     }
0558 
0559     Document persDoc = null;
0560     Document cleanDoc = null;
0561     Document markedDoc = null;
0562 
0563     //open the datastore and process each document
0564     try {
0565       //open the data store
0566       DataStore sds = Factory.openDataStore
0567                       ("gate.persist.SerialDataStore",
0568                        processedDir.toURI().toURL().toExternalForm());
0569 
0570       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0571       for (int i = 0; i < lrIDs.size(); i++) {
0572         String docID = (StringlrIDs.get(i);
0573 
0574         //read the stored document
0575         FeatureMap features = Factory.newFeatureMap();
0576         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0577         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0578         FeatureMap hparams = Factory.newFeatureMap();
0579 //        Gate.setHiddenAttribute(hparams, true);
0580 
0581         persDoc = (DocumentFactory.createResource(
0582             "gate.corpora.DocumentImpl",
0583             features, hparams);
0584 
0585         if (isMoreInfoMode) {
0586           StringBuffer errName = new StringBuffer(persDoc.getName());
0587           errName.replace(
0588               persDoc.getName().lastIndexOf("."),
0589               persDoc.getName().length(),
0590               ".err");
0591           Out.prln("<H2>" +
0592                    "<a href=\"err/" + errName.toString() "\">"
0593                    + persDoc.getName() "</a>" "</H2>");
0594         }
0595         else
0596           Out.prln("<H2>" + persDoc.getName() "</H2>");
0597 
0598         File cleanDocFile = new File(fileDir, persDoc.getName());
0599         //try reading the original document from clean
0600         if (!cleanDocFile.exists()) {
0601           Out.prln("Warning: Cannot find original document " +
0602                    persDoc.getName() " in " + fileDir);
0603         }
0604         else {
0605           FeatureMap params = Factory.newFeatureMap();
0606           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURI().toURL());
0607           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0608                      documentEncoding);
0609 
0610           // create the document
0611           cleanDoc = (DocumentFactory.createResource(
0612               "gate.corpora.DocumentImpl", params, hparams);
0613           cleanDoc.setName(persDoc.getName());
0614         }
0615 
0616         //try finding the marked document
0617         StringBuffer docName = new StringBuffer(persDoc.getName());
0618         if (!isMarkedDS) {
0619           docName.replace(
0620               persDoc.getName().lastIndexOf("."),
0621               docName.length(),
0622               ".xml");
0623           File markedDocFile = new File(markedDir, docName.toString());
0624           if (!processMarked || !markedDocFile.exists()) {
0625             Out.prln("Warning: Cannot find human-annotated document " +
0626                      markedDocFile + " in " + markedDir);
0627           }
0628           else {
0629             FeatureMap params = Factory.newFeatureMap();
0630             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0631                        markedDocFile.toURI().toURL());
0632             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0633                        documentEncoding);
0634 
0635             // create the document
0636             markedDoc = (DocumentFactory.createResource(
0637                 "gate.corpora.DocumentImpl", params, hparams);
0638             markedDoc.setName(persDoc.getName());
0639           }
0640         }
0641         else {
0642           //open marked from a DS
0643           //open the data store
0644           DataStore sds1 = Factory.openDataStore
0645                            ("gate.persist.SerialDataStore",
0646                             markedDir.toURI().toURL().toExternalForm());
0647 
0648           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0649           boolean found = false;
0650           int k = 0;
0651           //search for the marked doc with the same name
0652           while (k < lrIDs1.size() && !found) {
0653             String docID1 = (StringlrIDs1.get(k);
0654 
0655             //read the stored document
0656             FeatureMap features1 = Factory.newFeatureMap();
0657             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0658             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0659             Document tempDoc = (DocumentFactory.createResource(
0660                 "gate.corpora.DocumentImpl",
0661                 features1, hparams);
0662             //check whether this is our doc
0663             if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
0664                 endsWith(persDoc.getName())) {
0665               found = true;
0666               markedDoc = tempDoc;
0667             }
0668             else k++;
0669           }
0670         }
0671 
0672         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0673 
0674         if (persDoc != null) {
0675           final gate.Document pd = persDoc;
0676           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0677             public void run() {
0678               Factory.deleteResource(pd);
0679             }
0680           });
0681         }
0682         if (cleanDoc != null) {
0683           final gate.Document cd = cleanDoc;
0684           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0685             public void run() {
0686               Factory.deleteResource(cd);
0687             }
0688           });
0689         }
0690         if (markedDoc != null) {
0691           final gate.Document md = markedDoc;
0692           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0693             public void run() {
0694               Factory.deleteResource(md);
0695             }
0696           });
0697         }
0698 
0699       //for loop through saved docs
0700       sds.close();
0701     }
0702     catch (java.net.MalformedURLException ex) {
0703       throw (GateRuntimeException)
0704         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0705         .initCause(ex);
0706     }
0707     catch (PersistenceException ex1) {
0708       throw (GateRuntimeException)
0709         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0710         .initCause(ex1);
0711     }
0712     catch (ResourceInstantiationException ex2) {
0713       throw (GateRuntimeException)
0714         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0715         .initCause(ex2);
0716     }
0717 
0718   //evaluateCorpus
0719 
0720   protected void evaluateMarkedStored(File markedDir, File storedDir,
0721                                       File errDir) {
0722     Document persDoc = null;
0723     Document cleanDoc = null;
0724     Document markedDoc = null;
0725 
0726     //open the datastore and process each document
0727     try {
0728       //open the data store
0729       DataStore sds = Factory.openDataStore
0730                       ("gate.persist.SerialDataStore",
0731                        storedDir.toURI().toURL().toExternalForm());
0732 
0733       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
0734       for (int i = 0; i < lrIDs.size(); i++) {
0735         String docID = (StringlrIDs.get(i);
0736 
0737         //read the stored document
0738         FeatureMap features = Factory.newFeatureMap();
0739         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
0740         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
0741 
0742         FeatureMap hparams = Factory.newFeatureMap();
0743 //        Gate.setHiddenAttribute(hparams, true);
0744 
0745         persDoc = (DocumentFactory.createResource(
0746             "gate.corpora.DocumentImpl",
0747             features, hparams);
0748 
0749         if (isMoreInfoMode) {
0750           StringBuffer errName = new StringBuffer(persDoc.getName());
0751           errName.replace(
0752               persDoc.getName().lastIndexOf("."),
0753               persDoc.getName().length(),
0754               ".err");
0755           Out.prln("<H2>" +
0756                    "<a href=\"err/" + errName.toString() "\">"
0757                    + persDoc.getName() "</a>" "</H2>");
0758         }
0759         else
0760           Out.prln("<H2>" + persDoc.getName() "</H2>");
0761 
0762         if (!this.isMarkedDS) { //try finding the marked document as file
0763           StringBuffer docName = new StringBuffer(persDoc.getName());
0764           docName.replace(
0765               persDoc.getName().lastIndexOf("."),
0766               docName.length(),
0767               ".xml");
0768           File markedDocFile = new File(markedDir, docName.toString());
0769           if (!markedDocFile.exists()) {
0770             Out.prln("Warning: Cannot find human-annotated document " +
0771                      markedDocFile + " in " + markedDir);
0772           }
0773           else {
0774             FeatureMap params = Factory.newFeatureMap();
0775             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0776                        markedDocFile.toURI().toURL());
0777             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
0778                        documentEncoding);
0779 
0780             // create the document
0781             markedDoc = (DocumentFactory.createResource(
0782                 "gate.corpora.DocumentImpl", params, hparams);
0783             markedDoc.setName(persDoc.getName());
0784           //find marked as file
0785         }
0786         else {
0787           try {
0788             //open marked from a DS
0789             //open the data store
0790             DataStore sds1 = Factory.openDataStore
0791                              ("gate.persist.SerialDataStore",
0792                               markedDir.toURI().toURL().toExternalForm());
0793 
0794             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0795             boolean found = false;
0796             int k = 0;
0797             //search for the marked doc with the same name
0798             while (k < lrIDs1.size() && !found) {
0799               String docID1 = (StringlrIDs1.get(k);
0800 
0801               //read the stored document
0802               FeatureMap features1 = Factory.newFeatureMap();
0803               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0804               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0805               Document tempDoc = (DocumentFactory.createResource(
0806                   "gate.corpora.DocumentImpl",
0807                   features1, hparams);
0808               //check whether this is our doc
0809               if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
0810                   endsWith(persDoc.getName())) {
0811                 found = true;
0812                 markedDoc = tempDoc;
0813               }
0814               else k++;
0815             }
0816           }
0817           catch (java.net.MalformedURLException ex) {
0818             Out.prln("Error finding marked directory " +
0819                      markedDir.getAbsolutePath());
0820           }
0821           catch (gate.persist.PersistenceException ex1) {
0822             Out.prln(
0823                 "Error opening marked as a datastore (-marked_ds specified)");
0824           }
0825           catch (gate.creole.ResourceInstantiationException ex2) {
0826             Out.prln(
0827                 "Error opening marked as a datastore (-marked_ds specified)");
0828           }
0829         }
0830 
0831         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
0832         if (persDoc != null) {
0833           final gate.Document pd = persDoc;
0834           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0835             public void run() {
0836               Factory.deleteResource(pd);
0837             }
0838           });
0839         }
0840         if (markedDoc != null) {
0841           final gate.Document md = markedDoc;
0842           javax.swing.SwingUtilities.invokeLater(new Runnable() {
0843             public void run() {
0844               Factory.deleteResource(md);
0845             }
0846           });
0847         }
0848 
0849       //for loop through saved docs
0850       sds.close();
0851 
0852     }
0853     catch (java.net.MalformedURLException ex) {
0854       throw (GateRuntimeException)
0855         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
0856         .initCause(ex);
0857     }
0858     catch (PersistenceException ex1) {
0859       throw (GateRuntimeException)
0860         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
0861         .initCause(ex1);
0862     }
0863     catch (ResourceInstantiationException ex2) {
0864       throw (GateRuntimeException)
0865         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
0866         .initCause(ex2);
0867     }
0868 
0869   //evaluateMarkedStored
0870 
0871   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
0872     Document persDoc = null;
0873     Document cleanDoc = null;
0874     Document markedDoc = null;
0875 
0876     File[] cleanDocs = cleanDir.listFiles();
0877     for (int i = 0; i < cleanDocs.length; i++) {
0878       if (!cleanDocs[i].isFile())
0879         continue;
0880 
0881       //try reading the original document from clean
0882       FeatureMap params = Factory.newFeatureMap();
0883       try {
0884         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURI().toURL());
0885       }
0886       catch (java.net.MalformedURLException ex) {
0887         Out.prln("Cannot create document from file: " +
0888                  cleanDocs[i].getAbsolutePath());
0889         continue;
0890       }
0891       //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0892       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0893 
0894       FeatureMap hparams = Factory.newFeatureMap();
0895 //      Gate.setHiddenAttribute(hparams, true);
0896 
0897       // create the document
0898       try {
0899         cleanDoc = (DocumentFactory.createResource(
0900             "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
0901       }
0902       catch (gate.creole.ResourceInstantiationException ex) {
0903         Out.prln("Cannot create document from file: " +
0904                  cleanDocs[i].getAbsolutePath());
0905         continue;
0906       }
0907 
0908       if (isMoreInfoMode) {
0909         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
0910         errName.replace(
0911             cleanDocs[i].getName().lastIndexOf("."),
0912             cleanDocs[i].getName().length(),
0913             ".err");
0914         Out.prln("<H2>" +
0915                  "<a href=\"err/" + errName.toString() "\">"
0916                  + cleanDocs[i].getName() "</a>" "</H2>");
0917       }
0918       else
0919         Out.prln("<H2>" + cleanDocs[i].getName() "</H2>");
0920 
0921         //try finding the marked document
0922       if (!isMarkedDS) {
0923         StringBuffer docName = new StringBuffer(cleanDoc.getName());
0924         docName.replace(
0925             cleanDoc.getName().lastIndexOf("."),
0926             docName.length(),
0927             ".xml");
0928         File markedDocFile = new File(markedDir, docName.toString());
0929         if (!markedDocFile.exists()) {
0930           Out.prln("Warning: Cannot find human-annotated document " +
0931                    markedDocFile + " in " + markedDir);
0932           continue;
0933         }
0934         else {
0935           params = Factory.newFeatureMap();
0936           try {
0937             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
0938                        markedDocFile.toURI().toURL());
0939           }
0940           catch (java.net.MalformedURLException ex) {
0941             Out.prln("Cannot create document from file: " +
0942                      markedDocFile.getAbsolutePath());
0943             continue;
0944           }
0945           //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
0946           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
0947 
0948           // create the document
0949           try {
0950             markedDoc = (DocumentFactory.createResource(
0951                 "gate.corpora.DocumentImpl", params,
0952                 hparams, cleanDoc.getName());
0953           }
0954           catch (gate.creole.ResourceInstantiationException ex) {
0955             Out.prln("Cannot create document from file: " +
0956                      markedDocFile.getAbsolutePath());
0957             continue;
0958           }
0959 
0960         //if markedDoc exists
0961       }
0962       else {
0963         try {
0964           //open marked from a DS
0965           //open the data store
0966           DataStore sds1 = Factory.openDataStore
0967                            ("gate.persist.SerialDataStore",
0968                             markedDir.toURI().toURL().toExternalForm());
0969 
0970           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
0971           boolean found = false;
0972           int k = 0;
0973           //search for the marked doc with the same name
0974           while (k < lrIDs1.size() && !found) {
0975             String docID1 = (StringlrIDs1.get(k);
0976 
0977             //read the stored document
0978             FeatureMap features1 = Factory.newFeatureMap();
0979             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
0980             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
0981             Document tempDoc = (DocumentFactory.createResource(
0982                 "gate.corpora.DocumentImpl",
0983                 features1, hparams);
0984             //check whether this is our doc
0985             if ( ( (StringtempDoc.getFeatures().get("gate.SourceURL")).
0986                 endsWith(cleanDoc.getName())) {
0987               found = true;
0988               markedDoc = tempDoc;
0989             }
0990             else k++;
0991           }
0992         }
0993         catch (java.net.MalformedURLException ex) {
0994           Out.prln("Error finding marked directory " +
0995                    markedDir.getAbsolutePath());
0996         }
0997         catch (gate.persist.PersistenceException ex1) {
0998           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
0999         }
1000         catch (gate.creole.ResourceInstantiationException ex2) {
1001           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1002         }
1003       //if using a DS for marked
1004 
1005       try {
1006         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
1007       }
1008       catch (gate.creole.ResourceInstantiationException ex) {
1009         ex.printStackTrace();
1010         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
1011       }
1012       if (persDoc != null) {
1013         final gate.Document pd = persDoc;
1014         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1015           public void run() {
1016             Factory.deleteResource(pd);
1017           }
1018         });
1019       }
1020       if (cleanDoc != null) {
1021         final gate.Document cd = cleanDoc;
1022         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1023           public void run() {
1024             Factory.deleteResource(cd);
1025           }
1026         });
1027       }
1028       if (markedDoc != null) {
1029         final gate.Document md = markedDoc;
1030         javax.swing.SwingUtilities.invokeLater(new Runnable() {
1031           public void run() {
1032             Factory.deleteResource(md);
1033           }
1034         });
1035       }
1036 
1037     //for loop through clean docs
1038 
1039   //evaluateMarkedClean
1040 
1041   protected void processDocument(Document doc) {
1042     try {
1043       if (application instanceof CorpusController) {
1044         Corpus tempCorpus = Factory.newCorpus("temp");
1045         tempCorpus.add(doc);
1046         ( (CorpusControllerapplication).setCorpus(tempCorpus);
1047         application.execute();
1048         Factory.deleteResource(tempCorpus);
1049         tempCorpus = null;
1050       }
1051       else {
1052         Iterator iter = application.getPRs().iterator();
1053         while (iter.hasNext())
1054           ( (ProcessingResourceiter.next()).setParameterValue("document", doc);
1055         application.execute();
1056       }
1057     }
1058     catch (ResourceInstantiationException ex) {
1059       throw (RuntimeException)
1060         new RuntimeException("Error executing application: "
1061                                  + ex.getMessage())
1062         .initCause(ex);
1063     }
1064     catch (ExecutionException ex) {
1065       throw (RuntimeException)
1066         new RuntimeException("Error executing application: "
1067                                  + ex.getMessage())
1068         .initCause(ex);
1069     }
1070   }
1071 
1072   protected void evaluateDocuments(Document persDoc,
1073                                    Document cleanDoc, Document markedDoc,
1074                                    File errDirthrows
1075       ResourceInstantiationException {
1076     if (cleanDoc == null && markedDoc == null)
1077       return;
1078 
1079     //we've got no types to compare
1080     if (annotTypes == null || annotTypes.isEmpty())
1081       return;
1082 
1083     if (cleanDoc != null && !isMarkedStored) {
1084 
1085       processDocument(cleanDoc);
1086 
1087       int wordCount = countWords(cleanDoc);
1088       if (wordCount == 0)
1089         Out.prln("<BR>No Token annotations to count words in the document.");
1090       else
1091         Out.prln("<BR>Word count: " + wordCount);
1092       corpusWordCount += wordCount;
1093 
1094       if (!isMarkedClean)
1095         evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
1096       else
1097         evaluateTwoDocs(markedDoc, cleanDoc, errDir);
1098 
1099     }
1100     else
1101       evaluateTwoDocs(markedDoc, persDoc, errDir);
1102 
1103   }
1104 
1105   /**
1106    * Count all Token.kind=word annotations in the document
1107    */
1108   protected int countWords(Document annotDoc) {
1109     int count = 0;
1110 
1111     if (annotDoc == null)return 0;
1112     // check for Token in outputSetName
1113     AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
1114     if (tokens == null)return 0;
1115 
1116     Iterator<Annotation> it = tokens.iterator();
1117     Annotation currAnnotation;
1118     while (it.hasNext()) {
1119       currAnnotation = it.next();
1120       Object feature = currAnnotation.getFeatures().get("kind");
1121       if (feature != null && "word".equalsIgnoreCase( (Stringfeature))++count;
1122     // while
1123 
1124     return count;
1125   }
1126 
1127   protected void evaluateAllThree(Document persDoc,
1128                                   Document cleanDoc, Document markedDoc,
1129                                   File errDirthrows
1130       ResourceInstantiationException {
1131     //first start the table and its header
1132     printTableHeader();
1133 
1134     // store annotation diff in .err file
1135     Writer errWriter = null;
1136     if (isMoreInfoMode && errDir != null) {
1137       StringBuffer docName = new StringBuffer(cleanDoc.getName());
1138       docName.replace(
1139           cleanDoc.getName().lastIndexOf("."),
1140           docName.length(),
1141           ".err");
1142       File errFile = new File(errDir, docName.toString());
1143       String encoding = ( (gate.corpora.DocumentImplcleanDoc).getEncoding();
1144       try {
1145         errWriter = new FileWriter(errFile, false);
1146         /*
1147                  if(encoding == null) {
1148           errWriter = new OutputStreamWriter(
1149               new FileOutputStream(errFile, false));
1150                  } else {
1151           errWriter = new OutputStreamWriter(
1152               new FileOutputStream(errFile, false), encoding);
1153                  }*/
1154       }
1155       catch (Exception ex) {
1156         Out.prln("Exception when creating the error file " + errFile + ": "
1157                  + ex.getMessage());
1158         errWriter = null;
1159       }
1160     }
1161 
1162     for (int jj = 0; jj < annotTypes.size(); jj++) {
1163       String annotType = (StringannotTypes.get(jj);
1164 
1165       AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1166       //we don't have this annotation type in this document
1167       if (annotDiffer == null)
1168         continue;
1169 
1170       //increase the number of processed documents
1171       docNumber++;
1172       //add precison and recall to the sums
1173       updateStatistics(annotDiffer, annotType);
1174 
1175       AnnotationDiffer annotDiffer1 =
1176           measureDocs(markedDoc, persDoc, annotType);
1177 
1178       Out.prln("<TR>");
1179 
1180       if (isMoreInfoMode && annotDiffer1 != null
1181           &&
1182           (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1183            || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1184           )
1185         Out.prln("<TD> " + annotType + "_new" "</TD>");
1186       else
1187         Out.prln("<TD> " + annotType + "</TD>");
1188 
1189       if (isMoreInfoMode) {
1190         if (annotDiffer1 != nullupdateStatisticsProc(annotDiffer1, annotType);
1191 
1192         Out.prln("<TD>" + annotDiffer.getCorrectMatches() "</TD>");
1193         Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() "</TD>");
1194         Out.prln("<TD>" + annotDiffer.getMissing() "</TD>");
1195         Out.prln("<TD>" + annotDiffer.getSpurious() "</TD>");
1196       }
1197 
1198       Out.prln("<TD>");
1199 
1200       //check the precision first
1201       if (annotDiffer1 != null) {
1202 
1203         if (annotDiffer1.getPrecisionAverage()
1204             < annotDiffer.getPrecisionAverage()) {
1205           Out.prln("<P><Font color=blue> ");
1206           Out.prln(annotDiffer.getPrecisionAverage());
1207 
1208           if (!isMoreInfoMode) {
1209             Out.pr("<BR>Precision increase on human-marked from ");
1210             Out.pr(annotDiffer1.getPrecisionAverage() " to ");
1211             Out.prln(annotDiffer.getPrecisionAverage());
1212           }
1213           Out.prln(" </Font></P>");
1214         }
1215         else if (annotDiffer1.getPrecisionAverage()
1216                  > annotDiffer.getPrecisionAverage()) {
1217           Out.prln("<P><Font color=red> ");
1218           Out.prln(annotDiffer.getPrecisionAverage());
1219 
1220           if (!isMoreInfoMode) {
1221             Out.pr("<BR>Precision decrease on human-marked from ");
1222             Out.pr(annotDiffer1.getPrecisionAverage() " to ");
1223             Out.prln(annotDiffer.getPrecisionAverage());
1224           }
1225           Out.prln(" </Font></P>");
1226         }
1227         else
1228           Out.prln("<P> " (doubleannotDiffer.getPrecisionAverage() +
1229                    " </P>");
1230       }
1231       else
1232         Out.prln("<P> " + annotDiffer.getPrecisionAverage() " </P>");
1233 
1234       Out.prln("</TD>");
1235 
1236       Out.prln("<TD>");
1237 
1238       //check the recall now
1239       if (annotDiffer1 != null) {
1240 
1241         if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1242           Out.prln("<P><Font color=blue> ");
1243           Out.prln(annotDiffer.getRecallAverage());
1244 
1245           if (!isMoreInfoMode) {
1246             Out.pr("<BR>Recall increase on human-marked from ");
1247             Out.pr(annotDiffer1.getRecallAverage() " to ");
1248             Out.prln(annotDiffer.getRecallAverage());
1249           }
1250           Out.prln(" </Font></P>");
1251         }
1252         else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1253           Out.prln("<P><Font color=red> ");
1254           Out.prln(annotDiffer.getRecallAverage());
1255 
1256           if (!isMoreInfoMode) {
1257             Out.pr("<BR>Recall decrease on human-marked from ");
1258             Out.pr(annotDiffer1.getRecallAverage() " to ");
1259             Out.prln(annotDiffer.getRecallAverage());
1260           }
1261           Out.prln(" </Font></P>");
1262         }
1263         else
1264           Out.prln("<P> " + annotDiffer.getRecallAverage() " </P>");
1265       }
1266       else
1267         Out.prln("<P> " + annotDiffer.getRecallAverage() " </P>");
1268 
1269       Out.prln("</TD>");
1270 
1271       //check the recall now
1272       if (isVerboseMode) {
1273         Out.prln("<TD>");
1274         if (annotDiffer.getRecallAverage() < threshold
1275             || annotDiffer.getPrecisionAverage() < threshold) {
1276           printAnnotations(annotDiffer, markedDoc, cleanDoc);
1277         }
1278         else {
1279           Out.prln("&nbsp;");
1280         }
1281         Out.prln("</TD>");
1282       }
1283 
1284       Out.prln("</TR>");
1285 
1286       // show one more table line for processed document
1287       if (isMoreInfoMode && annotDiffer1 != null
1288           &&
1289           (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1290            || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1291           ) {
1292 
1293         Out.prln("<TR>");
1294         Out.prln("<TD> " + annotType + "_old" "</TD>");
1295 
1296         Out.prln("<TD>" + annotDiffer1.getCorrectMatches() "</TD>");
1297         Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() "</TD>");
1298         Out.prln("<TD>" + annotDiffer1.getMissing() "</TD>");
1299         Out.prln("<TD>" + annotDiffer1.getSpurious() "</TD>");
1300 
1301         Out.prln("<TD>");
1302         if (annotDiffer1.getPrecisionAverage() <
1303             annotDiffer.getPrecisionAverage())
1304 
1305           Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1306                    "</Font></P>");
1307         else if (annotDiffer1.getPrecisionAverage() >
1308                  annotDiffer.getPrecisionAverage())
1309           Out.prln(
1310               "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1311               " </Font></P>");
1312         else
1313           Out.prln(annotDiffer1.getPrecisionAverage());
1314 
1315         Out.prln("</TD>");
1316 
1317         Out.prln("<TD>");
1318         if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1319           Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1320                    " </Font></P>");
1321         else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1322           Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1323                    " </Font></P>");
1324         else
1325           Out.prln(annotDiffer1.getRecallAverage());
1326 
1327         Out.prln("</TD>");
1328 
1329         //check the recall now
1330         if (isVerboseMode) {
1331           // create error file and start writing
1332 
1333           Out.prln("<TD>");
1334         if (annotDiffer.getRecallAverage() < threshold
1335             || annotDiffer.getPrecisionAverage() < threshold) {
1336             printAnnotations(annotDiffer, markedDoc, cleanDoc);
1337           }
1338           else {
1339             Out.prln("&nbsp;");
1340           }
1341           Out.prln("</TD>");
1342         }
1343         Out.prln("</TR>");
1344       // if(isMoreInfoMode && annotDiff1 != null)
1345 
1346       if (isMoreInfoMode && errDir != null)
1347         storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1348     //for loop through annotation types
1349     Out.prln("</TABLE>");
1350 
1351     try {
1352       if (errWriter != null)
1353         errWriter.close();
1354     }
1355     catch (Exception ex) {
1356       Out.prln("Exception on close of error file " + errWriter + ": "
1357                + ex.getMessage());
1358     }
1359   //evaluateAllThree
1360 
1361   protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1362                                  File errDirthrows
1363       ResourceInstantiationException {
1364 
1365     //first start the table and its header
1366     printTableHeader();
1367 
1368     // store annotation diff in .err file
1369     Writer errWriter = null;
1370     if (isMoreInfoMode && errDir != null) {
1371       StringBuffer docName = new StringBuffer(keyDoc.getName());
1372       docName.replace(
1373           keyDoc.getName().lastIndexOf("."),
1374           docName.length(),
1375           ".err");
1376       File errFile = new File(errDir, docName.toString());
1377       String encoding = ( (gate.corpora.DocumentImplkeyDoc).getEncoding();
1378       try {
1379         errWriter = new FileWriter(errFile, false);
1380         /*
1381                  if(encoding == null) {
1382           errWriter = new OutputStreamWriter(
1383               new FileOutputStream(errFile, false));
1384                  } else {
1385           errWriter = new OutputStreamWriter(
1386               new FileOutputStream(errFile, false), encoding);
1387                  }*/
1388       }
1389       catch (Exception ex) {
1390         Out.prln("Exception when creating the error file " + errFile + ": "
1391                  + ex.getMessage());
1392         errWriter = null;
1393       }
1394     }
1395 
1396     for (int jj = 0; jj < annotTypes.size(); jj++) {
1397       String annotType = (StringannotTypes.get(jj);
1398 
1399       AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1400       //we don't have this annotation type in this document
1401       if (annotDiff == null)
1402          continue;
1403 
1404       //increase the number of processed documents
1405       docNumber++;
1406       //add precison and recall to the sums
1407       updateStatistics(annotDiff, annotType);
1408 
1409       Out.prln("<TR>");
1410       Out.prln("<TD>" + annotType + "</TD>");
1411 
1412       if (isMoreInfoMode) {
1413         Out.prln("<TD>" + annotDiff.getCorrectMatches() "</TD>");
1414         Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() "</TD>");
1415         Out.prln("<TD>" + annotDiff.getMissing() "</TD>");
1416         Out.prln("<TD>" + annotDiff.getSpurious() "</TD>");
1417       }
1418 
1419       Out.prln("<TD>" + annotDiff.getPrecisionAverage() "</TD>");
1420       Out.prln("<TD>" + annotDiff.getRecallAverage() "</TD>");
1421       //check the recall now
1422       if (isVerboseMode) {
1423         Out.prln("<TD>");
1424         if (annotDiff.getRecallAverage() < threshold
1425             || annotDiff.getPrecisionAverage() < threshold) {
1426           printAnnotations(annotDiff, keyDoc, respDoc);
1427         }
1428         else {
1429           Out.prln("&nbsp;");
1430         }
1431         Out.prln("</TD>");
1432       }
1433       Out.prln("</TR>");
1434 
1435       if (isMoreInfoMode && errDir != null)
1436         storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1437     //for loop through annotation types
1438     Out.prln("</TABLE>");
1439 
1440     try {
1441       if (errWriter != null)
1442         errWriter.close();
1443     }
1444     catch (Exception ex) {
1445       Out.prln("Exception on close of error file " + errWriter + ": "
1446                + ex.getMessage());
1447     }
1448   //evaluateTwoDocs
1449 
1450   protected void printTableHeader() {
1451     Out.prln("<TABLE BORDER=1");
1452     Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1453 
1454     if (isMoreInfoMode)
1455       Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1456              "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1457 
1458     Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1459 
1460     if (isVerboseMode)
1461       Out.pr("<TD><B>Annotations</B></TD>");
1462 
1463     Out.prln("</TR>");
1464   }
1465 
1466   protected void updateStatistics(AnnotationDiffer annotDiffer,
1467                                   String annotType) {
1468     double precisionAverage = ( (double) ( (doubleannotDiffer.
1469                                           getPrecisionLenient() +
1470                                           annotDiffer.getPrecisionStrict()) /
1471                                (double) (2.0));
1472     if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1473     precisionSum += precisionAverage;
1474 
1475     double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1476                                        annotDiffer.getRecallStrict()) /
1477                             (double) (2.0));
1478     if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1479     recallSum += recallAverage;
1480 
1481     double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0+
1482                                          annotDiffer.getFMeasureStrict(1.0)) /
1483                               (double) (2.0));
1484     if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1485     fMeasureSum += fMeasureAverage;
1486 
1487     Double oldPrecision = (DoubleprecisionByType.get(annotType);
1488     if (oldPrecision == null)
1489       precisionByType.put(annotType, new Double(precisionAverage));
1490     else
1491       precisionByType.put(annotType,
1492                           new Double(oldPrecision.doubleValue() + precisionAverage));
1493 
1494     Integer precCount = (IntegerprCountByType.get(annotType);
1495     if (precCount == null)
1496       prCountByType.put(annotType, new Integer(1));
1497     else
1498       prCountByType.put(annotType, new Integer(precCount.intValue() 1));
1499 
1500     Double oldFMeasure = (DoublefMeasureByType.get(annotType);
1501     if (oldFMeasure == null)
1502       fMeasureByType.put(annotType, new Double(fMeasureAverage));
1503     else
1504       fMeasureByType.put(annotType,
1505                          new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1506 
1507     Integer fCount = (IntegerfMeasureCountByType.get(annotType);
1508     if (fCount == null)
1509       fMeasureCountByType.put(annotType, new Integer(1));
1510     else
1511       fMeasureCountByType.put(annotType, new Integer(fCount.intValue() 1));
1512 
1513     Double oldRecall = (DoublerecallByType.get(annotType);
1514     if (oldRecall == null)
1515       recallByType.put(annotType, new Double(recallAverage));
1516     else
1517       recallByType.put(annotType,
1518                        new Double(oldRecall.doubleValue() + recallAverage));
1519 
1520     Integer recCount = (IntegerrecCountByType.get(annotType);
1521     if (recCount == null)
1522       recCountByType.put(annotType, new Integer(1));
1523     else
1524       recCountByType.put(annotType, new Integer(recCount.intValue() 1));
1525 
1526       //Update the missing, spurious, correct, and partial counts
1527     Long oldMissingNo = (LongmissingByType.get(annotType);
1528     if (oldMissingNo == null)
1529       missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1530     else
1531       missingByType.put(annotType,
1532                         new Long(oldMissingNo.longValue() +
1533                                  annotDiffer.getMissing()));
1534 
1535     Long oldCorrectNo = (LongcorrectByType.get(annotType);
1536     if (oldCorrectNo == null)
1537       correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1538     else
1539       correctByType.put(annotType,
1540                         new Long(oldCorrectNo.longValue() +
1541                                  annotDiffer.getCorrectMatches()));
1542 
1543     Long oldPartialNo = (LongpartialByType.get(annotType);
1544     if (oldPartialNo == null)
1545       partialByType.put(annotType,
1546                         new Long(annotDiffer.getPartiallyCorrectMatches()));
1547     else
1548       partialByType.put(annotType,
1549                         new Long(oldPartialNo.longValue() +
1550                                  annotDiffer.getPartiallyCorrectMatches()));
1551 
1552     Long oldSpuriousNo = (LongspurByType.get(annotType);
1553     if (oldSpuriousNo == null)
1554       spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1555     else
1556       spurByType.put(annotType,
1557                      new Long(oldSpuriousNo.longValue() +
1558                               annotDiffer.getSpurious()));
1559   }
1560 
1561   /**
1562    * Update statistics for processed documents
1563    * The same procedure as updateStatistics with different hashTables
1564    */
1565   protected void updateStatisticsProc(AnnotationDiffer annotDiffer,
1566                                       String annotType) {
1567     hasProcessed = true;
1568     double precisionAverage = ( (double) (annotDiffer.getPrecisionLenient() +
1569                                           annotDiffer.getPrecisionStrict()) /
1570                                (double) (2.0));
1571     if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1572     proc_precisionSum += precisionAverage;
1573 
1574     double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1575                                        annotDiffer.getRecallStrict()) /
1576                             (double) (2.0));
1577     if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1578     proc_recallSum += recallAverage;
1579 
1580     double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0+
1581                                          annotDiffer.getFMeasureStrict(1.0)) /
1582                               (double) (2.0));
1583     if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1584     proc_fMeasureSum += fMeasureAverage;
1585 
1586     Double oldPrecision = (Doubleproc_precisionByType.get(annotType);
1587     if (oldPrecision == null)
1588       proc_precisionByType.put(annotType, new Double(precisionAverage));
1589     else
1590       proc_precisionByType.put(annotType,
1591                                new Double(oldPrecision.doubleValue() +
1592                                           precisionAverage));
1593     Integer precCount = (Integerproc_prCountByType.get(annotType);
1594     if (precCount == null)
1595       proc_prCountByType.put(annotType, new Integer(1));
1596     else
1597       proc_prCountByType.put(annotType, new Integer(precCount.intValue() 1));
1598 
1599     Double oldFMeasure = (Doubleproc_fMeasureByType.get(annotType);
1600     if (oldFMeasure == null)
1601       proc_fMeasureByType.put(annotType,
1602                               new Double(fMeasureAverage));
1603     else
1604       proc_fMeasureByType.put(annotType,
1605                               new Double(oldFMeasure.doubleValue() +
1606                                          fMeasureAverage));
1607     Integer fCount = (Integerproc_fMeasureCountByType.get(annotType);
1608     if (fCount == null)
1609       proc_fMeasureCountByType.put(annotType, new Integer(1));
1610     else
1611       proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() 1));
1612 
1613     Double oldRecall = (Doubleproc_recallByType.get(annotType);
1614     if (oldRecall == null)
1615       proc_recallByType.put(annotType,
1616                             new Double(recallAverage));
1617     else
1618       proc_recallByType.put(annotType,
1619                             new Double(oldRecall.doubleValue() +
1620                                        recallAverage));
1621     Integer recCount = (Integerproc_recCountByType.get(annotType);
1622     if (recCount == null)
1623       proc_recCountByType.put(annotType, new Integer(1));
1624     else
1625       proc_recCountByType.put(annotType, new Integer(recCount.intValue() 1));
1626 
1627       //Update the missing, spurious, correct, and partial counts
1628     Long oldMissingNo = (Longproc_missingByType.get(annotType);
1629     if (oldMissingNo == null)
1630       proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1631     else
1632       proc_missingByType.put(annotType,
1633                              new Long(oldMissingNo.longValue() +
1634                                       annotDiffer.getMissing()));
1635 
1636     Long oldCorrectNo = (Longproc_correctByType.get(annotType);
1637     if (oldCorrectNo == null)
1638       proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1639     else
1640       proc_correctByType.put(annotType,
1641                              new Long(oldCorrectNo.longValue() +
1642                                       annotDiffer.getCorrectMatches()));
1643 
1644     Long oldPartialNo = (Longproc_partialByType.get(annotType);
1645     if (oldPartialNo == null)
1646       proc_partialByType.put(annotType,
1647                              new Long(annotDiffer.getPartiallyCorrectMatches()));
1648     else
1649       proc_partialByType.put(annotType,
1650                              new Long(oldPartialNo.longValue() +
1651                                       annotDiffer.getPartiallyCorrectMatches()));
1652 
1653     Long oldSpuriousNo = (Longproc_spurByType.get(annotType);
1654     if (oldSpuriousNo == null)
1655       proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1656     else
1657       proc_spurByType.put(annotType,
1658                           new Long(oldSpuriousNo.longValue() +
1659                                    annotDiffer.getSpurious()));
1660   }
1661 
1662   public void printStatistics() {
1663 
1664     Out.prln("<H2> Statistics </H2>");
1665 
1666     /*
1667         Out.prln("<H3> Precision </H3>");
1668         if (precisionByType != null && !precisionByType.isEmpty()) {
1669           Iterator iter = precisionByType.keySet().iterator();
1670           while (iter.hasNext()) {
1671             String annotType = (String) iter.next();
1672             Out.prln(annotType + ": "
1673               + ((Double)precisionByType.get(annotType)).doubleValue()
1674                   /
1675                   ((Integer)prCountByType.get(annotType)).intValue()
1676               + "<P>");
1677           }//while
1678         }
1679         Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1680 
1681         Out.prln("<H3> Recall </H3>");
1682         if (recallByType != null && !recallByType.isEmpty()) {
1683           Iterator iter = recallByType.keySet().iterator();
1684           while (iter.hasNext()) {
1685             String annotType = (String) iter.next();
1686             Out.prln(annotType + ": "
1687               + ((Double)recallByType.get(annotType)).doubleValue()
1688                   /
1689                   ((Integer)recCountByType.get(annotType)).intValue()
1690               + "<P>");
1691           }//while
1692         }
1693 
1694         Out.prln("Overall recall: " + getRecallAverage()
1695                  + "<P>");
1696      */
1697     if (annotTypes == null) {
1698       Out.prln("No types given for evaluation, cannot obtain precision/recall");
1699       return;
1700     }
1701     Out.prln("<table border=1>");
1702     Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1703              "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1704              "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1705              "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1706     String annotType;
1707     for (int i = 0; i < annotTypes.size(); i++) {
1708       annotType = (StringannotTypes.get(i);
1709       printStatsForType(annotType);
1710     //for
1711     Out.prln("</table>");
1712   // updateStatisticsProc
1713 
1714   protected void printStatsForType(String annotType) {
1715     long correct = (correctByType.get(annotType== null:
1716                    ( (LongcorrectByType.get(annotType)).longValue();
1717     long partial = (partialByType.get(annotType== null:
1718                    ( (LongpartialByType.get(annotType)).longValue();
1719     long spurious = (spurByType.get(annotType== null:
1720                     ( (LongspurByType.get(annotType)).longValue();
1721     long missing = (missingByType.get(annotType== null:
1722                    ( (LongmissingByType.get(annotType)).longValue();
1723     long actual = correct + partial + spurious;
1724     long possible = correct + partial + missing;
1725     //precision strict is correct/actual
1726     //precision is (correct + 0.5 * partially correct)/actual
1727     double precision = 0d;
1728     if (actual!=0)
1729       precision = (correct + 0.5 * partial/ actual;
1730     
1731     //recall strict is correct/possible
1732     double recall = 0d;
1733     if (possible!=0)
1734       recall = (correct + 0.5 * partial/ possible;
1735     
1736     //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1737     double fmeasure = 0d;
1738     if ((beta * beta * precision+ recall !=0){
1739       fmeasure =
1740         ( (beta * beta + 1* precision * recall)
1741         /
1742         ( (beta * beta * precision+ recall);
1743     }
1744 
1745     long proc_correct = 0;
1746     long proc_partial = 0;
1747     long proc_spurious = 0;
1748     long proc_missing = 0;
1749     long proc_actual = 0;
1750     long proc_possible = 0;
1751     double proc_precision = 0;
1752     double proc_recall = 0;
1753     double proc_fmeasure = 0;
1754 
1755     if (hasProcessed) {
1756       // calculate values for processed
1757       proc_correct = (proc_correctByType.get(annotType== null:
1758                      ( (Longproc_correctByType.get(annotType)).longValue();
1759       proc_partial = (proc_partialByType.get(annotType== null:
1760                      ( (Longproc_partialByType.get(annotType)).longValue();
1761       proc_spurious = (proc_spurByType.get(annotType== null:
1762                       ( (Longproc_spurByType.get(annotType)).longValue();
1763       proc_missing = (proc_missingByType.get(annotType== null:
1764                      ( (Longproc_missingByType.get(annotType)).longValue();
1765       proc_actual = proc_correct + proc_partial + proc_spurious;
1766       proc_possible = proc_correct + proc_partial + proc_missing;
1767       //precision strict is correct/actual
1768       //precision is (correct + 0.5 * partially correct)/actual
1769       proc_precision = (proc_correct + 0.5 * proc_partial/ proc_actual;
1770       //recall strict is correct/possible
1771       proc_recall = (proc_correct + 0.5 * proc_partial/ proc_possible;
1772       //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1773       proc_fmeasure =
1774           ( (beta * beta + 1* proc_precision * proc_recall)
1775           /
1776           ( (beta * beta * proc_precision+ proc_recall);
1777 
1778     }
1779 
1780     // output data
1781     Out.prln("<TR>");
1782     if (hasProcessed)
1783       Out.prln("<TD>" + annotType + "_new" "</TD>");
1784     else
1785       Out.prln("<TD>" + annotType + "</TD>");
1786 
1787     Out.prln("<TD>" + correct + "</TD>");
1788     Out.prln("<TD>" + partial + "</TD>");
1789     Out.prln("<TD>" + missing + "</TD>");
1790     Out.prln("<TD>" + spurious + "</TD>");
1791 
1792     String strPrec = (isMoreInfoMode?
1793                      avgPrint(precision, 4)
1794                      : Double.toString(precision);
1795     String strRec = (isMoreInfoMode?
1796                     avgPrint(recall, 4)
1797                     : Double.toString(recall);
1798     String strFmes = (isMoreInfoMode?
1799                      avgPrint(fmeasure, 4)
1800                      : Double.toString(fmeasure);
1801 
1802     if (hasProcessed && (precision < proc_precision))
1803       Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1804     else if (hasProcessed && (precision > proc_precision))
1805       Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1806     else
1807       Out.prln("<TD>" + strPrec + "</TD>");
1808     if (hasProcessed && (recall < proc_recall))
1809       Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1810     else if (hasProcessed && (recall > proc_recall))
1811       Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1812     else
1813       Out.prln("<TD>" + strRec + "</TD>");
1814     Out.prln("<TD>" + strFmes + "</TD>");
1815     Out.prln("</TR>");
1816 
1817     if (hasProcessed) {
1818       // output data
1819       Out.prln("<TR>");
1820       Out.prln("<TD>" + annotType + "_old" "</TD>");
1821 
1822       Out.prln("<TD>" + proc_correct + "</TD>");
1823       Out.prln("<TD>" + proc_partial + "</TD>");
1824       Out.prln("<TD>" + proc_missing + "</TD>");
1825       Out.prln("<TD>" + proc_spurious + "</TD>");
1826 
1827       String strProcPrec = (isMoreInfoMode?
1828                            avgPrint(proc_precision, 4)
1829                            : Double.toString(proc_precision);
1830       String strProcRec = (isMoreInfoMode?
1831                           avgPrint(proc_recall, 4)
1832                           : Double.toString(proc_recall);
1833       String strProcFmes = (isMoreInfoMode?
1834                            avgPrint(proc_fmeasure, 4)
1835                            : Double.toString(proc_fmeasure);
1836 
1837       if (precision < proc_precision)
1838         Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1839       else if (precision > proc_precision)
1840         Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1841       else
1842         Out.prln("<TD>" + strProcPrec + "</TD>");
1843       if (recall < proc_recall)
1844         Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1845       else if (recall > proc_recall)
1846         Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1847       else
1848         Out.prln("<TD>" + strProcRec + "</TD>");
1849       Out.prln("<TD>" + strProcFmes + "</TD>");
1850       Out.prln("</TR>");
1851     }
1852   //printStatsForType
1853 
1854   //** Print @param value with @param count digits after decimal point */
1855   protected String avgPrint(double value, int count) {
1856     double newvalue;
1857     double power = Math.pow(10, count);
1858     newvalue = Math.round(value * power/ power;
1859     return Double.toString(newvalue);
1860   }
1861 
1862   private double precisionSumCalc = 0;
1863   private double recallSumCalc = 0;
1864   private double fMeasureSumCalc = 0;
1865 
1866   public double getPrecisionAverageCalc() {
1867     return precisionSumCalc;
1868   }
1869 
1870   public double getRecallAverageCalc() {
1871     return recallSumCalc;
1872   }
1873 
1874   public double getFmeasureAverageCalc() {
1875     return fMeasureSumCalc;
1876   }
1877 
1878   protected void calculateAvgTotal() {
1879     long correct, partial, spurious, missing;
1880     long correctSum, partialSum, spuriousSum, missingSum;
1881 
1882     if (annotTypes == null) {
1883       return;
1884     }
1885     correctSum = partialSum = spuriousSum = missingSum = 0;
1886 
1887     String annotType;
1888     for (int i = 0; i < annotTypes.size(); i++) {
1889       annotType = (StringannotTypes.get(i);
1890       correct = (correctByType.get(annotType== null:
1891                 ( (LongcorrectByType.get(annotType)).longValue();
1892       partial = (partialByType.get(annotType== null:
1893                 ( (LongpartialByType.get(annotType)).longValue();
1894       spurious = (spurByType.get(annotType== null:
1895                  ( (LongspurByType.get(annotType)).longValue();
1896       missing = (missingByType.get(annotType== null:
1897                 ( (LongmissingByType.get(annotType)).longValue();
1898       correctSum += correct;
1899       partialSum += partial;
1900       spuriousSum += spurious;
1901       missingSum += missing;
1902     //for
1903 
1904     long actual = correctSum + partialSum + spuriousSum;
1905     long possible = correctSum + partialSum + missingSum;
1906 
1907     if (actual == 0) {
1908       precisionSumCalc = 0;
1909     }
1910     else {
1911       precisionSumCalc = (correctSum + 0.5 * partialSum/ actual;
1912     }
1913 
1914     if (possible == 0) {
1915       recallSumCalc = 0;
1916     }
1917     else {
1918       recallSumCalc = (correctSum + 0.5 * partialSum/ actual;
1919     }
1920 
1921     if (precisionSumCalc == && recallSumCalc == 0) {
1922       fMeasureSumCalc = 0;
1923     }
1924     else {
1925       fMeasureSumCalc =
1926           ( (beta * beta + 1* precisionSumCalc * recallSumCalc)
1927           /
1928           ( (beta * beta * precisionSumCalc+ recallSumCalc);
1929 
1930     }
1931   // calculateAvgTotal
1932 
1933   protected AnnotationDiffer measureDocs(
1934       Document keyDoc, Document respDoc, String annotTypethrows
1935       ResourceInstantiationException {
1936 
1937     if (keyDoc == null || respDoc == null)
1938       return null;
1939 
1940     if (annotSetName != null
1941         && keyDoc.getAnnotations(annotSetName).get(annotType== null)
1942       return null;
1943     else if ( (annotSetName == null || annotSetName.equals(""))
1944              && keyDoc.getAnnotations().get(annotType== null)
1945       return null;
1946 
1947     // create an annotation diff
1948     AnnotationDiffer annotDiffer = new AnnotationDiffer();
1949     // set the feature names set for annotation differ
1950     annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1951     // we need to find the sets
1952     AnnotationSet keys, responses;
1953     if (annotSetName == null || annotSetName.equals("")) {
1954       keys = keyDoc.getAnnotations().get(annotType);
1955       responses = respDoc.getAnnotations().get(annotType);
1956     }
1957     else {
1958       keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1959       responses = respDoc.getAnnotations(outputSetName).get(annotType);
1960     }
1961 
1962     // we have annotation sets so call the annotationDiffer
1963     List pairings = annotDiffer.calculateDiff(keys, responses);
1964     return annotDiffer;
1965   // measureDocs
1966 
1967   protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1968                                   Document keyDoc, Document respDoc,
1969                                   Writer errFileWriter) {
1970     if (errFileWriter == null)return// exit on "no file"
1971 
1972     try {
1973       // extract and store annotations
1974       Comparator comp = new OffsetComparator();
1975       TreeSet sortedSet = new TreeSet(comp);
1976       Set missingSet =
1977           annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1978       sortedSet.clear();
1979       sortedSet.addAll(missingSet);
1980       storeAnnotations(type + ".miss", sortedSet, keyDoc, errFileWriter);
1981       Set spuriousSet =
1982           annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1983       sortedSet.clear();
1984       sortedSet.addAll(spuriousSet);
1985       storeAnnotations(type + ".spur", sortedSet, respDoc, errFileWriter);
1986       Set partialSet =
1987           annotDiffer.getAnnotationsOfType(AnnotationDiffer.
1988                                            PARTIALLY_CORRECT_TYPE);
1989       sortedSet.clear();
1990       sortedSet.addAll(partialSet);
1991       storeAnnotations(type + ".part", sortedSet, respDoc, errFileWriter);
1992     }
1993     catch (Exception ex) {
1994       Out.prln("Exception on close of error file " + errFileWriter + ": "
1995                + ex.getMessage());
1996     }
1997   // storeAnnotations
1998 
1999   protected void storeAnnotations(String type, Set set, Document doc,
2000                                   Writer filethrows IOException {
2001 
2002     if (set == null || set.isEmpty())
2003       return;
2004 
2005     Iterator iter = set.iterator();
2006     Annotation ann;
2007     while (iter.hasNext()) {
2008       ann = (Annotationiter.next();
2009       file.write(type);
2010       file.write(".");
2011       file.write(doc.getContent().toString().substring(
2012           ann.getStartNode().getOffset().intValue(),
2013           ann.getEndNode().getOffset().intValue()));
2014       file.write(".");
2015       file.write(ann.getStartNode().getOffset().toString());
2016       file.write(".");
2017       file.write(ann.getEndNode().getOffset().toString());
2018       file.write("\n");
2019     //while
2020   // storeAnnotations
2021 
2022   protected void printAnnotations(AnnotationDiffer annotDiff,
2023                                   Document keyDoc, Document respDoc) {
2024     Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
2025     Set missingSet =
2026         annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2027     printAnnotations(missingSet, keyDoc);
2028     Out.prln("<BR>");
2029 
2030     Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
2031     Set spuriousSet =
2032         annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2033     printAnnotations(spuriousSet, respDoc);
2034     Out.prln("</BR>");
2035 
2036     Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
2037     Set partialSet =
2038         annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
2039     printAnnotations(partialSet, respDoc);
2040   }
2041 
2042   protected void printAnnotations(Set set, Document doc) {
2043     if (set == null || set.isEmpty())
2044       return;
2045 
2046     Iterator iter = set.iterator();
2047     while (iter.hasNext()) {
2048       Annotation ann = (Annotationiter.next();
2049       Out.prln(
2050           "<B>" +
2051           doc.getContent().toString().substring(
2052           ann.getStartNode().getOffset().intValue(),
2053           ann.getEndNode().getOffset().intValue()) +
2054           "</B>: <I>[" + ann.getStartNode().getOffset() +
2055           "," + ann.getEndNode().getOffset() "]</I>"
2056 //        + "; features" + ann.getFeatures()
2057           );
2058     //while
2059   //printAnnotations
2060 
2061   /**
2062    * The directory from which we should generate/evaluate the corpus
2063    */
2064   private File startDir;
2065   private File currDir;
2066   private static List annotTypes;
2067 
2068   private Controller application = null;
2069   private File applicationFile = null;
2070 
2071   //collect the sum of all precisions and recalls of all docs
2072   //and the number of docs, so I can calculate the average for
2073   //the corpus at the end
2074   private double precisionSum = 0.0;
2075   private double recallSum = 0.0;
2076   private double fMeasureSum = 0.0;
2077   private HashMap precisionByType = new HashMap();
2078   private HashMap prCountByType = new HashMap();
2079   private HashMap recallByType = new HashMap();
2080   private HashMap recCountByType = new HashMap();
2081   private HashMap fMeasureByType = new HashMap();
2082   private HashMap fMeasureCountByType = new HashMap();
2083 
2084   private HashMap missingByType = new HashMap();
2085   private HashMap spurByType = new HashMap();
2086   private HashMap correctByType = new HashMap();
2087   private HashMap partialByType = new HashMap();
2088 
2089   // statistic for processed
2090   static boolean hasProcessed = false;
2091   private double proc_precisionSum = 0;
2092   private double proc_recallSum = 0;
2093   private double proc_fMeasureSum = 0;
2094   private HashMap proc_precisionByType = new HashMap();
2095   private HashMap proc_prCountByType = new HashMap();
2096   private HashMap proc_recallByType = new HashMap();
2097   private HashMap proc_recCountByType = new HashMap();
2098   private HashMap proc_fMeasureByType = new HashMap();
2099   private HashMap proc_fMeasureCountByType = new HashMap();
2100 
2101   private HashMap proc_missingByType = new HashMap();
2102   private HashMap proc_spurByType = new HashMap();
2103   private HashMap proc_correctByType = new HashMap();
2104   private HashMap proc_partialByType = new HashMap();
2105 
2106   double beta = 1;
2107 
2108   private int docNumber = 0;
2109 
2110   /**
2111    * If true, the corpus tool will generate the corpus, otherwise it'll
2112    * run in evaluate mode
2113    */
2114   private boolean isGenerateMode = false;
2115 
2116   /**
2117    * If true - show annotations for docs below threshold
2118    */
2119   private boolean isVerboseMode = false;
2120 
2121   /**
2122    * If true - show more info in document table
2123    */
2124   private boolean isMoreInfoMode = false;
2125 
2126   /**
2127    * The list of features used in the AnnotationDiff separated by comma
2128    * Example: "class;inst"
2129    */
2130   private Set diffFeaturesSet;
2131 
2132   /**
2133    * If true, the corpus tool will evaluate stored against the human-marked
2134    * documents
2135    */
2136   private boolean isMarkedStored = false;
2137   private boolean isMarkedClean = false;
2138 
2139   //whether marked are in a DS, not xml
2140   private boolean isMarkedDS = false;
2141 
2142   private String annotSetName = "Key";
2143   private String outputSetName = null;
2144 
2145   private double threshold = 0.5;
2146   private Properties configs = new Properties();
2147   private static int corpusWordCount = 0;
2148 
2149   private String documentEncoding = "";
2150 
2151   /** String to print when wrong command-line args */
2152   private static String usage =
2153       "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
2154       "[-verbose] [-moreinfo] directory-name application";
2155 
2156 }