0001 /*
0002 * DocTimeReporter.java
0003 *
0004 * Copyright (c) 2008-2009, Intelius, Inc.
0005 *
0006 * This file is part of GATE (see http://gate.ac.uk/), and is free
0007 * software, licenced under the GNU Library General Public License,
0008 * Version 2, June 1991 (in the distribution as file licence.html,
0009 * and also available at http://gate.ac.uk/gate/licence.html).
0010 *
0011 * Chirag Viradiya & Andrew Borthwick, 30/Sep/2009
0012 *
0013 * $Id$
0014 */
0015 package gate.util.reporting;
0016
0017 import java.io.BufferedReader;
0018 import java.io.BufferedWriter;
0019 import java.io.File;
0020 import java.io.FileReader;
0021 import java.io.FileWriter;
0022 import java.io.IOException;
0023 import java.io.RandomAccessFile;
0024 import java.util.ArrayList;
0025 import java.util.Collections;
0026 import java.util.Date;
0027 import java.util.HashSet;
0028 import java.util.Iterator;
0029 import java.util.LinkedHashMap;
0030 import java.util.List;
0031 import java.util.StringTokenizer;
0032 import java.util.Timer;
0033 import java.util.TimerTask;
0034 import java.util.Vector;
0035 import java.util.regex.Matcher;
0036 import java.util.regex.Pattern;
0037
0038 import gate.util.reporting.exceptions.BenchmarkReportExecutionException;
0039 import gate.util.reporting.exceptions.BenchmarkReportFileAccessException;
0040 import gate.util.reporting.exceptions.BenchmarkReportInputFileFormatException;
0041 import gnu.getopt.Getopt;
0042
0043 /**
0044 * A reporter class to generate a report on time taken by each document within
0045 * given corpus.
0046 */
0047 public class DocTimeReporter implements BenchmarkReportable {
0048
0049 /** A File handle to input benchmark file. */
0050 private File benchmarkFile = new File("benchmark.txt");
0051 /** Report media. */
0052 private String printMedia = MEDIA_HTML;
0053 /** No of documents to be displayed against matching PRs. */
0054 private int maxDocumentInReport = 10;
0055 /** Search string, could be a PR name. */
0056 private String PRMatchingRegex = MATCH_ALL_PR_REGEX;
0057 /** A marker indicating the start of current logical run. */
0058 private String logicalStart = null;
0059 /** Path where to save the report file. */
0060 private File reportFile;
0061
/**
 * A HashSet containing the names of the documents matching the given search
 * string.
 */
0066 private HashSet<String> allDocs = new HashSet<String>();
/**
 * A HashSet containing the PR names matching the search string. Used for
 * display in the report header.
 */
0071 private HashSet<String> matchingPRs = new HashSet<String>();
0072 /** Total time taken by the given pipeline for the current logical run. */
0073 private float globalTotal = 0;
0074 /** A LinkedHashMap containing the documents matching the given PRs. */
0075 private LinkedHashMap<String, Object> docContainer = new LinkedHashMap<String, Object>();
0076 /**
0077 * Folder where the benchmark.txt files are created for specific pipeline log
0078 * entries.
0079 */
0080 private File temporaryDirectory;
0081 /** Name of the given pipeline */
0082 private String pipelineName = "";
0083 /** Status flag for normal exit. */
0084 private static final int STATUS_NORMAL = 0;
0085 /** Status flag for error exit. */
0086 private static final int STATUS_ERROR = 1;
0087 /** Chunk size in which file will be read */
0088 private static final int FILE_CHUNK_SIZE = 2000;
0089 /** An OS independent line separator */
0090 private static final String NL = System.getProperty("line.separator");
0091 /**
0092 * An integer containing the count of total valid log entries present in input
0093 * file provided.
0094 */
0095 public int validEntries = 0;
0096
0097 /**
0098 * This string constant when set as print media indicates that the report is
0099 * printed in TEXT format.
0100 */
0101 public static final String MEDIA_TEXT = "text";
0102 /**
0103 * This string constant when set as print media indicates that the report is
0104 * printed in HTML format.
0105 */
0106 public static final String MEDIA_HTML = "html";
0107
0108 /**
0109 * This integer constant when set as No of Docs indicates that the report have
0110 * all the documents matching a given PR.
0111 */
0112 public static final int ALL_DOCS = -1;
0113
0114 /**
0115 * The default value for search string matching PRs for given run.
0116 */
0117 public static final String MATCH_ALL_PR_REGEX = "all_prs";
0118
/**
 * Creates a reporter with the default settings and sets up the temporary
 * directory used for intermediate benchmark files.
 */
public DocTimeReporter() {
  // prepare the scratch directory before anything else uses it
  initTmpDir();
}
0126
/**
 * Creates a reporter configured from command line arguments. The temporary
 * directory is set up first, then the arguments are handed to
 * {@link #parseArguments(String[])}.
 *
 * @param args array containing the command line arguments.
 */
DocTimeReporter(String[] args) {
  initTmpDir();
  parseArguments(args);
}
0136
0137 private void initTmpDir() {
0138 try {
0139 temporaryDirectory = File.createTempFile("benchmark-reports", "", null);
0140 if (!temporaryDirectory.delete()
0141 || !temporaryDirectory.mkdir()) {
0142 throw new IOException("Unable to create temporary directory.\n"
0143 + temporaryDirectory.getCanonicalPath());
0144 }
0145 } catch (IOException e) {
0146 e.printStackTrace();
0147 }
0148 }
0149
0150
0151 /**
0152 * Calculates the total of the time taken by processing element at each leaf
0153 * level. Also calculates the difference between the actual time taken by the
0154 * resources and system noted time.
0155 *
0156 * @param reportContainer
0157 * An Object of type LinkedHashMap<String, Object> containing the
0158 * processing elements (with time in milliseconds) in hierarchical
0159 * structure.
0160 * @return An Object containing modified hierarchical structure of processing
0161 * elements with totals and All others embedded in it.
0162 */
0163 public Object calculate(Object reportContainer) {
0164 return sortHashMapByValues(
0165 doTotal((LinkedHashMap<String, Object>) reportContainer));
0166 }
0167
0168 /**
0169 * Sorts LinkedHashMap by its values(natural descending order). keeps the
0170 * duplicates as it is.
0171 *
0172 * @param passedMap
0173 * An Object of type LinkedHashMap to be sorted by its values.
0174 * @return An Object containing the sorted LinkedHashMap.
0175 */
0176 private LinkedHashMap sortHashMapByValues(LinkedHashMap passedMap) {
0177 List mapKeys = new ArrayList(passedMap.keySet());
0178 List mapValues = new ArrayList(passedMap.values());
0179
0180 Collections.sort(mapValues, new ValueComparator());
0181 Collections.sort(mapKeys);
0182 // Reversing the collection to sort the values in descending order
0183 Collections.reverse(mapValues);
0184 LinkedHashMap sortedMap = new LinkedHashMap();
0185
0186 Iterator<Integer> valueIt = mapValues.iterator();
0187 while (valueIt.hasNext()) {
0188 Object val = valueIt.next();
0189 Iterator<String> keyIt = mapKeys.iterator();
0190 while (keyIt.hasNext()) {
0191 Object key = keyIt.next();
0192 String comp1 = passedMap.get(key).toString();
0193 String comp2 = val.toString();
0194
0195 if (comp1.equals(comp2)) {
0196 passedMap.remove(key);
0197 mapKeys.remove(key);
0198 sortedMap.put(key, val);
0199 break;
0200 }
0201 }
0202 }
0203 return sortedMap;
0204 }
0205
0206 /**
0207 * Computes the sub totals at each processing level.
0208 *
0209 * @param reportContainer
0210 * An Object of type LinkedHashMap<String, Object> containing the
0211 * processing elements (with time in milliseconds) in hierarchical
0212 * structure.
0213 * @return An Object containing the LinkedHashMap with the element values
0214 * totaled.
0215 */
0216 private LinkedHashMap<String, Object> doTotal(
0217 LinkedHashMap<String, Object> reportContainer) {
0218 LinkedHashMap<String, Object> myHash =
0219 (LinkedHashMap<String, Object>) reportContainer;
0220 Iterator<String> i = myHash.keySet().iterator();
0221 while (i.hasNext()) {
0222 Object key = i.next();
0223 if (myHash.get(key) instanceof LinkedHashMap) {
0224 docContainer = doTotal((LinkedHashMap<String, Object>) (myHash
0225 .get(key)));
0226 } else {
0227 if (docContainer.get((String) key) == null) {
0228 docContainer.put((String) key, myHash.get(key));
0229 } else {
0230 // Do total if value already exists
0231 int val = Integer.parseInt((String) docContainer.get((String) key))
0232 + Integer.parseInt((String) myHash.get(key));
0233 docContainer.put((String) key, Integer.toString(val));
0234 }
0235 }
0236 }
0237 return docContainer;
0238 }
0239
0240 /**
0241 * Prints a report as per the value provided for print media option.
0242 *
0243 * @param reportSource
0244 * An Object of type LinkedHashMap<String, Object> containing the
0245 * processing elements (with time in milliseconds) in hierarchical
0246 * structure.
0247 * @param outputFile
0248 * Path where to save the report.
0249 */
0250 public void printReport(Object reportSource, File outputFile) {
0251 if (printMedia.equalsIgnoreCase(MEDIA_TEXT)) {
0252 printToText(reportSource, outputFile);
0253 } else if (printMedia.equalsIgnoreCase(MEDIA_HTML)) {
0254 printToHTML((LinkedHashMap<String, Object>) reportSource, outputFile);
0255 }
0256 }
0257
0258 /**
0259 * Prints benchmark report in text format.
0260 *
0261 * @param reportContainer
0262 * An Object of type LinkedHashMap<String, Object> containing the
0263 * document names (with time in milliseconds) in hierarchical
0264 * structure.
0265 * @param outputFile
0266 * An object of type File representing the output report file.
0267 */
0268 private void printToText(Object reportContainer, File outputFile) {
0269 ArrayList<String> printLines = new ArrayList<String>();
0270 LinkedHashMap<String, Object> rcHash =
0271 (LinkedHashMap<String, Object>) reportContainer;
0272 String docs = "";
0273 if (maxDocumentInReport != ALL_DOCS) {
0274 if (allDocs.size() < maxDocumentInReport) {
0275 docs = Integer.toString(allDocs.size());
0276 } else {
0277 docs = Integer.toString(maxDocumentInReport);
0278 }
0279
0280 } else {
0281 docs = "All";
0282 }
0283 printLines
0284 .add("============================================================="
0285 + NL);
0286 if (PRMatchingRegex.equals(MATCH_ALL_PR_REGEX)) {
0287 printLines.add("Top " + docs
0288 + " expensive documents matching All PRs in " + pipelineName
0289 + NL);
0290 } else {
0291 if (matchingPRs.size() > 0) {
0292 printLines.add("Top " + docs
0293 + " expensive documents matching following PRs in " + pipelineName
0294 + NL);
0295 for (String pr : matchingPRs) {
0296 printLines.add("\t" + pr + NL);
0297 }
0298 } else {
0299 printLines.add("No PRs matched to search string \""
0300 + getPRMatchingRegex() + "\"" + " in " + pipelineName);
0301 printLines.add(NL);
0302 printLines
0303 .add("============================================================="
0304 + NL);
0305 }
0306
0307 }
0308 if (allDocs.size() > 0) {
0309 printLines
0310 .add("============================================================="
0311 + NL);
0312 printLines.add("Document Name" + "\t" + "Time (in seconds)" + "\t" + "%"
0313 + NL);
0314 printLines
0315 .add("-------------------------------------------------------------"
0316 + NL);
0317 }
0318 Iterator<String> i = rcHash.keySet().iterator();
0319 int count = 0;
0320 // Iterating over the report container
0321 while (i.hasNext()) {
0322 Object key = i.next();
0323 if (!((String) key).equals("total")) {
0324 int value = Integer.parseInt((String) rcHash.get(key));
0325 if (maxDocumentInReport == ALL_DOCS)
0326 printLines.add(key + "\t" + value / 1000.0 + "\t"
0327 + Math.round(((value / globalTotal) * 100) * 10) / 10.0
0328 + NL);
0329 else if (count < maxDocumentInReport)
0330 printLines.add(key + "\t" + value / 1000.0 + "\t"
0331 + Math.round(((value / globalTotal) * 100) * 10) / 10.0
0332 + NL);
0333 }
0334 count++;
0335 }
0336 if (allDocs.size() > 0) {
0337 printLines
0338 .add("-------------------------------------------------------------"
0339 + NL);
0340 printLines.add("Pipeline Total" + "\t" + globalTotal / 1000.0 + "\t"
0341 + 100 + NL + NL + NL);
0342 }
0343 BufferedWriter out = null;
0344 try {
0345 // Writing to report file
0346 out = new BufferedWriter(new FileWriter(outputFile, true));
0347 for (String line : printLines) {
0348 out.write(line);
0349 }
0350
0351 } catch (IOException e) {
0352 e.printStackTrace();
0353
0354 } finally {
0355 try {
0356 if (out != null) { out.close(); }
0357 } catch (IOException e) {
0358 e.printStackTrace();
0359 }
0360 }
0361 }
0362
0363 /**
0364 * Stores GATE processing elements and the time taken by them in an in-memory
0365 * data structure for report generation.
0366 *
0367 * @param inputFile
0368 * A handle to the input benchmark file.
0369 *
0370 * @return An Object of type LinkedHashMap<String, Object> containing the
0371 * processing elements (with time in milliseconds) in hierarchical
0372 * structure. Null if there was an error.
0373 *
0374 * @throws BenchmarkReportInputFileFormatException
0375 * if the input file provided is not a valid benchmark file.
0376 */
0377 public Object store(File inputFile)
0378 throws BenchmarkReportInputFileFormatException {
0379 String[] temp = inputFile.getAbsolutePath().split("\\" + File.separator);
0380 pipelineName = temp[temp.length - 1].replace("_benchmark.txt", "");
0381 LinkedHashMap<String, Object> globalStore =
0382 new LinkedHashMap<String, Object>();
0383 BufferedReader in = null;
0384 try {
0385 in = new BufferedReader(new FileReader(inputFile));
0386 String str;
0387 String docName = null;
0388 String matchedPR = null;
0389 String startToken = null;
0390 // Reading the benchmark.txt one line at a time
0391 Pattern pattern = Pattern.compile("(\\d+) (\\d+) (.*) (.*) \\{(.*)\\}");
0392 // Pattern matching for extracting document name
0393 Pattern patternDocName = Pattern.compile(".*documentName=(.*?)[,|}].*");
0394 while ((str = in.readLine()) != null) {
0395 if (str.matches(".*START.*")) {
0396 String[] splittedStartEntry = str.split("\\s");
0397 if (splittedStartEntry.length > 2) {
0398 startToken = splittedStartEntry[2];
0399 } else {
0400 throw new BenchmarkReportInputFileFormatException(
0401 getBenchmarkFile() + " is invalid.");
0402 }
0403 }
0404 Matcher matcher = pattern.matcher(str);
0405 Matcher matcherDocName = patternDocName.matcher(str);
0406 Pattern patternDocEnd = Pattern.compile("(\\d+) (\\d+) " + Pattern.quote(startToken)
0407 + " (.*) \\{(.*)\\}.*");
0408 Matcher matcherDocEnd = patternDocEnd.matcher(str);
0409 if (matcherDocName != null) {
0410 if (matcherDocName.matches()) {
0411 docName = matcherDocName.group(1);
0412
0413 }
0414 }
0415 if (matcherDocEnd != null) {
0416 if (matcherDocEnd.matches()) {
0417
0418 globalTotal = globalTotal
0419 + Integer.parseInt(matcherDocEnd.group(2));
0420 }
0421 }
0422 if (matcher != null && matcher.matches()) {
0423 String benchmarkIDs = matcher.group(3).replaceFirst(Pattern.quote(startToken) + ".",
0424 "").replaceFirst("doc_" + Pattern.quote(docName) + ".", "");
0425 String[] splittedBenchmarkIDs = benchmarkIDs.split("\\.");
0426 // Getting the exact PR name and storing only entries matching PR name
0427 if (PRMatchingRegex.equals(MATCH_ALL_PR_REGEX)) {
0428 if (splittedBenchmarkIDs.length > 0) {
0429 matchedPR = splittedBenchmarkIDs[0];
0430 }
0431 if (!matchedPR.equalsIgnoreCase(startToken)) {
0432 organizeEntries(globalStore, matchedPR, matcher.group(2), docName);
0433 }
0434 } else if (isPRMatched(benchmarkIDs, PRMatchingRegex)) {
0435 if (splittedBenchmarkIDs.length > 0) {
0436 matchedPR = splittedBenchmarkIDs[0];
0437 }
0438 if (matchedPR != null)
0439 matchingPRs.add(matchedPR);
0440 organizeEntries(globalStore, matchedPR, matcher.group(2), docName);
0441 }
0442 }
0443 }
0444
0445 } catch (IOException e) {
0446 e.printStackTrace();
0447 globalStore = null;
0448
0449 } finally {
0450 try {
0451 if (in != null) { in.close(); }
0452 } catch (IOException e) {
0453 e.printStackTrace();
0454 globalStore = null;
0455 }
0456 }
0457 return globalStore;
0458 }
0459
0460 /**
0461 * Organizes the valid data extracted from the log entries into LinkedHashMap.
0462 *
0463 * @param store
0464 * A global LinkedHashMap containing the processing elements (with
0465 * time in milliseconds) in hierarchical structure.
0466 * @param matchedPR
0467 * A PR matching the given search string.
0468 * @param bTime
0469 * Time taken by the specific processing element.
0470 * @param docName
0471 * Name of the document being processed.
0472 */
0473 private void organizeEntries(LinkedHashMap<String, Object> store,
0474 String matchedPR, String bTime, String docName) {
0475 allDocs.add(docName);
0476 if (store.containsKey(matchedPR)) {
0477 ((LinkedHashMap<String, Object>) store.get(matchedPR))
0478 .put(docName, bTime);
0479 } else {
0480 LinkedHashMap<String, Object> tempLHM = new LinkedHashMap<String, Object>();
0481 tempLHM.put(docName, bTime);
0482 store.put(matchedPR, tempLHM);
0483 }
0484 }
0485
0486 /**
0487 * Prints the document level statistics report in HTML format.
0488 *
0489 * @param reportSource
0490 * An Object of type LinkedHashMap<String, Object> containing the
0491 * document names (with time in milliseconds).
0492 * @param outputFile
0493 * An object of type File representing the output report file to
0494 * which the HTML report is to be written.
0495 */
0496 private void printToHTML(LinkedHashMap<String, Object> reportSource,
0497 File outputFile) {
0498 String htmlReport =
0499 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + NL +
0500 "\"http://www.w3.org/TR/html4/loose.dtd\">" + NL +
0501 "<html><head><title>Benchmarking Report</title>" + NL +
0502 "<meta http-equiv=\"Content-Type\"" +
0503 " content=\"text/html; charset=utf-8\">" + NL +
0504 "<style type=\"text/css\">" + NL +
0505 "div { font-size:12px; margin-top: 4; }" + NL +
0506 "</style>" + NL +
0507 "</head>" + NL +
0508 "<body style=\"font-family:Verdana; color:navy;\">" + NL;
0509 String hTrace =
0510 "<div style=\"right: 0pt; border-top:1px solid #C9D7F1;" +
0511 " font-size:1px;\" ></div>" + NL;
0512 String reportTitle = hTrace;
0513 String docs = "";
0514 if (maxDocumentInReport != ALL_DOCS) {
0515 if (allDocs.size() < maxDocumentInReport) {
0516 docs = Integer.toString(allDocs.size());
0517 } else {
0518 docs = Integer.toString(maxDocumentInReport);
0519 }
0520 } else {
0521 docs = "All";
0522 }
0523 if (PRMatchingRegex.equals(MATCH_ALL_PR_REGEX)) {
0524 reportTitle = reportTitle
0525 + "<div style=\"font-size:15px;font-family:Verdana; color:navy;\">Top "
0526 + docs + " expensive documents matching All PRs in <b>"
0527 + pipelineName + "</b></div>" + NL;
0528 } else {
0529 if (matchingPRs.size() > 0) {
0530 reportTitle = reportTitle
0531 + "<div style=\"font-size:15px;font-family:Verdana; color:navy;\">Top "
0532 + docs + " expensive documents matching following PRs in <b>"
0533 + pipelineName + "</b> <ul>" + NL;
0534 for (String pr : matchingPRs) {
0535 reportTitle = reportTitle + "<li>" + pr + "</li>";
0536 }
0537 reportTitle = reportTitle + "</ul></div>";
0538 } else {
0539 reportTitle +=
0540 "<div style=\"font-size:15px;font-family:Verdana; color:navy;\">" +
0541 "No PRs matched to search string \"" +
0542 getPRMatchingRegex() + " \" in " + pipelineName + "</div>";
0543 }
0544 }
0545 reportTitle = reportTitle + hTrace;
0546
0547 if (allDocs.size() > 0) {
0548 String htmlReportTitle = reportTitle +
0549 "<table><tr bgcolor=\"#eeeeff\">" +
0550 "<td><b>Document Name</b></td>" +
0551 "<td><b>Time in seconds</b></td>" +
0552 "<td><b>% Time taken</b></td>" +
0553 "</tr><tr>" + NL;
0554 String documentNameHTMLString = "<td rowspan = '112' width = '550'>";
0555 String timeTakenHTMLString = "<td width = '100'>";
0556 String timeInPercentHTMLString = "<td width = '100'>";
0557 LinkedHashMap<String, Object> rcHash =
0558 (LinkedHashMap<String, Object>) reportSource;
0559 rcHash.remove("total");
0560 Iterator<String> i = rcHash.keySet().iterator();
0561 int count = 0;
0562 while (i.hasNext()) {
0563 Object key = i.next();
0564 if (!((String) key).equals("total")) {
0565 int value = Integer.parseInt((String) rcHash.get(key));
0566 if (maxDocumentInReport == ALL_DOCS) {
0567 documentNameHTMLString += "<div>" + key + "</div>";
0568 timeTakenHTMLString += "<div>" + value / 1000.0 + "</div>";
0569 timeInPercentHTMLString += "<div>"
0570 + Math.round(((value / globalTotal) * 100) * 10) / 10.0
0571 + "</div>" + NL;
0572 } else if (count < maxDocumentInReport) {
0573 documentNameHTMLString += "<div>" + key + "</div>";
0574 timeTakenHTMLString += "<div>" + value / 1000.0 + "</div>";
0575 timeInPercentHTMLString += "<div>"
0576 + Math.round(((value / globalTotal) * 100) * 10) / 10.0
0577 + "</div>" + NL;
0578 }
0579 }
0580 count++;
0581 }
0582 documentNameHTMLString +=
0583 "<div bgcolor=\"#eeeeff\" style = \"font-size:15px;margin-left:400px;\">" +
0584 "<b>Total</b></div></td>" + NL;
0585 timeTakenHTMLString +=
0586 "<div bgcolor=\"#eeeeff\" style = \"font-size:15px;\"><b>" +
0587 globalTotal / 1000.0 + "</b></div></td>" + NL;
0588 timeInPercentHTMLString +=
0589 "<div bgcolor=\"#eeeeff\" style = \"font-size:15px;\">" +
0590 "<b>100</b></div></td>" + NL;
0591
0592 if (!outputFile.exists()) {
0593 htmlReport += htmlReportTitle + documentNameHTMLString
0594 + timeTakenHTMLString + timeInPercentHTMLString + "</tr></table>";
0595 } else {
0596 htmlReport = "<br/><br/>" + htmlReportTitle + documentNameHTMLString
0597 + timeTakenHTMLString + timeInPercentHTMLString
0598 + "</tr></table></body></html>";
0599 }
0600 } else {
0601 htmlReport += reportTitle + "</body></html>";
0602 }
0603
0604 BufferedWriter out = null;
0605 try {
0606 out = new BufferedWriter(new FileWriter(outputFile));
0607 out.write(htmlReport);
0608
0609 } catch (IOException e) {
0610 e.printStackTrace();
0611
0612 } finally {
0613 try {
0614 if (out != null) { out.close(); }
0615 } catch (IOException e) {
0616 e.printStackTrace();
0617 }
0618 }
0619 }
0620
0621 /**
0622 * Ignores the inconsistent log entries from the benchmark file. Entries from
0623 * modules like pronominal coreferencer which have not been converted to new
0624 * benchmarking conventions are ignored.
0625 *
0626 * @param benchmarkIDChain
0627 * the chain of benchmark ids. This is the third token in the
0628 * benchmark file.
0629 * @param startTokens
0630 * an array of first tokens in the benchmark id chain.
0631 *
0632 * @return true if valid log entry; false otherwise.
0633 */
0634 private boolean validateLogEntry(String benchmarkIDChain,
0635 ArrayList<String> startTokens) {
0636 String startTokenRegExp = "(";
0637 for (int i = 0; i < startTokens.size(); i++) {
0638 if ((benchmarkIDChain.split("\\.")).length == 1
0639 && benchmarkIDChain.equals(startTokens.get(i))) {
0640 validEntries += 1;
0641 return true;
0642 }
0643 startTokenRegExp += startTokens.get(i) + "|";
0644 }
0645 if (startTokenRegExp.length() > 1) {
0646 startTokenRegExp = startTokenRegExp.substring(0, startTokenRegExp
0647 .length() - 1);
0648 }
0649 startTokenRegExp += ")";
0650 if (benchmarkIDChain.matches(startTokenRegExp + "\\.doc_.*?\\.pr_.*")) {
0651 validEntries += 1;
0652 return true;
0653 } else
0654 return false;
0655 }
0656
0657 /**
0658 * Parses the report command lime arguments.
0659 *
0660 * @param args array containing the command line arguments.
0661 */
0662 public void parseArguments(String[] args) {
0663 Getopt g = new Getopt("gate.util.reporting.DocTimeReporter", args,
0664 "i:m:d:p:o:l:h");
0665 int c;
0666 String argNoOfDocs = null;
0667 while ((c = g.getopt()) != -1) {
0668 switch (c) {
0669 // -i inputFile
0670 case 'i':
0671 String argInPath = g.getOptarg();
0672 if (argInPath != null) {
0673 setBenchmarkFile(new File(argInPath));
0674 }
0675 break;
0676 // -m printMedia
0677 case 'm':
0678 String argPrintMedia = g.getOptarg();
0679 if (argPrintMedia != null) {
0680 setPrintMedia(argPrintMedia);
0681 }
0682 break;
0683 // -d noOfDocs
0684 case 'd':
0685 argNoOfDocs = g.getOptarg();
0686 if (argNoOfDocs == null) {
0687 setMaxDocumentInReport(maxDocumentInReport);
0688 }
0689 break;
0690 // -p prName
0691 case 'p':
0692 String argPrName = g.getOptarg();
0693 if (argPrName != null) {
0694 setPRMatchingRegex(argPrName);
0695 } else {
0696 setPRMatchingRegex(PRMatchingRegex);
0697 }
0698 break;
0699 // -o Report File
0700 case 'o':
0701 String argOutPath = g.getOptarg();
0702 if (argOutPath != null) {
0703 setReportFile(new File(argOutPath));
0704 }
0705 break;
0706 // -l logical start
0707 case 'l':
0708 String argLogicalStart = g.getOptarg();
0709 if (argLogicalStart != null) {
0710 setLogicalStart(argLogicalStart);
0711 }
0712 break;
0713 // -h usage information
0714 case 'h':
0715 case '?':
0716 usage();
0717 System.exit(STATUS_NORMAL);
0718 break;
0719
0720 default:
0721 usage();
0722 System.exit(STATUS_ERROR);
0723 break;
0724
0725 } // getopt switch
0726 }
0727 if (argNoOfDocs != null) {
0728 try {
0729 setMaxDocumentInReport(Integer.parseInt(argNoOfDocs));
0730 } catch (NumberFormatException e) {
0731 e.printStackTrace();
0732 usage();
0733 System.exit(STATUS_ERROR);
0734 }
0735 }
0736 }
0737
0738 /**
0739 * Returns the name of the media on which report will be generated. e.g. text,
0740 * HTML.
0741 *
0742 * @return printMedia A String containing the name of the media on which
0743 * report will be generated.
0744 */
0745 public String getPrintMedia() {
0746 return printMedia;
0747 }
0748
0749 /**
0750 * Sets the media on which report will be generated.
0751 *
0752 * @param printMedia Type of media on which the report will be generated.
0753 * Must be MEDIA_TEXT or MEDIA_HTML.
0754 * The default is MEDIA_HTML.
0755 */
0756 public void setPrintMedia(String printMedia) {
0757 if (!printMedia.equals(MEDIA_HTML)
0758 && !printMedia.equals(MEDIA_TEXT)) {
0759 throw new IllegalArgumentException("Illegal argument: " + printMedia);
0760 }
0761 this.printMedia = printMedia.trim();
0762 }
0763
0764 /**
0765 * Provides the functionality to match a user input string with the PR in the
0766 * given benchmark ids.
0767 *
0768 * @param benchmarkIDs
0769 * A string of benchmarkIDs containing the PR name at the start of
0770 * string.
0771 * @param searchString
0772 * The string to be matched for PR name.
0773 *
0774 * @return boolean true if search string matches PR name; false otherwise.
0775 */
0776 private boolean isPRMatched(String benchmarkIDs, String searchString) {
0777 String prName = benchmarkIDs.split("\\.")[0];
0778 // Remove leading and trailing whitespaces of search string
0779 searchString = searchString.trim();
0780 // Remove "pr" or "pr_" appearing in start of the prName string
0781 searchString = searchString.replaceAll("^(pr|pr_)", "");
0782 // Replace underscores with a space in the search string
0783 searchString = searchString.replaceAll("_", " ");
0784 // Replace multiple spaces with a single space
0785 searchString = searchString.replaceAll("\\s+", " ");
0786 searchString = searchString.trim();
0787 // Remove "pr_" appearing in start of the prName string
0788 String processedPRName = prName.replaceAll("^pr_", "");
0789 // Replace underscores with a space in the prName
0790 processedPRName = processedPRName.replaceAll("_", " ");
0791 if (prName.startsWith("pr_")) {
0792 return processedPRName.matches("(?i).*" + searchString + ".*");
0793 } else {
0794 return false;
0795 }
0796 }
0797
0798 /**
0799 * A method for deleting a given file.
0800 *
0801 * @param fileToBeDeleted
0802 * A handle of the file to be deleted.
0803 * @throws BenchmarkReportFileAccessException
0804 * if a given file could not be deleted.
0805 */
0806 private void deleteFile(File fileToBeDeleted)
0807 throws BenchmarkReportFileAccessException {
0808 if (fileToBeDeleted.isFile()) {
0809 if (!fileToBeDeleted.delete()) {
0810 throw new BenchmarkReportFileAccessException(
0811 "Could not delete " + fileToBeDeleted.getAbsolutePath());
0812 }
0813 }
0814 }
0815
0816 /**
0817 * Provides the functionality to separate out pipeline specific benchmark
0818 * entries in separate temporary benchmark files in a temporary folder in the
0819 * current working directory.
0820 *
0821 * @param benchmarkFile
0822 * An object of type File representing the input benchmark file.
0823 * @param report
0824 * A file handle to the report file to be written.
0825 * @throws BenchmarkReportFileAccessException
0826 * if any error occurs while accessing the input benchmark file or
0827 * while splitting it.
0828 * @throws BenchmarkReportExecutionException
0829 * if the given input benchmark file is modified while generating
0830 * the report.
0831 */
0832 private void splitBenchmarkFile(File benchmarkFile, File report)
0833 throws BenchmarkReportFileAccessException,
0834 BenchmarkReportInputFileFormatException {
0835 File dir = temporaryDirectory;
0836 // Folder already exists; then delete all files in the temporary folder
0837 if (dir.isDirectory()) {
0838 File files[] = dir.listFiles();
0839 for (int count = 0; count < files.length; count++) {
0840 if (!files[count].delete()) {
0841 throw new BenchmarkReportFileAccessException(
0842 "Could not delete files in the folder \"" +
0843 temporaryDirectory + "\"");
0844 }
0845 }
0846 } else if (!dir.mkdir()) {
0847 throw new BenchmarkReportFileAccessException(
0848 "Could not create temporary folder \"" + temporaryDirectory + "\"");
0849 }
0850
0851 // delete report2 from the filesystem
0852 if (getPrintMedia().equalsIgnoreCase(MEDIA_TEXT)) {
0853 deleteFile(new File(report.getAbsolutePath() + ".txt"));
0854 } else if (getPrintMedia().equalsIgnoreCase(MEDIA_HTML)) {
0855 deleteFile(new File(report.getAbsolutePath() + ".html"));
0856 }
0857
0858 RandomAccessFile in = null;
0859 BufferedWriter out = null;
0860 try {
0861 String logEntry = "";
0862 long fromPos = 0;
0863
0864 // File benchmarkFileName;
0865 if (getLogicalStart() != null) {
0866 fromPos = tail(benchmarkFile, FILE_CHUNK_SIZE);
0867 }
0868 in = new RandomAccessFile(benchmarkFile, "r");
0869
0870 if (getLogicalStart() != null) {
0871 in.seek(fromPos);
0872 }
0873 ArrayList<String> startTokens = new ArrayList<String>();
0874 String lastStart = "";
0875 Pattern pattern = Pattern.compile("(\\d+) (\\d+) (.*) (.*) \\{(.*)\\}");
0876 Matcher matcher = null;
0877 File benchmarkFileName = null;
0878 while ((logEntry = in.readLine()) != null) {
0879 matcher = pattern.matcher(logEntry);
0880 String startToken = "";
0881 if (logEntry.matches(".*START.*")) {
0882 String[] splittedStartEntry = logEntry.split("\\s");
0883 if (splittedStartEntry.length > 2) {
0884 startToken = splittedStartEntry[2];
0885 } else {
0886 throw new BenchmarkReportInputFileFormatException(
0887 getBenchmarkFile() + " is invalid.");
0888 }
0889
0890 if (startToken.endsWith("Start")) {
0891 continue;
0892 }
0893 if (!startTokens.contains(startToken)) {
0894 // create a new file for the new pipeline
0895 startTokens.add(startToken);
0896 benchmarkFileName = new File(
0897 temporaryDirectory, startToken + "_benchmark.txt");
0898 if (!benchmarkFileName.createNewFile()) {
0899 throw new BenchmarkReportFileAccessException(
0900 "Could not create \"" + startToken + "_benchmark.txt"
0901 + "\" in directory named \"" + temporaryDirectory + "\"");
0902 }
0903 out = new BufferedWriter(new FileWriter(benchmarkFileName));
0904 out.write(logEntry);
0905 out.newLine();
0906 }
0907 }
0908 // if a valid benchmark entry then write it to the pipeline specific
0909 // file
0910 if (matcher != null
0911 && matcher.matches()
0912 && (validateLogEntry(matcher.group(3), startTokens) || logEntry
0913 .matches(".*documentLoaded.*"))) {
0914 startToken = matcher.group(3).split("\\.")[0];
0915 if (!(lastStart.equals(startToken))) {
0916 if (out != null) { out.close(); }
0917 benchmarkFileName = new File(
0918 temporaryDirectory, startToken + "_benchmark.txt");
0919 out = new BufferedWriter(new FileWriter(benchmarkFileName, true));
0920 }
0921 if (out != null) {
0922 out.write(logEntry);
0923 out.newLine();
0924 }
0925 lastStart = startToken;
0926 }
0927 }
0928
0929 } catch (IOException e) {
0930 e.printStackTrace();
0931
0932 } finally {
0933 try {
0934 if (in != null) { in.close(); }
0935 if (out != null) { out.close(); }
0936 } catch (IOException e) {
0937 e.printStackTrace();
0938 }
0939 }
0940 }
0941
0942 /**
0943 * A method for reading the file upside down.
0944 *
0945 * @param fileToBeRead
0946 * An object of the file to be read.
0947 * @param chunkSize
0948 * An integer specifying the size of the chunks in which file will be
0949 * read.
0950 * @return A long value pointing to the start position of the given file
0951 * chunk.
0952 */
0953 private long tail(File fileToBeRead, int chunkSize)
0954 throws BenchmarkReportInputFileFormatException {
0955 try {
0956 RandomAccessFile raf = new RandomAccessFile(fileToBeRead, "r");
0957 Vector<String> lastNlines = new Vector<String>();
0958 int delta = 0;
0959 long curPos = 0;
0960 curPos = raf.length() - 1;
0961 long fromPos;
0962 byte[] bytearray;
0963 while (true) {
0964 fromPos = curPos - chunkSize;
0965 if (fromPos <= 0) {
0966 raf.seek(0);
0967 bytearray = new byte[(int) curPos];
0968 raf.readFully(bytearray);
0969 if (parseLinesFromLast(bytearray, lastNlines, fromPos)) {
0970 if (fromPos < 0)
0971 fromPos = 0;
0972 }
0973 break;
0974 } else {
0975 raf.seek(fromPos);
0976 bytearray = new byte[chunkSize];
0977 raf.readFully(bytearray);
0978 if (parseLinesFromLast(bytearray, lastNlines, fromPos)) {
0979 break;
0980 }
0981 delta = ((String) lastNlines.get(lastNlines.size() - 1)).length();
0982 lastNlines.remove(lastNlines.size() - 1);
0983 curPos = fromPos + delta;
0984 }
0985 }
0986 if (fromPos < 0)
0987 throw new BenchmarkReportInputFileFormatException(getBenchmarkFile()
0988 + " does not contain a marker named "
0989 + getLogicalStart()
0990 + " indicating logical start of a run.");
0991 return fromPos;
0992
0993 } catch (IOException e) {
0994 e.printStackTrace();
0995 return -1;
0996 }
0997 }
0998
0999 /**
1000 * A method to ensure that the required line is read from the given file part.
1001 *
1002 * @param bytearray
1003 * A part of a file being read upside down.
1004 * @param lastNlines
1005 * A vector containing the lines extracted from file part.
1006 * @param fromPos
1007 * A long value indicating the start of a file part.
1008 *
1009 * @return true if marker indicating the logical start of run is found; false
1010 * otherwise.
1011 */
1012 private boolean parseLinesFromLast(byte[] bytearray,
1013 Vector<String> lastNlines, long fromPos) {
1014 String lastNChars = new String(bytearray);
1015 StringBuffer sb = new StringBuffer(lastNChars);
1016 lastNChars = sb.reverse().toString();
1017 StringTokenizer tokens = new StringTokenizer(lastNChars, NL);
1018 while (tokens.hasMoreTokens()) {
1019 StringBuffer sbLine = new StringBuffer(tokens.nextToken());
1020 lastNlines.add(sbLine.reverse().toString());
1021 if ((lastNlines.get(lastNlines.size() - 1))
1022 .trim().endsWith(getLogicalStart())) {
1023 return true;
1024 }
1025 }
1026 return false;
1027 }
1028
1029 /**
1030 * Display a usage message
1031 */
1032 public static void usage() {
1033 System.out.println(
1034 "Usage: java gate.util.reporting.DocTimeReporter [Options]" + NL
1035 + "\t Options:" + NL
1036 + "\t -i input file path (default: benchmark.txt in the execution directory)" + NL
1037 + "\t -m print media - html/text (default: html)" + NL
1038 + "\t -d number of docs, use -1 for all docs (default: 10 docs)" + NL
1039 + "\t -p processing resource name to be matched (default: all_prs)" + NL
1040 + "\t -o output file path (default: report.html/txt in the system temporary directory)" + NL
1041 + "\t -l logical start (not set by default)" + NL
1042 + "\t -h show help" + NL);
1043 } // usage()
1044
1045 /**
1046 * A main method which acts as a entry point while executing a report via
1047 * command line
1048 *
1049 * @param args
1050 * A string array containing the command line arguments.
1051 * @throws BenchmarkReportExecutionException
1052 * if a given input file is modified while generating the report.
1053 */
1054 public static void main(String[] args)
1055 throws BenchmarkReportInputFileFormatException,
1056 BenchmarkReportFileAccessException {
1057 // process command-line options
1058 DocTimeReporter reportTwo = new DocTimeReporter(args);
1059 reportTwo.generateReport();
1060 }
1061
1062 /**
1063 * Calls store, calculate and printReport for generating the actual report.
1064 */
1065 private void generateReport() throws BenchmarkReportInputFileFormatException,
1066 BenchmarkReportFileAccessException {
1067 Timer timer = null;
1068 try {
1069 TimerTask task = new FileWatcher(getBenchmarkFile()) {
1070 protected void onChange(File file) {
1071 throw new BenchmarkReportExecutionException(getBenchmarkFile()
1072 + " file has been modified while generating the report.");
1073 }
1074 };
1075 timer = new Timer();
1076 // repeat the check every second
1077 timer.schedule(task, new Date(), 1000);
1078
1079 if (reportFile == null) {
1080 reportFile = new File(System.getProperty("java.io.tmpdir"),
1081 "report." + ((printMedia.equals(MEDIA_HTML)) ? "html" : "txt"));
1082 }
1083 splitBenchmarkFile(getBenchmarkFile(), reportFile);
1084 if (validEntries == 0) {
1085 if (logicalStart != null) {
1086 throw new BenchmarkReportInputFileFormatException(
1087 "No valid log entries present in " + getBenchmarkFile() +
1088 " does not contain a marker named " + logicalStart + ".");
1089 } else {
1090 throw new BenchmarkReportInputFileFormatException(
1091 "No valid log entries present in "
1092 + getBenchmarkFile().getAbsolutePath());
1093 }
1094 }
1095 File dir = temporaryDirectory;
1096 // Folder already exists; then delete all files in the temporary folder
1097 if (dir.isDirectory()) {
1098 File files[] = dir.listFiles();
1099 for (int count = 0; count < files.length; count++) {
1100 File inFile = files[count];
1101 Object report2Container1 = store(inFile);
1102 Object report2Container2 = calculate(report2Container1);
1103 printReport(report2Container2, reportFile);
1104 }
1105 if (files.length > 0 && files[0].exists()) {
1106 if (!files[0].delete()) {
1107 System.err.println(files[0] + " was not possible to delete.");
1108 }
1109 }
1110 }
1111 } finally {
1112 if (timer != null) { timer.cancel(); }
1113 }
1114 }
1115
1116 /*
1117 * (non-Javadoc)
1118 *
1119 * @see gate.util.reporting.BenchmarkReportable#executeReport()
1120 */
1121 public void executeReport() throws BenchmarkReportInputFileFormatException,
1122 BenchmarkReportFileAccessException {
1123 generateReport();
1124 }
1125
1126 /**
1127 * Returns the marker indicating logical start of a run.
1128 *
1129 * @return logicalStart A String containing the marker indicating logical
1130 * start of a run.
1131 */
1132 public String getLogicalStart() {
1133 return logicalStart;
1134 }
1135
1136 /**
1137 * Sets optionally a string indicating the logical start of a run.
1138 *
1139 * @param logicalStart A String indicating the logical start of a run.
1140 * Useful when you you have marked different runs in
1141 * your benchmark file with this string at their start.
1142 * By default the value is null.
1143 */
1144 public void setLogicalStart(String logicalStart) {
1145 this.logicalStart = logicalStart;
1146 }
1147
1148 /**
1149 * @return benchmarkFile path to input benchmark file.
1150 * @see #setBenchmarkFile(java.io.File)
1151 */
1152 public File getBenchmarkFile() {
1153 return benchmarkFile;
1154 }
1155
1156 /**
1157 * Sets the input benchmark file from which the report is generated.
1158 * By default use the file named "benchmark.txt" from the application
1159 * execution directory.
1160 *
1161 * @param benchmarkFile Input benchmark file.
1162 */
1163 public void setBenchmarkFile(File benchmarkFile) {
1164 this.benchmarkFile = benchmarkFile;
1165 }
1166
1167 /**
1168 * @return reportFile file path where the report file is written.
1169 * @see #setReportFile(java.io.File)
1170 */
1171 public File getReportFile() {
1172 return reportFile;
1173 }
1174
1175 /**
1176 * If not set, the default is the file name "report.txt/html"
1177 * in the system temporary directory.
1178 *
1179 * @param reportFile file path to the report file to write.
1180 */
1181 public void setReportFile(File reportFile) {
1182 this.reportFile = reportFile;
1183 }
1184
1185 /**
1186 * Returns the maximum no of documents to be shown in the report.
1187 *
1188 * @return maxDocumentInReport An integer specifying the maximum no of
1189 * documents to be shown in the report.
1190 */
1191 public int getMaxDocumentInReport() {
1192 return maxDocumentInReport;
1193 }
1194
1195 /**
1196 * Maximum number of documents contained in the report.
1197 * @param maxDocumentInReport Maximum number of documents contained in
1198 * the report. Use the constant ALL_DOCS for reporting all documents.
1199 * The default is 10.
1200 */
1201 public void setMaxDocumentInReport(int maxDocumentInReport) {
1202 if (!(maxDocumentInReport > 0 || maxDocumentInReport == ALL_DOCS)) {
1203 throw new IllegalArgumentException(
1204 "Illegal argument: " + maxDocumentInReport);
1205 }
1206 this.maxDocumentInReport = maxDocumentInReport;
1207 }
1208
1209 /**
1210 * Returns the search string to be matched to PR names present in the log
1211 * entries.
1212 *
1213 * @return PRMatchingRegex A String to be matched to PR names present in the
1214 * log entries.
1215 */
1216 public String getPRMatchingRegex() {
1217 return PRMatchingRegex;
1218 }
1219
1220 /**
1221 * Search string to match PR names present in the benchmark file.
1222 *
1223 * @param matchingRegex regular expression to match PR names
1224 * present in the benchmark file. The default is MATCH_ALL_PR_REGEX.
1225 */
1226 public void setPRMatchingRegex(String matchingRegex) {
1227 PRMatchingRegex = matchingRegex;
1228 }
1229 }
1230
1231 /**
1232 * A FileWather class to check whether the file is modified or not at specified
1233 * interval.
1234 */
1235 abstract class FileWatcher extends TimerTask {
1236 private long timeStamp;
1237 private File file;
1238
1239 /**
1240 * Creates a FileWatcher on a given file.
1241 *
1242 * @param file
1243 * A handle of the file to be watched.
1244 */
1245 public FileWatcher(File file) {
1246 this.file = file;
1247 timeStamp = file.lastModified();
1248 }
1249
1250 /*
1251 * (non-Javadoc)
1252 *
1253 * @see java.util.TimerTask#run()
1254 */
1255 public final void run() {
1256 long oldTimeStamp = file.lastModified();
1257 if (timeStamp != oldTimeStamp) {
1258 cancel();
1259 onChange(file);
1260 }
1261 }
1262
1263 /**
1264 * Specifies the actions to be taken when a file is modified.
1265 *
1266 * @param file
1267 * A handle of the file to be watched.
1268 */
1269 protected abstract void onChange(File file)
1270 throws BenchmarkReportExecutionException;
1271 }
|