0001 /*
0002 * LuceneSearchThread.java
0003 *
0004 * Niraj Aswani, 19/March/07
0005 *
0006 * $Id: LuceneSearchThread.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
0007 */
0008 package gate.creole.annic.lucene;
0009
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import gate.creole.annic.Constants;
import gate.creole.annic.Pattern;
import gate.creole.annic.PatternAnnotation;
import gate.creole.annic.SearchException;
import gate.creole.annic.apache.lucene.search.Hits;
import gate.creole.annic.apache.lucene.search.Query;
0028
0029 /**
0030 * Given a boolean query, it is translated into one or more AND
0031 * normalized queries. For example: (A|B)C is translated into AC and BC.
0032 * For each such query an instance of LuceneSearchThread is created.
0033 * Here, each query is issued separately and results are submitted to
0034 * main instance of LuceneSearch.
0035 *
0036 * @author niraj
0037 */
0038 public class LuceneSearchThread {
0039
0040 /**
0041 * Debug variable
0042 */
0043 private static boolean DEBUG = false;
0044
0045 /**
0046 * Number of base token annotations to be used in context.
0047 */
0048 private int contextWindow;
0049
0050 /**
0051 * The location of index.
0052 */
0053 private String indexLocation;
0054
0055 /**
0056 * Instance of a QueryParser.
0057 */
0058 private QueryParser queryParser;
0059
0060 /**
0061 * BaseTokenAnnotationType.
0062 */
0063 private String baseTokenAnnotationType;
0064
0065 /**
0066 * Instance of the LuceneSearcher.
0067 */
0068 private LuceneSearcher luceneSearcher;
0069
0070 /**
0071 * Indicates if searching process is finished.
0072 */
0073 public boolean finished = false;
0074
0075 /**
0076 * Index of the serializedFileID we are currently searching for.
0077 */
0078 private int serializedFileIDIndex = 0;
0079
0080 /**
0081 * QueryItemIndex
0082 */
0083 private int queryItemIndex = 0;
0084
0085 /**
0086 * List of serialized Files IDs retrieved from the lucene index
0087 */
0088 private List<String> serializedFilesIDsList = new ArrayList<String>();
0089
0090 /**
0091 * A Map that holds information about search results.
0092 */
0093 private Map<String, List<QueryItem>> searchResultInfoMap = new HashMap<String, List<QueryItem>>();
0094
0095 /**
0096 * First term position index.
0097 */
0098 private int ftpIndex = 0;
0099
0100 /**
0101 * Indicates if the query was success.
0102 */
0103 private boolean success = false;
0104
0105 /**
0106 * Indicates if we've reached the end of search results.
0107 */
0108 private boolean fwdIterationEnded = false;
0109
0110 /**
0111 * We keep track of what was the last ID of the serialized File that we visited. This is
0112 * used for optimization reasons
0113 */
0114 private String serializedFileIDInUse = null;
0115
0116 /**
0117 * This is where we store the tokenStreamInUse
0118 */
0119 private List<gate.creole.annic.apache.lucene.analysis.Token> tokenStreamInUse = null;
0120
0121 /**
0122 * Query
0123 */
0124 private String query = null;
0125
0126 /**
0127 * Given a file name, it replaces the all invalid characters with '_'.
0128 *
0129 * @param name
0130 * @return
0131 */
0132 private String getCompatibleName(String name) {
0133 return name.replaceAll("[\\/:\\*\\?\"<>|]", "_");
0134 }
0135
0136 /**
0137 * This method collects the necessary information from lucene and uses
0138 * it when the next method is called
0139 *
0140 * @param limit limit indicates the number of patterns to retrieve
0141 * @param query query supplied by the user
0142 * @param patternWindow number of tokens to refer on left and right
0143 * context
0144 * @param indexLocation location of the index the searcher should
0145 * search in
0146 * @param luceneSearcher an instance of lucene search from where the
0147 * instance of SearchThread is invoked
0148 * @return true iff search was successful false otherwise
0149 */
0150 public boolean search(String query, int patternWindow, String indexLocation,
0151 String corpusToSearchIn, String annotationSetToSearchIn,
0152 LuceneSearcher luceneSearcher) throws SearchException {
0153
0154 this.query = query;
0155 this.contextWindow = patternWindow;
0156 this.indexLocation = indexLocation;
0157 this.queryParser = new QueryParser();
0158 this.luceneSearcher = luceneSearcher;
0159
0160 /*
0161 * reset all parameters that keep track of where we are in our
0162 * searching. These parameters are used mostly to keep track of
0163 * where to start fetching the next results from
0164 */
0165 searchResultInfoMap = new HashMap<String, List<QueryItem>>();
0166 serializedFileIDIndex = 0;
0167 queryItemIndex = 0;
0168 serializedFilesIDsList = new ArrayList<String>();
0169 ftpIndex = -1;
0170 success = false;
0171 fwdIterationEnded = false;
0172
0173 try {
0174 // first find out the location of Index
0175 String temp = "";
0176 for(int i = 0; i < indexLocation.length(); i++) {
0177 if(indexLocation.charAt(i) == '\\') {
0178 temp += "/";
0179 }
0180 else {
0181 temp += indexLocation.charAt(i);
0182 }
0183 }
0184 indexLocation = temp;
0185
0186 /*
0187 * for each different location there can be different
0188 * baseTokenAnnotationType each index will have their index
0189 * Definition file stored under the index directory so first see
0190 * if given location is a valid directory
0191 */
0192 File locationFile = new File(indexLocation);
0193 if(!locationFile.isDirectory()) {
0194 System.out.println("Skipping the invalid Index Location :"
0195 + indexLocation);
0196 return false;
0197 }
0198
0199 if(!indexLocation.endsWith("/")) {
0200 indexLocation += "/";
0201 }
0202
0203 // otherwise let us read the index definition file
0204 locationFile = new File(indexLocation + "LuceneIndexDefinition.xml");
0205
0206 // check if this file is available
0207 if(!locationFile.exists()) {
0208 System.out
0209 .println("Index Definition file not found - Skipping the invalid Index Location :"
0210 + indexLocation + "LuceneIndexDefinition.xml");
0211 return false;
0212 }
0213
0214 java.io.FileReader fileReader = new java.io.FileReader(indexLocation
0215 + "LuceneIndexDefinition.xml");
0216
0217 HashMap indexInformation = null;
0218 try {
0219 // other wise read this file
0220 com.thoughtworks.xstream.XStream xstream = new com.thoughtworks.xstream.XStream(
0221 new com.thoughtworks.xstream.io.xml.StaxDriver());
0222
0223 // Saving was accomplished by using XML serialization of the map.
0224 indexInformation = (HashMap)xstream.fromXML(fileReader);
0225 }
0226 finally {
0227 fileReader.close();
0228 }
0229
0230 // find out if the current index was indexed by annicIndexPR
0231 String indexedWithANNICIndexPR = (String)indexInformation
0232 .get(Constants.CORPUS_INDEX_FEATURE);
0233
0234 if(indexedWithANNICIndexPR == null
0235 || !indexedWithANNICIndexPR
0236 .equals(Constants.CORPUS_INDEX_FEATURE_VALUE)) {
0237 System.out
0238 .println("This corpus was not indexed by Annic Index PR - Skipping the invalid Index");
0239 return false;
0240 }
0241
0242 // find out the baseTokenAnnotationType name
0243 baseTokenAnnotationType = ((String)indexInformation
0244 .get(Constants.BASE_TOKEN_ANNOTATION_TYPE)).trim();
0245
0246 int separatorIndex = baseTokenAnnotationType.lastIndexOf('.');
0247 if(separatorIndex >= 0) {
0248 baseTokenAnnotationType = baseTokenAnnotationType
0249 .substring(separatorIndex + 1);
0250 }
0251
0252 // create various Queries from the user's query
0253 Query[] luceneQueries = queryParser.parse("contents", query,
0254 baseTokenAnnotationType, corpusToSearchIn,
0255 annotationSetToSearchIn);
0256 if(queryParser.needValidation()) {
0257 if(DEBUG) System.out.println("Validation enabled!");
0258 }
0259 else {
0260 if(DEBUG) System.out.println("Validation disabled!");
0261 }
0262
0263 // create an instance of Index Searcher
0264 LuceneIndexSearcher searcher = new LuceneIndexSearcher(indexLocation);
0265
0266 try {
0267 // we need to iterate through one query at a time
0268 for(int luceneQueryIndex = 0; luceneQueryIndex < luceneQueries.length; luceneQueryIndex++) {
0269
0270 /*
0271 * this call reinitializes the first Term positions arraylists
0272 * which are being used to store the results
0273 */
0274 searcher.initializeTermPositions();
0275
0276 /*
0277 * and now execute the query result of which will be stored in
0278 * hits
0279 */
0280 Hits hits = searcher.search(luceneQueries[luceneQueryIndex]);
0281
0282 /*
0283 * and so now find out the positions of the first terms in the
0284 * returned results. first term position is the position of the
0285 * first term in the found pattern
0286 */
0287 ArrayList[] firstTermPositions = searcher.getFirstTermPositions();
0288 // if no result available, set null to our scores
0289 if(firstTermPositions[0].size() == 0) {
0290 // do nothing
0291 continue;
0292 }
0293
0294
0295
0296 // iterate through each result and collect necessary
0297 // information
0298 for(int hitIndex = 0; hitIndex < hits.length(); hitIndex++) {
0299 int index = firstTermPositions[0].indexOf(new Integer(hits
0300 .id(hitIndex)));
0301
0302 // we fetch all the first term positions for the query
0303 // issued
0304 ArrayList ftp = (ArrayList)firstTermPositions[1].get(index);
0305
0306 /*
0307 * pattern length (in terms of total number of annotations
0308 * following one other)
0309 */
0310 int patLen = ((Integer)firstTermPositions[2].get(index)).intValue();
0311
0312 /*
0313 * and the type of query (if it has only one annotation in it,
0314 * or multiple terms following them)
0315 */
0316 int qType = ((Integer)firstTermPositions[3].get(index)).intValue();
0317
0318 // find out the documentID
0319 String serializedFileID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE);
0320 QueryItem queryItem = new QueryItem();
0321 queryItem.annotationSetName = hits.doc(hitIndex).get(
0322 Constants.ANNOTATION_SET_ID).intern();
0323 queryItem.id = hits.id(hitIndex);
0324 queryItem.documentID = hits.doc(hitIndex).get(Constants.DOCUMENT_ID).intern();
0325 queryItem.ftp = ftp;
0326 queryItem.patLen = patLen;
0327 queryItem.qType = qType;
0328 queryItem.query = luceneQueries[luceneQueryIndex];
0329 queryItem.queryString = queryParser.getQueryString(luceneQueryIndex).intern();
0330
0331 /*
0332 * all these information go in the top level arrayList. we
0333 * create separate arrayList for each individual document
0334 * where each element in the arrayList provides information
0335 * about different query issued over it
0336 */
0337 List<QueryItem> queryItemsList = searchResultInfoMap.get(serializedFileID);
0338 if(queryItemsList == null) {
0339 queryItemsList = new ArrayList<QueryItem>();
0340 queryItemsList.add(queryItem);
0341 searchResultInfoMap.put(serializedFileID, queryItemsList);
0342 serializedFilesIDsList.add(serializedFileID);
0343 }
0344 else {
0345 // // before inserting we check if it is already added
0346 // if(!doesAlreadyExist(queryItem, queryItemsList)) {
0347 queryItemsList.add(queryItem);
0348 // }
0349 }
0350 }
0351 }
0352 }
0353 finally {
0354 searcher.close();
0355 }
0356 // if any result possible, return true
0357 if(searchResultInfoMap.size() > 0)
0358 success = true;
0359 else success = false;
0360 }
0361 catch(Exception e) {
0362 throw new SearchException(e);
0363 }
0364
0365 return success;
0366 }
0367
0368 /**
0369 * First term positions.
0370 */
0371 private List ftp;
0372
0373 /**
0374 * This method returns a list containing instances of Pattern
0375 *
0376 * @param numberOfResults the number of results to fetch
0377 * @return a list of QueryResult
0378 * @throws Exception
0379 */
0380 public List<Pattern> next(int numberOfResults) throws Exception {
0381
0382 /*
0383 * We check here, if there were no results found, we return null
0384 */
0385 if(!success) {
0386 return null;
0387 }
0388
0389 if(fwdIterationEnded) {
0390 return null;
0391 }
0392
0393 int noOfResultsToFetch = numberOfResults;
0394 List<Pattern> toReturn = new ArrayList<Pattern>();
0395
0396 // iterator over one document ID
0397 for(; serializedFileIDIndex < serializedFilesIDsList.size(); serializedFileIDIndex++, queryItemIndex = 0, this.ftp = null) {
0398
0399 // deal with one document at a time
0400 String serializedFileID = serializedFilesIDsList.get(serializedFileIDIndex);
0401
0402 // obtain the information about all queries
0403 List<QueryItem> queryItemsList = searchResultInfoMap.get(serializedFileID);
0404 if(queryItemsList.isEmpty()) continue;
0405 String folder = queryItemsList.get(0).documentID.intern();
0406
0407 if(serializedFileIDInUse == null || !serializedFileIDInUse.equals(serializedFileID)
0408 || tokenStreamInUse == null) {
0409 serializedFileIDInUse = serializedFileID;
0410 try {
0411 // this is the first and last time we want this tokenStream
0412 // to hold information about the current document
0413 tokenStreamInUse = getTokenStreamFromDisk(indexLocation,getCompatibleName(folder),
0414 getCompatibleName(serializedFileID));
0415 }
0416 catch(Exception e) {
0417 continue;
0418 }
0419 }
0420
0421 // deal with one query at a time
0422 for(; queryItemIndex < queryItemsList.size(); queryItemIndex++, ftpIndex = -1, this.ftp = null) {
0423 QueryItem queryItem = (QueryItem)queryItemsList.get(queryItemIndex);
0424
0425 /*
0426 * we've found the tokenStream and now we need to convert it
0427 * into the format we had at the time of creating index.. the
0428 * method getTokenStream(...) returns an array of arraylists
0429 * where the first object is GateAnnotations of that pattern
0430 * only second object is the position of the first token of the
0431 * actual pattern third object is the lenght of the actual
0432 * pattern
0433 */
0434 int qType = queryItem.qType;
0435 int patLen = queryItem.patLen;
0436 if(this.ftp == null) {
0437 this.ftp = queryItem.ftp;
0438 }
0439 else {
0440 qType = 1;
0441 patLen = 1;
0442 }
0443 PatternResult patternResult = getPatternResult(tokenStreamInUse,
0444 queryItem.annotationSetName, patLen, qType, contextWindow,
0445 queryItem.queryString, baseTokenAnnotationType,
0446 noOfResultsToFetch);
0447
0448 /*
0449 * if none of the found patterns is valid continue with the next
0450 * query
0451 */
0452 if(patternResult == null || patternResult.numberOfPatterns == 0)
0453 continue;
0454
0455 /*
0456 * We've found some patterns so give its effect to
0457 * noOfResultsToFetch
0458 */
0459 if(noOfResultsToFetch != -1)
0460 noOfResultsToFetch -= patternResult.numberOfPatterns;
0461
0462 List<Pattern> annicPatterns = createAnnicPatterns(new LuceneQueryResult(
0463 removeUnitNumber(serializedFileID), patternResult.annotationSetName,
0464 patternResult.firstTermPositions, patternResult.patternLegths,
0465 queryItem.qType, patternResult.gateAnnotations,
0466 queryItem.queryString));
0467 toReturn.addAll(annicPatterns);
0468
0469 /*
0470 * If noOfResultsToFetch is 0, it means the search should
0471 * terminate unless and otherwise user has asked to return all
0472 * (-1)
0473 */
0474 if(numberOfResults != -1 && noOfResultsToFetch == 0) {
0475 return toReturn;
0476 }
0477 }
0478 }
0479
0480 /*
0481 * if we are out of the loop set success to false such that this
0482 * thread is closed
0483 */
0484 fwdIterationEnded = true;
0485 return toReturn;
0486 }
0487
0488 /**
0489 * Given an object of luceneQueryResult this method for each found
0490 * pattern, converts it into the annic pattern. In other words, for
0491 * each pattern it collects the information such as annotations in
0492 * context and so on.
0493 *
0494 * @param aResult
0495 * @return
0496 */
0497 private List<Pattern> createAnnicPatterns(LuceneQueryResult aResult) {
0498 // get the result from search engine
0499 List<Pattern> annicPatterns = new ArrayList<Pattern>();
0500 List firstTermPositions = aResult.getFirstTermPositions();
0501 if(firstTermPositions != null && firstTermPositions.size() > 0) {
0502 List<Integer> patternLength = aResult.patternLength();
0503 // locate Pattern
0504 List<Pattern> pats = locatePatterns((String)aResult.getDocumentID(),
0505 aResult.getAnnotationSetName(), aResult.getGateAnnotations(),
0506 firstTermPositions, patternLength, aResult.getQuery());
0507 if(pats != null) {
0508 annicPatterns.addAll(pats);
0509 }
0510 }
0511 return annicPatterns;
0512 }
0513
0514 /**
0515 * Locates the valid patterns in token stream and discards the invalid
0516 * first term positions returned by the lucene searcher.
0517 *
0518 * @param docID
0519 * @param gateAnnotations
0520 * @param firstTermPositions
0521 * @param patternLength
0522 * @param queryString
0523 * @return
0524 */
0525 private List<Pattern> locatePatterns(String docID, String annotationSetName,
0526 List<List<PatternAnnotation>> gateAnnotations,
0527 List firstTermPositions, List<Integer> patternLength,
0528 String queryString) {
0529
0530 // patterns
0531 List<Pattern> pats = new ArrayList<Pattern>();
0532 outer: for(int i = 0; i < gateAnnotations.size(); i++) {
0533
0534 // each element in the tokens stream is a pattern
0535 List<PatternAnnotation> annotations = gateAnnotations.get(i);
0536 if(annotations.size() == 0) {
0537 continue;
0538 }
0539 // from this annotations we need to create a text string
0540 // so lets find out the smallest and the highest offsets
0541 int smallest = Integer.MAX_VALUE;
0542 int highest = -1;
0543 for(int j = 0; j < annotations.size(); j++) {
0544 // each annotation is an instance of GateAnnotation
0545 PatternAnnotation ga = annotations.get(j);
0546 if(ga.getStartOffset() < smallest) {
0547 smallest = ga.getStartOffset();
0548 }
0549
0550 if(ga.getEndOffset() > highest) {
0551 highest = ga.getEndOffset();
0552 }
0553 }
0554
0555 // we have smallest and highest offsets
0556 char[] patternText = new char[highest - smallest];
0557
0558 for(int j = 0; j < patternText.length; j++) {
0559 patternText[j] = ' ';
0560 }
0561
0562 // and now place the text
0563 for(int j = 0; j < annotations.size(); j++) {
0564 // each annotation is an instance of GateAnnotation
0565 PatternAnnotation ga = annotations.get(j);
0566 if(ga.getText() == null) {
0567 // this is to avoid annotations such as split
0568 continue;
0569 }
0570
0571 for(int k = ga.getStartOffset() - smallest, m = 0; m < ga.getText()
0572 .length()
0573 && k < patternText.length; m++, k++) {
0574 patternText[k] = ga.getText().charAt(m);
0575 }
0576
0577 // we will initiate the annotTypes as well
0578 if(luceneSearcher.annotationTypesMap.keySet().contains(ga.getType())) {
0579 List<String> aFeatures = luceneSearcher.annotationTypesMap.get(ga
0580 .getType());
0581 Map<String, String> features = ga.getFeatures();
0582 if(features != null) {
0583 Iterator<String> fSet = features.keySet().iterator();
0584 while(fSet.hasNext()) {
0585 String feature = fSet.next();
0586 if(!aFeatures.contains(feature)) {
0587 aFeatures.add(feature);
0588 }
0589 }
0590 }
0591 luceneSearcher.annotationTypesMap.put(ga.getType(), aFeatures);
0592 }
0593 else {
0594 Map<String, String> features = ga.getFeatures();
0595 List<String> aFeatures = new ArrayList<String>();
0596 aFeatures.add("All");
0597 if(features != null) {
0598 aFeatures.addAll(features.keySet());
0599 }
0600 luceneSearcher.annotationTypesMap.put(ga.getType(), aFeatures);
0601 }
0602 // end of initializing annotationTypes for the comboBox
0603 }
0604
0605 // we have the text
0606 // smallest is the textStOffset
0607 // highest is the textEndOffset
0608 // how to find the patternStartOffset
0609 int stPos = ((Integer)firstTermPositions.get(i)).intValue();
0610 int endOffset = ((Integer)patternLength.get(i)).intValue();
0611 int patStart = Integer.MAX_VALUE;
0612
0613 for(int j = 0; j < annotations.size(); j++) {
0614 // each annotation is an instance of GateAnnotation
0615 PatternAnnotation ga = annotations.get(j);
0616 if(ga.getPosition() == stPos) {
0617 if(ga.getStartOffset() < patStart) {
0618 patStart = ga.getStartOffset();
0619 }
0620 }
0621 }
0622
0623 if(patStart == Integer.MAX_VALUE) {
0624 continue;
0625 }
0626
0627 if(patStart < smallest || endOffset > highest) {
0628 continue;
0629 }
0630
0631 // now create the pattern for this
0632 Pattern ap = new Pattern(docID, annotationSetName,
0633 new String(patternText), patStart, endOffset, smallest, highest,
0634 annotations, queryString);
0635 pats.add(ap);
0636 }
0637 return pats;
0638 }
0639
0640 /**
0641 * Each index unit is first converted into a separate lucene document.
0642 * And a new ID with documentName and a unit number is assined to it.
0643 * But when we return results, we take the unit number out.
0644 *
0645 * @param documentID
0646 * @return
0647 */
0648 private String removeUnitNumber(String documentID) {
0649 int index = documentID.lastIndexOf("-");
0650 if(index == -1) return documentID;
0651 return documentID.substring(0, index);
0652 }
0653
0654 /**
0655 * This method looks on the disk to find the tokenStream
0656 *
0657 * @param location String
0658 * @throws Exception
0659 * @return ArrayList
0660 */
0661 private List<gate.creole.annic.apache.lucene.analysis.Token> getTokenStreamFromDisk(
0662 String indexDirectory, String documentFolder, String documentID) throws Exception {
0663 if(indexDirectory.startsWith("file:/"))
0664 indexDirectory = indexDirectory.substring(6, indexDirectory.length());
0665
0666 // use buffering
0667 File folder = new File(indexDirectory, Constants.SERIALIZED_FOLDER_NAME);
0668 folder = new File(folder, documentFolder);
0669 File fileToLoad = new File(folder, documentID + ".annic");
0670 InputStream file = new FileInputStream(fileToLoad);
0671 InputStream buffer = new BufferedInputStream(file);
0672 ObjectInput input = new ObjectInputStream(buffer);
0673
0674 // deserialize the List
0675 List<gate.creole.annic.apache.lucene.analysis.Token> recoveredTokenStream =
0676 (List<gate.creole.annic.apache.lucene.analysis.Token>)input.readObject();
0677 if(input != null) {
0678 // close "input" and its underlying streams
0679 input.close();
0680 }
0681 return recoveredTokenStream;
0682 }
0683
0684 /**
0685 * this method takes the tokenStream as a text, the first term
0686 * positions, pattern length, queryType and patternWindow and returns
0687 * the GateAnnotations as an array for each pattern with left and
0688 * right context
0689 *
0690 * @param subTokens
0691 * @param ftp
0692 * @param patLen
0693 * @param qType
0694 * @param patWindow
0695 * @param query
0696 * @param baseTokenAnnotationType
0697 * @return
0698 */
0699 private PatternResult getPatternResult(
0700 List<gate.creole.annic.apache.lucene.analysis.Token> subTokens,
0701 String annotationSetName, int patLen, int qType, int patWindow,
0702 String query, String baseTokenAnnotationType,
0703 int numberOfResultsToFetch) {
0704
0705 /*
0706 * ok so we first see what kind of query is that two possibilities
0707 * (Phrase query or Term query) Term query is what contains only one
0708 * word to seach and Phrase query contains more than one word 1
0709 * indicates the PhraseQuery
0710 */
0711 if(qType == 1) {
0712 return getPatternResult(subTokens, annotationSetName, patLen, patWindow,
0713 query, baseTokenAnnotationType, numberOfResultsToFetch);
0714 }
0715 else {
0716 /*
0717 * where the query is Term. In term query it is possible that user
0718 * is searching for the particular annotation type (say: "Token"
0719 * or may be for text (say: "Hello") query parser converts the
0720 * annotation type query into Token == "*" and the latter to
0721 * Token.string == "Hello"
0722 */
0723
0724 /*
0725 * the first element is text. the second element is type
0726 */
0727 String annotText = (String)ftp.get(0);
0728 String annotType = (String)ftp.get(1);
0729
0730 // so here we search through subTokens and find out the positions
0731 List<Integer> positions = new ArrayList<Integer>();
0732 for(int j = 0; j < subTokens.size(); j++) {
0733 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(j);
0734 String type = token.termText();
0735 String text = token.type();
0736
0737 // if annotType == "*", the query was {AnnotType}
0738 if(annotType.equals("*")) {
0739 if(type.equals(annotText) && annotType.equals(text)) {
0740 positions.add(new Integer(token.getPosition()));
0741 }
0742 }
0743 // the query is Token == "string"
0744 else {
0745 if(annotText.equals(type) && annotType.equals(text)) {
0746 positions.add(new Integer(token.getPosition()));
0747 }
0748 }
0749 }
0750
0751 this.ftp = positions;
0752 // we have positions here
0753 return getPatternResult(subTokens, annotationSetName, 1, patWindow,
0754 query, baseTokenAnnotationType, numberOfResultsToFetch);
0755 }
0756 }
0757
0758 /**
0759 * This method returns the valid patterns back and the respective
0760 * GateAnnotations
0761 *
0762 * @param subTokens ArrayList
0763 * @param ftp ArrayList
0764 * @param patLen int
0765 * @param patWindow int
0766 * @param query String
0767 * @return PatternResult
0768 */
0769 private PatternResult getPatternResult(
0770 List<gate.creole.annic.apache.lucene.analysis.Token> subTokens,
0771 String annotationSetName, int patLen, int patWindow, String query,
0772 String baseTokenAnnotationType, int noOfResultsToFetch) {
0773
0774 List<List<PatternAnnotation>> tokens = new ArrayList<List<PatternAnnotation>>();
0775 List<Integer> patLens = new ArrayList<Integer>();
0776 ftpIndex++;
0777
0778 // Phrase Query
0779 // consider only one pattern at a time
0780
0781 // first term position index at the begining
0782 int ftpIndexATB = ftpIndex;
0783 mainForLoop: for(; ftpIndex < ftp.size()
0784 && (noOfResultsToFetch == -1 || noOfResultsToFetch > 0); ftpIndex++) {
0785
0786 // find out the position of the first term
0787 int pos = ((Integer)ftp.get(ftpIndex)).intValue();
0788
0789 // find out the token with pos
0790 int j = 0;
0791 for(; j < subTokens.size(); j++) {
0792 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(j);
0793 if(token.getPosition() == pos) {
0794 break;
0795 }
0796 }
0797
0798 int counter = 0;
0799 int leftstart = -1;
0800 /*
0801 * ok so we need to go back to find out the first token of the
0802 * left context
0803 */
0804 int k = j - 1;
0805 for(; k >= 0; k--) {
0806 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(k);
0807 if(token.getPosition() < pos
0808 && token.termText().equals(baseTokenAnnotationType)
0809 && token.type().equals("*")) {
0810 counter++;
0811 leftstart = token.startOffset();
0812 j = k;
0813 }
0814 if(counter == patWindow) {
0815 break;
0816 }
0817 }
0818
0819 // j holds the start of the left context
0820
0821 // now we want to search for the end of left context
0822 pos--;
0823 k = j;
0824
0825 if(leftstart > -1) {
0826
0827 boolean breakNow = false;
0828 for(; k < subTokens.size(); k++) {
0829 gate.creole.annic.apache.lucene.analysis.Token token = subTokens
0830 .get(k);
0831 if(token.getPosition() == pos) {
0832 breakNow = true;
0833 }
0834 else {
0835 if(breakNow) {
0836 break;
0837 }
0838 }
0839 }
0840 }
0841 // now k holds the begining of the pattern
0842
0843 // leftEnd holds the position of the last token in left context
0844 int leftEnd = leftstart == -1 ? -1 : k - 1;
0845
0846 /*
0847 * we need to validate this pattern. As a result of query, we get
0848 * the positions of the first term. We need to locate the full
0849 * pattern along with all its other annotations. This is done by
0850 * using the ValidatePattern class. This class provides a method,
0851 * which takes as arguments the query Tokens, the position in the
0852 * tokenStream from where to start searching and returns the end
0853 * offset of the last annotation in the found pattern. We then
0854 * search for this endoffset in our current tokenStream to
0855 * retrieve the wanted annotations.
0856 */
0857 int upto = -1;
0858 int tempPos = 0;
0859 if(this.queryParser.needValidation()) {
0860
0861 try {
0862
0863 List<String> queryTokens = luceneSearcher.getQueryTokens(query);
0864 if(queryTokens == null) {
0865 queryTokens = new QueryParser().findTokens(query);
0866 luceneSearcher.addQueryTokens(query, queryTokens);
0867 }
0868
0869 /*
0870 * validate method returns the endoffset of the last token of
0871 * the middle pattern returns -1 if pattern could not be
0872 * located at that location
0873 */
0874 PatternValidator vp = new PatternValidator();
0875
0876 // here k is the position where the first token should occur
0877
0878 upto = vp.validate(queryTokens, subTokens, k, new QueryParser());
0879 if(upto == -1) {
0880 /*
0881 * if the validatePAttern class could not find the valid
0882 * pattern it returns -1 and therefore we should remove the
0883 * position of the invalid pattern
0884 */
0885 ftp.remove(ftpIndex);
0886 ftpIndex--;
0887 continue mainForLoop;
0888 }
0889 else {
0890 /*
0891 * now we need to locate the token whose endPosition is upto
0892 */
0893 int jj = leftEnd + 1;
0894 boolean breaknow = false;
0895 tempPos = subTokens.get(jj).getPosition();
0896 for(; jj < subTokens.size(); jj++) {
0897 gate.creole.annic.apache.lucene.analysis.Token token = subTokens
0898 .get(jj);
0899 if(token.endOffset() == upto) {
0900 tempPos = token.getPosition();
0901 breaknow = true;
0902 }
0903 else if(breaknow) {
0904 break;
0905 }
0906 }
0907 // we send the endoffset to our GUI class
0908 patLens.add(new Integer(upto));
0909
0910 /*
0911 * k holds the position of the first token in right context
0912 */
0913 k = jj;
0914 }
0915 }
0916 catch(Exception e) {
0917 e.printStackTrace();
0918 }
0919 }
0920 else {
0921 /*
0922 * the query contains all tokens, which is already validated at
0923 * the time of creating query the pointer k points to the
0924 * begining of our patern we need to travel patLen into the
0925 * right direction to obtain the pattern
0926 */
0927 for(counter = 0; counter < patLen && k < subTokens.size(); k++) {
0928 gate.creole.annic.apache.lucene.analysis.Token token = subTokens
0929 .get(k);
0930 if(token.termText().equals(baseTokenAnnotationType)
0931 && token.type().equals("*")) {
0932 counter++;
0933 upto = token.endOffset();
0934 tempPos = token.getPosition();
0935 }
0936 }
0937 patLens.add(new Integer(upto));
0938 k++;
0939 }
0940 int maxEndOffset = upto;
0941
0942 /*
0943 * so now search for the token with the position == tempPos + 1 in
0944 * other words search for the first term of the right context
0945 */
0946 for(; k < subTokens.size(); k++) {
0947 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(k);
0948 if(token.getPosition() == tempPos + 1) {
0949 break;
0950 }
0951 }
0952
0953 // and now we need to locate the right context pattern
0954 counter = 0;
0955 for(; k < subTokens.size(); k++) {
0956 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(k);
0957 if(token.startOffset() >= upto
0958 && token.termText().equals(baseTokenAnnotationType)
0959 && token.type().equals("*")) {
0960 counter++;
0961 maxEndOffset = token.endOffset();
0962 }
0963 if(counter == patWindow) {
0964 break;
0965 }
0966 }
0967
0968 // if there are any sub-tokens left
0969 if(k < subTokens.size()) {
0970 /*
0971 * now we would search for the position untill we see it having
0972 * the same position
0973 */
0974 tempPos = subTokens.get(k).getPosition();
0975
0976 for(; k < subTokens.size(); k++) {
0977 gate.creole.annic.apache.lucene.analysis.Token token = subTokens
0978 .get(k);
0979 if(token.getPosition() != tempPos) {
0980 break;
0981 }
0982 }
0983 }
0984
0985 if(k >= subTokens.size()) {
0986 // we used all sub-tokens - set k to maximum size
0987 k = subTokens.size() - 1;
0988 }
0989
0990 /*
0991 * so k is the position til where we need to search for each
0992 * annotation and every feature in it at the time of creating
0993 * index were converted into separate tokens we need to convert
0994 * them back into annotations
0995 */
0996 List<PatternAnnotation> patternGateAnnotations = new ArrayList<PatternAnnotation>();
0997 PatternAnnotation ga = null;
0998 for(int m = j; m <= k; m++) {
0999 gate.creole.annic.apache.lucene.analysis.Token token = subTokens.get(m);
1000 String text = token.termText();
1001 int st = token.startOffset();
1002 int end = token.endOffset();
1003 String type = token.type();
1004 int position = token.getPosition();
1005
1006 // if this is a new annotation Type
1007 if(type.equals("*")) {
1008 ga = new PatternAnnotation();
1009 ga.setType(text);
1010 ga.setStOffset(st);
1011 ga.setEnOffset(end);
1012 ga.setPosition(position);
1013 if(ga.getEndOffset() <= maxEndOffset) {
1014 patternGateAnnotations.add(ga);
1015 }
1016 continue;
1017 } else if(type.equals("**")) {
1018 continue;
1019 }
1020
1021 // and from here all are the features
1022 int index = type.indexOf(".");
1023 String feature = type.substring(index + 1, type.length());
1024 /*
1025 * we need to compare the type1 each annotation has string
1026 * feature in index so text will be definitely going to be
1027 * initialized
1028 */
1029 if(feature.equals("string")) {
1030 ga.setText(text);
1031 }
1032 ga.addFeature(feature, text);
1033 }
1034 tokens.add(patternGateAnnotations);
1035 if(noOfResultsToFetch != -1) noOfResultsToFetch--;
1036 }
1037
1038 if(noOfResultsToFetch == 0 && ftpIndex < ftp.size()) ftpIndex--;
1039
1040 // finally create an instance of PatternResult
1041 PatternResult pr = new PatternResult();
1042 pr.annotationSetName = annotationSetName;
1043 pr.gateAnnotations = tokens;
1044 pr.firstTermPositions = new ArrayList();
1045 for(int i = 0; i < pr.gateAnnotations.size(); i++) {
1046 pr.firstTermPositions.add(ftp.get(i + ftpIndexATB));
1047 }
1048 pr.patternLegths = patLens;
1049 pr.numberOfPatterns = pr.gateAnnotations.size();
1050 return pr;
1051 }
1052
1053 /**
1054 * Inner class to store pattern results.
1055 *
1056 * @author niraj
1057 */
1058 private class PatternResult {
1059 int numberOfPatterns;
1060
1061 List<List<PatternAnnotation>> gateAnnotations;
1062
1063 String annotationSetName;
1064
1065 List firstTermPositions;
1066
1067 List<Integer> patternLegths;
1068 }
1069
1070 /**
1071 * Inner class to store query Item.
1072 *
1073 * @author niraj
1074 *
1075 */
1076 private class QueryItem {
1077 float score;
1078
1079 int id;
1080
1081 String documentID;
1082
1083 List ftp;
1084
1085 int patLen;
1086
1087 int qType;
1088
1089 Query query;
1090
1091 String queryString;
1092
1093 String annotationSetName;
1094
1095 // public boolean equals(Object m) {
1096 // if(m instanceof QueryItem) {
1097 // QueryItem n = (QueryItem)m;
1098 // // no need to compare documentID as we don't compare documents with different docIDs anyway
1099 // return n.score == score && n.id == id && n.patLen == patLen
1100 // && n.qType == qType && n.ftp.size() == ftp.size()
1101 // && n.queryString.equals(queryString)
1102 // && n.annotationSetName.equals(annotationSetName)
1103 // && areTheyEqual(n.ftp, ftp, qType);
1104 // }
1105 // return false;
1106 // }
1107 }
1108
1109 /**
1110 * Checks if the QueryItem already exists.
1111 *
1112 * @param n
1113 * @param top
1114 * @return
1115 */
1116 // private boolean doesAlreadyExist(QueryItem n, List<QueryItem> top) {
1117 //
1118 // for(int i = 0; i < top.size(); i++) {
1119 // QueryItem m = top.get(i);
1120 // if(m.equals(n)) return true;
1121 // }
1122 // return false;
1123 // }
1124
1125 /**
1126 * Checks if two first term positions are identical.
1127 * @param ftp
1128 * @param ftp1
1129 * @param qType
1130 * @return
1131 */
1132 private boolean areTheyEqual(List ftp, List ftp1, int qType) {
1133 if(qType == 1) {
1134 if(ftp.size() == ftp1.size()) {
1135 for(int i = 0; i < ftp.size(); i++) {
1136 int pos = ((Integer)ftp.get(i)).intValue();
1137 int pos1 = ((Integer)ftp1.get(i)).intValue();
1138 if(pos != pos1) return false;
1139 }
1140 return true;
1141 }
1142 else {
1143 return false;
1144 }
1145 }
1146 else {
1147 String annotText = (String)ftp.get(0);
1148 String annotType = (String)ftp.get(1);
1149 String annotText1 = (String)ftp1.get(0);
1150 String annotType1 = (String)ftp1.get(1);
1151 return annotText1.equals(annotText) && annotType1.equals(annotType);
1152 }
1153 }
1154
1155 /**
1156 * Gets the query.
1157 *
1158 * @return
1159 */
1160 public String getQuery() {
1161 return query;
1162 }
1163
1164 }
|