001 /*
002 * QueryParser.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: QueryParser.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic.lucene;
009
010 import gate.creole.annic.Constants;
011 import gate.creole.annic.apache.lucene.search.*;
012 import gate.creole.annic.apache.lucene.index.*;
013 import gate.creole.ir.SearchException;
014
015 import java.util.*;
016
017 /**
018 * QueryParser parses the provided ANNIC query and converts it into the
019 * format understood by Lucene.
020 *
021 * @author niraj
022 *
023 */
024 public class QueryParser {
025
026 /**
027 * Normalized query strings returned by SubQueryParser for the most recently parsed query.
028 */
029 private ArrayList queries = new ArrayList();
030
031 /**
032 * Name of the index field used for every Term this parser creates; set by parse(...).
033 */
034 private String field = "";
035
036 /**
037 * Base token annotation type; defaults to "Token" and is overwritten by parse(...).
038 */
039 private String baseTokenAnnotationType = "Token";
040
041 /**
042 * Indicates whether the results returned by Lucene need to be validated; updated by createPhraseQuery(...).
043 */
044 private boolean needValidation = true;
045
/**
 * Creates a parser whose term position counter starts at zero.
 */
public QueryParser() {
  this.position = 0;
}
052
053 public static void main(String[] args) {
054 System.out.println(isValidQuery(args[0]));
055 }
056
057 /**
058 * Returns true if the submitted query is valid.
059 *
060 * @param query
061 * @return
062 */
063 public static boolean isValidQuery(String query) {
064 QueryParser qp = new QueryParser();
065 try {
066 qp.parse("contents", query, "Token", null, null);
067 }
068 catch(SearchException se) {
069 return false;
070 }
071 return true;
072 }
073
074 /**
075 * Given a query, this method parses it to convert it into one or more
076 * lucene queries.
077 *
078 * @param field
079 * @param query
080 * @param baseTokenAnnotationType
081 * @param corpusID
082 * @return
083 * @throws gate.creole.ir.SearchException
084 */
085 public Query[] parse(String field, String query,
086 String baseTokenAnnotationType, String corpusID, String annotationSetToSearchIn)
087 throws gate.creole.ir.SearchException {
088 this.field = field;
089 this.baseTokenAnnotationType = baseTokenAnnotationType;
090 this.position = 0;
091 // at the moment this supports only | operator
092 // it also support klene operators * and +
093 // implicit operator is &
094 // It supports simple String queries
095 // it supports eight kinds of tokens
096 // 1. String (without quotes)
097 // 2. "String" (with quotes)
098 // 3. {AnnotationType}
099 // 4. {AnnotationType==String}
100 // 5. {AnnotationType=="String"}
101 // 7. {AnnotationType.feature==string}
102 // 8. {AnnotationType.feature=="string"}
103
104 // Steps
105 // The query would we searched from left to right order
106
107 // returned arraylist contains queries where each query is required
108 // to
109 // be converted into the Phrase query
110 queries = SubQueryParser.parseQuery(query);
111 Query[] q = new Query[queries.size()];
112 for(int i = 0; i < queries.size(); i++) {
113 Query phraseQuery = createPhraseQuery((String)queries.get(i));
114 // if the corpusID is not provided we donot want to create a
115 // boolean query
116 if(corpusID == null && annotationSetToSearchIn == null) {
117 BooleanQuery booleanQuery = new BooleanQuery();
118 Term t = new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET);
119 TermQuery tQuery = new TermQuery(t);
120 booleanQuery.add(tQuery, false, true);
121 booleanQuery.add(phraseQuery, true, false);
122 q[i] = booleanQuery;
123 }
124 else {
125 BooleanQuery booleanQuery = new BooleanQuery();
126 booleanQuery.add(phraseQuery, true, false);
127 if(corpusID != null) {
128 Term t = new Term(Constants.CORPUS_ID, corpusID);
129 TermQuery tQuery = new TermQuery(t);
130 booleanQuery.add(tQuery, true, false);
131 }
132
133 if(annotationSetToSearchIn != null) {
134 Term t = new Term(Constants.ANNOTATION_SET_ID, annotationSetToSearchIn);
135 TermQuery tQuery = new TermQuery(t);
136 booleanQuery.add(tQuery, true, false);
137 } else {
138 Term t = new Term(Constants.ANNOTATION_SET_ID, Constants.COMBINED_SET);
139 TermQuery tQuery = new TermQuery(t);
140 booleanQuery.add(tQuery, false, true);
141 }
142
143
144 q[i] = booleanQuery;
145 }
146 }
147 return q;
148 }
149
150 /**
151 * When user submits an ANNIC query, one or more instances of lucene
152 * queries are created and returned. This method returns the string
153 * representation of the query at the given index.
154 *
155 * @param i
156 * @return
157 */
158 public String getQueryString(int i) {
159 return (String)queries.get(i);
160 }
161
162 /**
163 * This method will create each normalized query into a Phrase or Term
164 * query If the query has only one term to search, it will be returned
165 * as a TermQuery otherwise, it will be returned as the PhraseQuery
166 */
167 private Query createPhraseQuery(String query)
168 throws gate.creole.ir.SearchException {
169 // Here we play the actual trick with lucene
170 // For a query like {Lookup}{Token}{Person.gender=="male"}
171 // internally this query is converted into the following PhraseQuery
172 // (Lookup Token Person male)
173 // these are the four terms which will be searched and they should
174 // occur
175 // in this order only
176 // but what we need is
177 // a pattern where
178 // Lookup -> the first annotation is of type Lookup
179 // Token -> the second annotation type is Token
180 // Person male -> and the third annotation must have a type person
181 // and a
182 // feature gender with male
183 // that means Person and male should be considered at the same
184 // location
185 // By default lucene doesn't do this and look for a position that is
186 // 1
187 // step more than the previous one
188 // so it will search for the first position of Lookup
189 // let say it is 19 (i.e. 19th annotation in the document)
190 // then it would consider 20th location for Token
191 // 21st for Person
192 // 22nd for male
193 // but we need, 19th for Lookup, 20th for Token and 21st for both
194 // Person
195 // and Male
196 // so from here itself we send our choice for the Location of
197 // annotations in this termPositions array :-).
198 // isn't it a great crack?
199 position = 0;
200
201 PhraseQuery phQuery = new PhraseQuery();
202 // we will tokenize this query to convert it into different tokens
203 // query is like {Person}"said" "Hello" {Person.gender=="male"}
204 // we need to convert this into different tokens
205 // {Person}
206 // "said"
207 // "Hello"
208 // {Person.gender=="male"}
209 List<String> tokens = findTokens(query);
210
211 // and then convert each token into separate terms
212 if(tokens.size() == 1) {
213 ArrayList[] termsPos = (createTerms(tokens.get(0)));
214 ArrayList terms = termsPos[0];
215 if(terms.size() == 1) {
216 if(areAllTermsTokens)
217 needValidation = false;
218 else needValidation = true;
219 return new TermQuery((Term)terms.get(0));
220 }
221 else {
222 position = 0;
223 }
224 }
225
226 int totalTerms = 0;
227 boolean hadPreviousTermsAToken = true;
228
229 needValidation = false;
230
231 // and now for each token we need to create Term(s)
232 outer: for(int i = 0; i < tokens.size(); i++) {
233 ArrayList[] termpositions = createTerms((String)tokens.get(i));
234 ArrayList terms = termpositions[0];
235 ArrayList pos = termpositions[1];
236 ArrayList consider = termpositions[2];
237
238 boolean allTermsTokens = true;
239 // lets first find out if there's any token in this terms
240 for(int k = 0; k < terms.size(); k++) {
241 Term t = (Term)terms.get(k);
242
243 if(allTermsTokens) allTermsTokens = isBaseTokenTerm(t);
244 }
245
246 if(!hadPreviousTermsAToken) {
247 needValidation = true;
248 break;
249 }
250
251 if(!allTermsTokens) {
252 // we want to break here
253 needValidation = true;
254 if(i > 0)
255 break outer;
256 }
257
258 for(int k = 0; k < terms.size(); k++) {
259 Term t = (Term)terms.get(k);
260 boolean considerValue = ((Boolean)consider.get(k)).booleanValue();
261 phQuery.add(t, (Integer)pos.get(k), considerValue);
262 if(considerValue) totalTerms++;
263 }
264
265 hadPreviousTermsAToken = allTermsTokens;
266 }
267 phQuery.setTotalTerms(totalTerms);
268 return phQuery;
269 }
270
271 /**
272 * Returns true if the provided Term is a based token term. To be a
273 * base token term it has to satisify the following terms: 1. If its
274 * text is baseTokenAnnotationType and the type is "*" or 2. If its
275 * type = "baseTokenAnnotationType.feature"
276 *
277 * @param t
278 * @return
279 */
280 private boolean isBaseTokenTerm(Term t) {
281 // the term refers to the base token
282 // only if it satisfies the following conditions
283 // 1. If its text is baseTokenAnnotationType and the type is "*"
284 // or 2. If its type = "baseTokenAnnotationType.feature"
285
286 // condition 1
287 if(t.text().equals(baseTokenAnnotationType) && t.type().equals("*"))
288 return true;
289
290 // condition 2
291 if(t.type().startsWith(baseTokenAnnotationType + ".")) return true;
292
293 return false;
294 }
295
/**
 * Position assigned to the terms created by createTerms. It is reset to 0
 * by the constructor, by parse and by createPhraseQuery, and incremented
 * inside createTerms.
 */
296 public int position = 0;
297
298 /**
299 * Given a query this method returns tokens. Here token is an object
300 * of string.
301 *
302 * @param query
303 * @return
304 * @throws gate.creole.ir.SearchException
305 */
306 public List<String> findTokens(String query)
307 throws gate.creole.ir.SearchException {
308 List<String> tokens = new ArrayList<String>();
309 String token = "";
310 char ch = ' ';
311 char prev = ' ';
312 int balance = 0;
313 for(int i = 0; i < query.length(); i++) {
314 prev = ch;
315 ch = query.charAt(i);
316 if(isOpeneningBrace(ch, prev)) {
317 if(balance != 0) {
318 throw new SearchException("unbalanced braces",
319 "a closing brace (}) is missing before this opening brace", query, i);
320 }
321
322 if(!token.trim().equals("")) {
323 tokens.add(token.trim());
324 }
325
326 balance++;
327 token = "{";
328 continue;
329 }
330
331 if(isClosingBrace(ch, prev)) {
332 balance--;
333 if(balance != 0) {
334 throw new SearchException("unbalanced braces",
335 "an opening brace ({) is missing before this closing brace", query, i);
336 }
337
338 token += "}";
339 tokens.add(token.trim());
340 token = "";
341 continue;
342 }
343
344 token += (char)ch;
345 }
346
347 if(balance != 0) {
348 if (balance > 0) {
349 throw new SearchException("unbalanced braces",
350 "One closing brace (}) is missing in this expression", query);
351 } else {
352 throw new SearchException("unbalanced braces",
353 "One opening brace ({) is missing in this expression", query);
354 }
355 }
356
357 if(!token.trim().equals("")) tokens.add(token);
358
359 return tokens;
360 }
361
362 private boolean isOpeneningBrace(char ch, char pre) {
363 if(ch == '{' && pre != '\\')
364 return true;
365 else return false;
366 }
367
368 private boolean isClosingBrace(char ch, char pre) {
369 if(ch == '}' && pre != '\\')
370 return true;
371 else return false;
372 }
373
/**
 * Set to true at the start of createTerms and cleared there as soon as an
 * annotation type other than baseTokenAnnotationType is encountered;
 * read by createPhraseQuery for single term queries.
 */
374 boolean areAllTermsTokens = false;
375
376 private boolean isEscapeSequence(String element, int index) {
377 if(index > 0) {
378 return element.charAt(index - 1) == '\\';
379 }
380 return false;
381 }
382
383 private ArrayList splitString(String string, char with, boolean normalize) {
384 // here we want to split the string
385 // but also make sure the with character is not escaped
386 ArrayList strings = new ArrayList();
387 StringBuffer newString = new StringBuffer();
388 for(int i = 0; i < string.length(); i++) {
389 if(i == 0) {
390 newString.append(string.charAt(0));
391 continue;
392 }
393
394 if(string.charAt(i) == with) {
395 // need to check the previous character
396 if(string.charAt(i - 1) == '\\') {
397 newString.append(with);
398 continue;
399 }
400 else {
401 if(normalize)
402 strings.add(norm(newString.toString()));
403 else strings.add(newString.toString());
404
405 newString = new StringBuffer();
406 continue;
407 }
408 }
409
410 newString.append(string.charAt(i));
411 }
412 if(newString.length() > 0) {
413 if(normalize)
414 strings.add(norm(newString.toString()).trim());
415 else strings.add(newString.toString().trim());
416 }
417 return strings;
418 }
419
420 private int findIndexOf(String element, char ch) {
421 int index1 = -1;
422 int start = -1;
423 while(true) {
424 index1 = element.indexOf(ch, start);
425 if(isEscapeSequence(element, index1)) {
426 start = index1 + 1;
427 }
428 else {
429 break;
430 }
431 }
432 return index1;
433 }
434
435 private String norm(String string) {
436 StringBuffer sb = new StringBuffer();
437 for(int i = 0; i < string.length(); i++) {
438 if(string.charAt(i) == '\\') {
439 if(i + 1 <= string.length() - 1) {
440 char ch = string.charAt(i + 1);
441 if(ch == ',' || ch == '.' || ch == '(' || ch == ')' || ch == '{'
442 || ch == '}' || ch == '"')
443 ;
444 continue;
445 }
446 }
447 sb.append(string.charAt(i));
448 }
449 return sb.toString();
450 }
451
452 public ArrayList[] createTerms(String elem)
453 throws gate.creole.ir.SearchException {
454 areAllTermsTokens = true;
455 ArrayList terms = new ArrayList();
456 ArrayList pos = new ArrayList();
457 ArrayList consider = new ArrayList();
458
459 elem = elem.trim();
460 if(elem.charAt(0) == '{' && elem.charAt(elem.length() - 1) == '}') {
461 // possible
462 elem = elem.substring(1, elem.length() - 1);
463 int index = elem.indexOf("==");
464 int index1 = findIndexOf(elem, '.');
465
466 if(index == -1 && index1 == -1) {
467 // 3. {AnnotationType}
468 // this can be {AnnotationType, AnnotationType...}
469 ArrayList fields = splitString(elem, ',', true);
470
471 for(int p = 0; p < fields.size(); p++) {
472 if(areAllTermsTokens
473 && !((String)fields.get(p)).equals(baseTokenAnnotationType))
474 areAllTermsTokens = false;
475
476 terms.add(new Term(field, norm(((String)fields.get(p))), "*"));
477 pos.add(new Integer(position));
478 if(p == 0)
479 consider.add(new Boolean(true));
480 else consider.add(new Boolean(false));
481
482 }
483 position++;
484 }
485 else if(index != -1 && index1 == -1) {
486 // 4. {AnnotationType==String}
487 // 5. {AnnotationType=="String"}
488
489 ArrayList fields = splitString(elem, ',', false);
490 for(int p = 0; p < fields.size(); p++) {
491 index = ((String)fields.get(p)).indexOf("==");
492 // here this is also posible
493 // {AnnotationType, AnnotationType=="String"}
494 if(index != -1) {
495 String annotType = norm(((String)fields.get(p)).substring(0, index)
496 .trim());
497 String annotText = norm(((String)fields.get(p)).substring(
498 index + 2, ((String)fields.get(p)).length()).trim());
499 if(annotText.length() > 2 && annotText.charAt(0) == '\"'
500 && annotText.charAt(annotText.length() - 1) == '\"') {
501 annotText = annotText.substring(1, annotText.length() - 1);
502 }
503 if(!annotType.trim().equals(baseTokenAnnotationType))
504 areAllTermsTokens = false;
505
506 terms.add(new Term(field, annotText, annotType + ".string"));
507 pos.add(new Integer(position));
508 if(p == 0)
509 consider.add(new Boolean(true));
510 else consider.add(new Boolean(false));
511
512 }
513 else {
514 if(!(norm((String)fields.get(p))).equals(baseTokenAnnotationType))
515 areAllTermsTokens = false;
516
517 terms.add(new Term(field, norm(((String)fields.get(p))), "*"));
518 pos.add(new Integer(position));
519 if(p == 0)
520 consider.add(new Boolean(true));
521 else consider.add(new Boolean(false));
522 }
523 }
524
525 position++;
526
527 }
528 else if(index == -1 && index1 != -1) {
529 throw new SearchException("missing operator",
530 "an equal operator (==) is missing",
531 elem, (elem.indexOf("=", index1)!=-1)?
532 elem.indexOf("=", index1):elem.length());
533 }
534 else if(index != -1 && index1 != -1) {
535
536 // it can be {AT, AT.f==S, AT=="S"}
537 int index2 = findIndexOf(elem, ',');
538 String[] subElems = null;
539 if(index2 == -1) {
540 subElems = new String[] {elem};
541 }
542 else {
543 ArrayList list = splitString(elem, ',', false);
544 subElems = new String[list.size()];
545 for(int k = 0; k < list.size(); k++) {
546 subElems[k] = (String)list.get(k);
547 }
548 }
549
550 int lengthTravelledSoFar = 0;
551 for(int j = 0; j < subElems.length; j++) {
552 // 7. {AnnotationType.feature==string}
553 // 8. {AnnotationType.feature=="string"}
554 index = subElems[j].indexOf("==");
555 index1 = findIndexOf(subElems[j], '.');
556 if(index == -1 && index1 == -1) {
557 // this is {AT}
558 if(!norm(subElems[j].trim()).equals(baseTokenAnnotationType))
559 areAllTermsTokens = false;
560 terms.add(new Term(field, norm(subElems[j].trim()), "*"));
561 pos.add(new Integer(position));
562 if(j == 0)
563 consider.add(new Boolean(true));
564 else consider.add(new Boolean(false));
565
566 }
567 else if(index != -1 && index1 == -1) {
568 // this is {AT=="String"}
569 String annotType = norm(subElems[j].substring(0, index).trim());
570 String annotText = norm(subElems[j].substring(index + 2,
571 subElems[j].length()).trim());
572 if(annotText.charAt(0) == '\"'
573 && annotText.charAt(annotText.length() - 1) == '\"') {
574 annotText = annotText.substring(1, annotText.length() - 1);
575 }
576 if(!annotType.trim().equals(baseTokenAnnotationType))
577 areAllTermsTokens = false;
578 terms.add(new Term(field, annotText, annotType + ".string"));
579 pos.add(new Integer(position));
580 if(j == 0)
581 consider.add(new Boolean(true));
582 else consider.add(new Boolean(false));
583
584 }
585 else if(index == -1 && index1 != -1) {
586 throw new SearchException("missing operator",
587 "an equal operator (==) is missing",
588 elem, (elem.indexOf("=", lengthTravelledSoFar)!=-1)?
589 elem.indexOf("=", lengthTravelledSoFar):elem.length());
590 }
591 else {
592 // this is {AT.f == "s"}
593 String annotType = norm(subElems[j].substring(0, index1).trim());
594 String featureType = norm(subElems[j].substring(index1 + 1, index)
595 .trim());
596 String featureText = norm(subElems[j].substring(index + 2,
597 subElems[j].length()).trim());
598 if(featureText.length() > 2 && featureText.charAt(0) == '\"'
599 && featureText.charAt(featureText.length() - 1) == '\"')
600 featureText = featureText.substring(1, featureText.length() - 1);
601
602 if(!annotType.trim().equals(baseTokenAnnotationType))
603 areAllTermsTokens = false;
604 terms.add(new Term(field, featureText, annotType + "."
605 + featureType));
606 pos.add(new Integer(position));
607 if(j == 0)
608 consider.add(new Boolean(true));
609 else consider.add(new Boolean(false));
610 }
611 lengthTravelledSoFar += subElems[j].length() + 1;
612 }
613 position++;
614 }
615 }
616 else {
617 // possible
618 // remove all the inverted commas
619 String newString = "";
620 char prev = ' ', ch = ' ';
621 for(int i = 0; i < elem.length(); i++) {
622 prev = ch;
623 ch = elem.charAt(i);
624 if(ch == '\"' && prev != '\\') {
625 continue;
626 }
627 else {
628 newString += ch;
629 }
630 }
631 // there can be many tokens
632 String[] subTokens = norm(newString).split("( )+");
633 for(int k = 0; k < subTokens.length; k++) {
634 if(subTokens[k].trim().length() > 0) {
635 terms.add(new Term(field, norm(subTokens[k]), baseTokenAnnotationType
636 + ".string"));
637 pos.add(new Integer(position));
638 consider.add(new Boolean(true));
639 position++;
640 }
641 }
642 }
643 return new ArrayList[] {terms, pos, consider};
644 }
645
646 public boolean needValidation() {
647 return needValidation;
648 }
649 }
|