001 /*
002 * SubQueryParser.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: SubQueryParser.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic.lucene;
009
010 import java.io.*;
011 import java.util.*;
012
013 import gate.creole.ir.SearchException;
014
015 /**
016 * This class behaves as a helper class to the QueryParser and provides
017 * various methods which are called from various methods of QueryParser.
018 *
019 * @author niraj
020 */
021 public class SubQueryParser {
022
023 public static void main(String[] args) {
024 try {
025
026 BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
027 while(true) {
028 System.out.print("Query: ");
029 String line = in.readLine();
030
031 if(line.length() == -1) break;
032
033 ArrayList<String> queries = parseQuery(line);
034 for(int i = 0; i < queries.size(); i++) {
035 System.out.println("=>" + queries.get(i));
036 }
037 }
038 }
039 catch(Exception e) {
040 e.printStackTrace();
041 }
042
043 }
044
045 /**
046 * Method retrieves wild card characters after the closing bracket.
047 *
048 * @param brClPos
049 * @param query
050 * @return
051 */
052 private static String findWildCardString(int brClPos, String query) {
053 String wcs = "";
054 if(brClPos + 1 < query.length()) {
055 if(query.charAt(brClPos + 1) == '*' || query.charAt(brClPos + 1) == '+' || query.charAt(brClPos + 1) == '?') {
056 wcs = query.charAt(brClPos + 1) + "";
057 // ok so lets fetch the number
058 for(int i = brClPos + 2; i < query.length(); i++) {
059 if(Character.isDigit(query.charAt(i))) {
060 wcs += query.charAt(i);
061 }
062 else {
063 break;
064 }
065 }
066 }
067 }
068 return wcs;
069 }
070
071 /**
072 * This method, interprets the wild cards and convert query
073 * accordingly. For example: (A)+3 is converted into ((A) | ((A)(A)) |
074 * ((A)(A)(A)))
075 *
076 * @param query
077 * @return
078 * @throws SearchException
079 */
080 private static String extractWildcards(String query) throws SearchException {
081 outer: while(true) {
082 char ch = ' ', pre = ' ';
083 for(int i = 0; i < query.length(); i++) {
084 pre = ch;
085 ch = query.charAt(i);
086
087 // check if it is an open bracket
088 // it is if it doesn't follow the '\' escape sequence
089 if(isOpenBracket(ch, pre)) {
090
091 // so find out where it gets closed
092 int brClPos = findBracketClosingPosition(i + 1, query);
093 if(brClPos == -1) {
094 throw new SearchException("unbalanced brackets",
095 "a closing bracket ()) is missing for this opening bracket", query, i);
096 }
097
098 String wildCardString = findWildCardString(brClPos, query);
099 int wcsLen = 0;
100 boolean atLeastOne = false;
101
102 // at least once
103 int repeatClause = 1;
104
105 if(wildCardString.length() != 0) {
106 if(wildCardString.length() == 1) {
107 // if there is only wildcard char sign
108 // we consider it as 1
109 wcsLen = 1;
110 }
111 else {
112 atLeastOne = (wildCardString.charAt(0) == '*' || wildCardString.charAt(0) == '?') ? false : true;
113 // now find out the number of Times we need to
114 // duplicate the bracketClause
115 repeatClause = Integer.parseInt(wildCardString.substring(1,
116 wildCardString.length()));
117 wcsLen = wildCardString.length();
118 }
119
120 String previous = query.substring(0, i);
121 String after = query
122 .substring(brClPos + wcsLen + 1, query.length());
123 String sToRepeat = query.substring(i, brClPos + 1);
124
125 String newString = "(";
126 for(int k = 1; k <= repeatClause; k++) {
127 newString += "(";
128 for(int subK = 0; subK < k; subK++) {
129 newString += sToRepeat;
130 }
131 newString += ")";
132 if(k + 1 <= repeatClause) {
133 newString += " | ";
134 }
135 }
136
137 if(!atLeastOne) {
138 newString += "| {__o__}";
139 }
140
141 newString += ")";
142 query = previous + newString + after;
143 continue outer;
144 }
145 }
146 }
147
148 // if we are here
149 // that means no whildcard left
150 return query;
151 }
152 }
153
154 /**
155 * this method parses the query and returns the different queries
156 * converted into the OR normalized form
157 * for e.g. ({A}|{B}){C}
158 * this will be converted into ({A}{C}) | ({B}{C})
159 * and the arrayList consists of
160 * 1. {A}{C}
161 * 2. {B}{C}
162 *
163 * @param q1
164 * @return
165 * @throws SearchException
166 */
167 public static ArrayList<String> parseQuery(String q1) throws SearchException {
168
169 // arraylist to return - will contain all the OR normalized queries
170 ArrayList<String> queries = new ArrayList<String>();
171
172 // remove all extra spaces from the query
173 q1 = q1.trim();
174
175 // we add opening and closing brackets explicitly
176 q1 = "( " + q1 + " )";
177
178 q1 = extractWildcards(q1);
179 // add the main Query in the arraylist
180 queries.add(q1);
181
182 for(int index = 0; index < queries.size(); index++) {
183 // get the query to be parsed
184 String query = (String)queries.get(index);
185
186 // current character and the previous character
187 char ch = ' ', pre = ' ';
188
189 // if query is ORed
190 // we need duplication
191 // for example: {A}({B}|{C})
192 // the normalized form will be
193 // {A}{B}
194 // {A}{C}
195 // here we need {A} to be duplicated two times
196 boolean duplicated = false;
197 int dupliSize = 0;
198 String data = "";
199
200 // we need to look into one query at a time and parse it
201 for(int i = 0; i < query.length(); i++) {
202 pre = ch;
203 ch = query.charAt(i);
204
205 // check if it is an open bracket
206 // it is if it doesn't follow the '\' escape sequence
207 if(isOpenBracket(ch, pre)) {
208
209 // so find out where it gets closed
210 int brClPos = findBracketClosingPosition(i + 1, query);
211 if(brClPos == -1) {
212 throw new SearchException("unbalanced brackets",
213 "a closing bracket ()) is missing for this opening bracket", query, i);
214 }
215
216 // see if there are any OR operators in it
217 ArrayList<String> orTokens = findOrTokens(query.substring(i + 1, brClPos));
218
219 // orTokens will have
220 // for eg. {A} | ({B}{C})
221 // then {A}
222 // and ({B}{C})
223 // so basically findOrTokens find out all the tokens around
224 // | operator
225 if(orTokens.size() > 1) {
226 String text = "";
227
228 // data contains all the buffered character before the
229 // current positions
230 // for example "ABC" ({B} | {C})
231 // here "ABC" will be in data
232 // and {B} and {C} in orTokens
233 if(!duplicated && data.length() > 0) {
234 text = data;
235 data = "";
236 }
237 else {
238 if(index == queries.size() - 1) {
239 // this is the case where we would select the
240 // text as ""
241 text = "";
242 }
243 else {
244 text = (String)queries.get(queries.size() - 1);
245 }
246 }
247
248 // so we need to duplicate the text orTokens.size()
249 // times
250 // for example "ABC" ({B} | {C})
251 // text = "ABC"
252 // orTokens {B} {C}
253 // so two queries will be added
254 // 1. "ABC"
255 // 2. "ABC"
256
257 queries = duplicate(queries, text, dupliSize, orTokens.size());
258 // and tokens will be added
259 // 1. "ABC" {B}
260 // 2. "ABC" {C}
261 queries = writeTokens(orTokens, queries, dupliSize);
262
263 // text is duplicated so make it true
264 duplicated = true;
265
266 // and how many times it was duplicated
267 if(dupliSize == 0) dupliSize = 1;
268 dupliSize *= orTokens.size();
269 }
270 else {
271 // what if the there is only one element between ( and )
272 // it is not an 'OR' query
273
274 // check how many times we have duplicated the text
275 if(dupliSize == 0) {
276 // if zero and the text buffered is ""
277 // we simply add "" as a separate Query
278 // otherwise add the buffered data as a separate
279 // Query
280 if(data.length() == 0)
281 queries.add("");
282 else queries.add(data);
283
284 // because we simply needs to add it only once
285 // but still we have copied it as a separate query
286 // so say duplicated = true
287 duplicated = true;
288 data = "";
289 // and ofcourse the size of the duplication will be
290 // only 1
291 dupliSize = 1;
292 }
293 // and we need to add all the contents between two
294 // brackets in the last duplicated
295 // queries
296 queries = writeStringInAll(query.substring(i + 1, brClPos),
297 dupliSize, queries);
298 }
299 i = brClPos;
300 }
301 else if(isClosingBracket(ch, pre)) {
302 throw new SearchException("unbalanced brackets",
303 "a opening bracket (() is missing for this closing bracket", query, i);
304 }
305 else {
306 if(duplicated) {
307 queries = writeCharInAll(ch, dupliSize, queries);
308 }
309 else {
310 data += "" + ch;
311 }
312 }
313 }
314
315 boolean scan = scanQueryForOrOrBracket(query);
316 if(scan) {
317 queries.remove(index);
318 index--;
319 }
320 }
321
322 ArrayList<String> queriesToReturn = new ArrayList<String>();
323 for(int i = 0; i < queries.size(); i++) {
324 String q = queries.get(i);
325 if(q.trim().length() == 0) {
326 continue;
327 }
328 else if(queriesToReturn.contains(q.trim())) {
329 continue;
330 }
331 else {
332 queriesToReturn.add(q.trim());
333 }
334 }
335 return queriesToReturn;
336 }
337
338 /**
339 * This method checks if query has either | or ( in it.
340 * @param query
341 * @return
342 */
343 public static boolean scanQueryForOrOrBracket(String query) {
344 int index = 0;
345 int index1 = 0;
346 do {
347 index = query.indexOf('|', index);
348 if(index == 0) {
349 return true;
350 }
351 else if(index > 0) {
352 // we have found it but we need to check if it is an escape
353 // sequence
354 if(query.charAt(index - 1) == '\\') {
355 // yes it is an escape sequence
356 // lets search for the next one
357 }
358 else {
359 return true;
360 }
361 }
362
363 // if we are here that means it was not found
364 index1 = query.indexOf('(', index1);
365 if(index1 == 0) {
366 return true;
367 }
368 else if(index1 > 0) {
369 // we have found it
370 if(query.charAt(index1 - 1) == '\\') {
371 // yes it is an escape sequence
372 continue;
373 }
374 else {
375 return true;
376 }
377 }
378
379 } while(index >= 0 && index1 >= 0);
380
381 return false;
382 }
383
384 /**
385 * This is a helper method that helps in duplicating the provided tokens.
386 * @param tokens
387 * @param queries
388 * @param dupliSize
389 * @return
390 */
391 private static ArrayList<String> writeTokens(ArrayList<String> tokens, ArrayList<String> queries,
392 int dupliSize) {
393 if(dupliSize == 0) dupliSize = 1;
394
395 ArrayList<String> qToRemove = new ArrayList<String>();
396 for(int j = 0; j < dupliSize; j++) {
397 for(int i = 1; i <= tokens.size(); i++) {
398 String token = tokens.get(i - 1);
399 if(token.trim().equals("{__o__}")) {
400 token = " ";
401 }
402 String s = (String)queries
403 .get(queries.size() - (j * tokens.size() + i));
404 qToRemove.add(s);
405 s += token;
406 queries.set(queries.size() - (j * tokens.size() + i), s);
407 }
408 }
409
410 // and now remove
411 for(int i = 0; i < qToRemove.size(); i++) {
412 queries.remove(qToRemove.get(i));
413 }
414
415 return queries;
416 }
417
418 /**
419 * This is a helper method that helps in duplicating the provided tokens.
420 */
421 private static ArrayList<String> duplicate(ArrayList<String> queries, String s, int dupliSize,
422 int no) {
423 if(s == null) s = "";
424
425 ArrayList<String> strings = new ArrayList<String>();
426 if(dupliSize == 0) {
427 strings.add(s);
428 }
429 else {
430 for(int i = 0; i < dupliSize; i++) {
431 strings.add(queries.get(queries.size() - (i + 1)));
432 }
433 }
434
435 for(int i = 0; i < strings.size(); i++) {
436 for(int j = 0; j < no; j++) {
437 queries.add(strings.get(i));
438 }
439 }
440
441 return queries;
442 }
443
444 /**
445 * This method given a query identifies the OR Tokens
446 * for eg. {A} | ({B}{C})
447 * then {A}
448 * and ({B}{C})
449 * so basically findOrTokens find out all the tokens around
450 * | operator
451 * @param query
452 * @return
453 */
454 public static ArrayList<String> findOrTokens(String query) {
455 int balance = 0;
456 char pre = ' ';
457 char ch = ' ';
458 ArrayList<String> ors = new ArrayList<String>();
459
460 String s = "";
461 for(int i = 0; i < query.length(); i++) {
462 pre = ch;
463 ch = query.charAt(i);
464 if(isOpenBracket(ch, pre)) {
465 balance++;
466 s += "" + ch;
467 continue;
468 }
469
470 if(isClosingBracket(ch, pre) && balance > 0) {
471 balance--;
472 s += "" + ch;
473 continue;
474 }
475
476 if(isOrSym(ch, pre)) {
477 if(balance > 0) {
478 s += "" + ch;
479 continue;
480 }
481 else {
482 ors.add(s);
483 s = "";
484 continue;
485 }
486 }
487
488 s += "" + ch;
489 }
490
491 if(s.length() > 0) ors.add(s);
492
493 return ors;
494 }
495
496 /**
497 * Returns the position of a closing bracket.
498 * @param startFrom
499 * @param query
500 * @return
501 */
502 private static int findBracketClosingPosition(int startFrom, String query) {
503 int balance = 0;
504 char pre = ' ';
505 char ch = ' ';
506 for(int i = startFrom; i < query.length(); i++) {
507 pre = ch;
508 ch = query.charAt(i);
509 if(isOpenBracket(ch, pre)) {
510 balance++;
511 continue;
512 }
513
514 if(isClosingBracket(ch, pre)) {
515 if(balance > 0) {
516 balance--;
517 }
518 else {
519 return i;
520 }
521 }
522 }
523 return -1;
524 }
525
526 /**
527 * Helps in duplicating a character in the provided queries
528 * @param c
529 * @param no
530 * @param queries
531 * @return
532 */
533 private static ArrayList<String> writeCharInAll(char c, int no, ArrayList<String> queries) {
534 for(int i = 0; i < no; i++) {
535 String s = queries.get(queries.size() - (i + 1));
536 s += "" + c;
537 queries.set(queries.size() - (i + 1), s);
538 }
539 return queries;
540 }
541
542 /**
543 * Helps in duplicating a string in the provided queries
544 * @param c
545 * @param no
546 * @param queries
547 * @return
548 */
549 private static ArrayList<String> writeStringInAll(String c, int no, ArrayList<String> queries) {
550 for(int i = 0; i < no; i++) {
551 String s = (String)queries.get(queries.size() - (i + 1));
552 s += "" + c;
553 queries.set(queries.size() - (i + 1), s);
554 }
555 return queries;
556 }
557
558 /**
559 * Returns if the character is bracket used to mark boundary of a token or an escape character.
560 * @param ch
561 * @param pre
562 * @return
563 */
564 private static boolean isOpenBracket(char ch, char pre) {
565 if(ch == '(' && pre != '\\')
566 return true;
567 else return false;
568 }
569
570 /**
571 * Returns if the character is bracket used to mark boundary of a token or an escape character.
572 * @param ch
573 * @param pre
574 * @return
575 */
576 private static boolean isClosingBracket(char ch, char pre) {
577 if(ch == ')' && pre != '\\')
578 return true;
579 else return false;
580 }
581
582 /**
583 * Returns if the character is an OR symbol used as a logical operator or an escape character.
584 * @param ch
585 * @param pre
586 * @return
587 */
588 private static boolean isOrSym(char ch, char pre) {
589 if(ch == '|' && pre != '\\')
590 return true;
591 else return false;
592 }
593
594 }
|