001 package gate.creole.morph;
002
003 import gate.creole.ResourceInstantiationException;
004
005 import java.io.BufferedReader;
006 import java.io.InputStreamReader;
007 import java.util.ArrayList;
008 import java.util.Iterator;
009
010 public class PatternParser {
011
012 public static void main(String[] args) {
013 try {
014
015 BufferedReader in = new BufferedReader(new InputStreamReader(
016 System.in));
017 variableDeclarationCommand("A ==> [abcdefghijklmnopqrstuvwxyz0123456789-]");
018 variableDeclarationCommand("V ==> [aeiou]");
019 variableDeclarationCommand("VI ==> [aeiouy]");
020 variableDeclarationCommand("C ==> [bcdfghjklmnpqrstvwxyz]");
021 variableDeclarationCommand("CX ==> [bcdfghjklmnpqrstvwxz]");
022 variableDeclarationCommand("CX2 ==> \"bb\" OR \"cc\" OR \"dd\" OR \"ff\" OR \"gg\" OR \"hh\" OR \"jj\" OR \"kk\" OR \"ll\" OR \"mm\" OR \"nn\" OR \"pp\" OR \"qq\" OR \"rr\" OR \"ss\" OR \"tt\" OR \"vv\" OR \"ww\" OR \"xx\" OR \"zz\"");
023 variableDeclarationCommand("CX2S ==> \"ff\" OR \"ss\" OR \"zz\"");
024 variableDeclarationCommand("S ==> \"s\" OR \"x\" OR \"ch\" OR \"sh\"");
025 variableDeclarationCommand("PRE ==> \"be\" OR \"ex\" OR \"in\" OR \"mis\" OR \"pre\" OR \"pro\" OR \"re\"");
026 variableDeclarationCommand("EDING ==> \"ed\" OR \"ing\"");
027 variableDeclarationCommand("ESEDING ==> \"es\" OR \"ed\" OR \"ing\"");
028
029 while (true) {
030 System.out.print("Query: ");
031 String line = in.readLine();
032 if (line.length() == -1)
033 break;
034
035 getPattern(line);
036
037 }
038 } catch (Exception e) {
039 e.printStackTrace();
040 }
041
042 }
043
044 public static void getPattern(String line) {
045 String[] ruleParts = line.split("==>");
046 // now check if the method which has been called in this rule actually
047 // available in the MorphFunction Class
048 String methodCalled = ruleParts[1].trim();
049
050 // so RHS part is Ok
051 // now we need to check if LHS is written properly
052 // and convert it to the pattern that is recognized by the java
053 String category = "";
054 // we need to find out the category
055 int i = 1;
056 for (; i < ruleParts[0].length(); i++) {
057 if (ruleParts[0].charAt(i) == '>')
058 break;
059 category = category + ruleParts[0].charAt(i);
060 }
061
062 RHS rhs = new RHS(ruleParts[1], category);
063 ruleParts[0] = ruleParts[0].substring(i + 1, ruleParts[0].length()).trim();
064 String regExp = ParsingFunctions.convertToRegExp(ruleParts[0], variables);
065 String[] rules = ParsingFunctions.normlizePattern(regExp);
066 for (int m = 0; m < rules.length; m++) {
067 PatternPart parts[] = ParsingFunctions.getPatternParts(rules[m].trim());
068 // each part has a type associated with it
069 for (int j = 0; j < parts.length; j++) {
070 System.out.println(parts[j].getPartString() + "=>"
071 + parts[j].getType());
072 }
073 }
074 }
075
076 public static Storage variables = new Storage();
077
078 private static void variableDeclarationCommand(String line)
079 throws ResourceInstantiationException {
080 // ok so first find the variable name and the value for it
081 String varName = (line.split("==>"))[0].trim();
082 String varValue = (line.split("==>"))[1].trim();
083
084 // find the type of variable it is
085 int valueType = ParsingFunctions.findVariableType(varValue.trim());
086
087 // based on the variable type create the instance
088 Variable varInst = null;
089 switch (valueType) {
090 case Codes.CHARACTER_RANGE_CODE:
091 varInst = new CharacterRange();
092 break;
093 case Codes.CHARACTER_SET_CODE:
094 varInst = new CharacterSet();
095 break;
096 case Codes.STRING_SET_CODE:
097 varInst = new StringSet();
098 break;
099 }
100
101 // set the values in the variable
102 if (!varInst.set(varName, varValue)) {
103 }
104
105 // and finally add the variable in
106 if (!variables.add(varName, varInst.getPattern())) {
107 }
108
109 varInst.resetPointer();
110 }
111
112 public static ArrayList parsePattern(String q1) {
113
114 // arraylist to return - will contain all the OR normalized queries
115 ArrayList patterns = new ArrayList();
116
117 // remove all extra spaces from the query
118 q1 = q1.trim();
119
120 // we add opening and closing brackets explicitly
121 q1 = "( " + q1 + " )";
122
123 // add the main Query in the arraylist
124 patterns.add(q1);
125
126 for (int index = 0; index < patterns.size(); index++) {
127 // get the query to be parsed
128 String query = (String) patterns.get(index);
129
130 // current character and the previous character
131 char ch = ' ', pre = ' ';
132
133 // if query is ORed
134 // we need duplication
135 // for example: {A}((B)|(C))
136 // the normalized form will be
137 // (A)(B)
138 // (A)(C)
139 // here we need (A) to be duplicated two times
140 boolean duplicated = false;
141 int dupliSize = 0;
142 int lastBrClose = 0;
143 String data = "";
144
145 // we need to look into one query at a time and parse it
146 for (int i = 0; i < query.length(); i++) {
147 pre = ch;
148 ch = query.charAt(i);
149
150 // check if it is an open bracket
151 // it is if it doesn't follow the '\' escape sequence
152 if (isOpenBracket(ch, pre)) {
153
154 // so find out where it gets closed
155 int brClPos = findBracketClosingPosition(i + 1, query);
156
157 // see if there are any OR operators in it
158 ArrayList orTokens = findOrTokens(query.substring(i + 1,
159 brClPos));
160
161 // orTokens will have
162 // for eg. {A} | ({B}{C})
163 // then {A}
164 // and ({B}{C})
165 // so basically findOrTokens find out all the tokens around
166 // | operator
167 if (orTokens.size() > 1) {
168 String text = "";
169
170 // data contains all the buffered character before the
171 // current positions
172 // for example "ABC" ({B} | {C})
173 // here "ABC" will be in data
174 // and {B} and {C} in orTokens
175 if (!duplicated && data.length() > 0) {
176 text = data;
177 data = "";
178 } else {
179 if (index == patterns.size() - 1) {
180 // this is the case where we would select the
181 // text as ""
182 text = "";
183 } else {
184 text = (String) patterns
185 .get(patterns.size() - 1);
186 }
187 }
188
189 // so we need to duplicate the text orTokens.size()
190 // times
191 // for example "ABC" ({B} | {C})
192 // text = "ABC"
193 // orTokens {B} {C}
194 // so two queries will be added
195 // 1. "ABC"
196 // 2. "ABC"
197
198 patterns = duplicate(patterns, text, dupliSize,
199 orTokens.size());
200 // and tokens will be added
201 // 1. "ABC" {B}
202 // 2. "ABC" {C}
203 patterns = writeTokens(orTokens, patterns, dupliSize);
204
205 // text is duplicated so make it true
206 duplicated = true;
207
208 // and how many times it was duplicated
209 if (dupliSize == 0)
210 dupliSize = 1;
211 dupliSize *= orTokens.size();
212 } else {
213 // what if the there is only one element between ( and )
214 // it is not an 'OR' query
215
216 // check how many times we have duplicated the text
217 if (dupliSize == 0) {
218 // if zero and the text buffered is ""
219 // we simply add "" as a separate Query
220 // otherwise add the buffered data as a separate
221 // Query
222 if (data.length() == 0)
223 patterns.add("");
224 else
225 patterns.add(data);
226
227 // because we simply needs to add it only once
228 // but still we have copied it as a separate query
229 // so say duplicated = true
230 duplicated = true;
231 data = "";
232 // and ofcourse the size of the duplication will be
233 // only 1
234 dupliSize = 1;
235 }
236 // and we need to add all the contents between two
237 // brackets in the last duplicated
238 // queries
239 patterns = writeStringInAll("<"
240 + query.substring(i + 1, brClPos) + ">",
241 dupliSize, patterns);
242 }
243 i = brClPos;
244 } else {
245 if (duplicated) {
246 patterns = writeCharInAll(ch, dupliSize, patterns);
247 } else {
248 data += "" + ch;
249 }
250 }
251 }
252
253 boolean scan = scanQueryForOrOrBracket(query);
254 if (scan) {
255 patterns.remove(index);
256 index--;
257 }
258 }
259
260 ArrayList queriesToReturn = new ArrayList();
261 for (int i = 0; i < patterns.size(); i++) {
262 String q = (String) patterns.get(i);
263 if (q.trim().length() == 0) {
264 continue;
265 } else if (queriesToReturn.contains(q.trim())) {
266 continue;
267 } else {
268 queriesToReturn.add(q.trim());
269 }
270 }
271
272 for (int i = 0; i < queriesToReturn.size(); i++) {
273 String s = (String) queriesToReturn.get(i);
274 s = s.replaceAll("<", "(");
275 s = s.replaceAll(">", ")");
276 s = s.substring(1, s.length() - 1);
277 queriesToReturn.set(i, s.trim());
278 }
279 return queriesToReturn;
280 }
281
282 public static boolean scanQueryForOrOrBracket(String query) {
283 int index = 0;
284 int index1 = 0;
285 do {
286 index = query.indexOf('|', index);
287 if (index == 0) {
288 return true;
289 } else if (index > 0) {
290 // we have found it but we need to check if it is an escape
291 // sequence
292 if (query.charAt(index - 1) == '\\') {
293 // yes it is an escape sequence
294 // lets search for the next one
295 } else {
296 return true;
297 }
298 }
299
300 // if we are here that means it was not found
301 index1 = query.indexOf('(', index1);
302 if (index1 == 0) {
303 return true;
304 } else if (index1 > 0) {
305 // we have found it
306 if (query.charAt(index1 - 1) == '\\') {
307 // yes it is an escape sequence
308 continue;
309 } else {
310 return true;
311 }
312 }
313
314 } while (index >= 0 && index1 >= 0);
315
316 return false;
317 }
318
319 public static ArrayList writeTokens(ArrayList tokens, ArrayList queries,
320 int dupliSize) {
321 if (dupliSize == 0)
322 dupliSize = 1;
323
324 ArrayList qToRemove = new ArrayList();
325 for (int j = 0; j < dupliSize; j++) {
326 for (int i = 1; i <= tokens.size(); i++) {
327 String token = (String) tokens.get(i - 1);
328 if (token.trim().equals("{__o__}")) {
329 token = " ";
330 }
331 String s = (String) queries.get(queries.size()
332 - (j * tokens.size() + i));
333 qToRemove.add(s);
334 s += token;
335 queries.set(queries.size() - (j * tokens.size() + i), s);
336 }
337 }
338
339 // and now remove
340 for (int i = 0; i < qToRemove.size(); i++) {
341 queries.remove(qToRemove.get(i));
342 }
343
344 return queries;
345 }
346
347 public static ArrayList duplicate(ArrayList queries, String s,
348 int dupliSize, int no) {
349 if (s == null)
350 s = "";
351
352 ArrayList strings = new ArrayList();
353 if (dupliSize == 0) {
354 strings.add(s);
355 } else {
356 for (int i = 0; i < dupliSize; i++) {
357 strings.add(queries.get(queries.size() - (i + 1)));
358 }
359 }
360
361 for (int i = 0; i < strings.size(); i++) {
362 for (int j = 0; j < no; j++) {
363 queries.add(strings.get(i));
364 }
365 }
366
367 return queries;
368 }
369
370 public static ArrayList findOrTokens(String query) {
371 int balance = 0;
372 char pre = ' ';
373 char ch = ' ';
374 ArrayList ors = new ArrayList();
375
376 String s = "";
377 for (int i = 0; i < query.length(); i++) {
378 pre = ch;
379 ch = query.charAt(i);
380 if (isOpenBracket(ch, pre)) {
381 balance++;
382 s += "" + ch;
383 continue;
384 }
385
386 if (isClosingBracket(ch, pre) && balance > 0) {
387 balance--;
388 s += "" + ch;
389 continue;
390 }
391
392 if (isOrSym(ch, pre)) {
393 if (balance > 0) {
394 s += "" + ch;
395 continue;
396 } else {
397 ors.add(s);
398 s = "";
399 continue;
400 }
401 }
402
403 s += "" + ch;
404 }
405
406 if (s.length() > 0)
407 ors.add(s);
408
409 return ors;
410 }
411
412 public static int findBracketClosingPosition(int startFrom, String query) {
413 int balance = 0;
414 char pre = ' ';
415 char ch = ' ';
416 for (int i = startFrom; i < query.length(); i++) {
417 pre = ch;
418 ch = query.charAt(i);
419 if (isOpenBracket(ch, pre)) {
420 balance++;
421 continue;
422 }
423
424 if (isClosingBracket(ch, pre)) {
425 if (balance > 0) {
426 balance--;
427 } else {
428 return i;
429 }
430 }
431 }
432 return -1;
433 }
434
435 public static ArrayList writeCharInAll(char c, int no, ArrayList queries) {
436 for (int i = 0; i < no; i++) {
437 String s = (String) queries.get(queries.size() - (i + 1));
438 s += "" + c;
439 queries.set(queries.size() - (i + 1), s);
440 }
441 return queries;
442 }
443
444 public static ArrayList writeStringInAll(String c, int no, ArrayList queries) {
445 for (int i = 0; i < no; i++) {
446 String s = (String) queries.get(queries.size() - (i + 1));
447 s += "" + c;
448 queries.set(queries.size() - (i + 1), s);
449 }
450 return queries;
451 }
452
453 public static boolean isOpenBracket(char ch, char pre) {
454 if (ch == '(' && pre != '\\')
455 return true;
456 else
457 return false;
458 }
459
460 public static boolean isClosingBracket(char ch, char pre) {
461 if (ch == ')' && pre != '\\')
462 return true;
463 else
464 return false;
465 }
466
467 public static boolean isOrSym(char ch, char pre) {
468 if (ch == '|' && pre != '\\')
469 return true;
470 else
471 return false;
472 }
473
474 }
|