001 /**
002 * (c) Copyright Ontotext Lab, Sirma Group Corp 2004
003 */
004
005 package com.ontotext.gate.gazetteer;
006
007 import gate.AnnotationSet;
008 import gate.Factory;
009 import gate.FeatureMap;
010 import gate.Resource;
011 import gate.creole.ExecutionException;
012 import gate.creole.ResourceInstantiationException;
013 import gate.creole.gazetteer.AbstractGazetteer;
014 import gate.creole.gazetteer.GazetteerException;
015 import gate.creole.gazetteer.GazetteerList;
016 import gate.creole.gazetteer.GazetteerNode;
017 import gate.creole.gazetteer.LinearDefinition;
018 import gate.creole.gazetteer.LinearNode;
019 import gate.creole.gazetteer.Lookup;
020 import gate.creole.gazetteer.MappingNode;
021 import gate.util.InvalidOffsetException;
022 import gate.util.LuckyException;
023
024 import java.util.ArrayList;
025 import java.util.HashMap;
026 import java.util.HashSet;
027 import java.util.Iterator;
028 import java.util.List;
029 import java.util.Map;
030 import java.util.Set;
031
032 public class HashGazetteer extends AbstractGazetteer {
033 private static final long serialVersionUID = -4603155688378104052L;
034
035 private ArrayList<Lookup> categoryList;
036
037 private Map<LinearNode, GazetteerList> listsByNode;
038
039 private Map<String, List<Lookup>> mapsList[];
040
041 private int mapsListSize;
042
043 private AnnotationSet annotationSet = null;
044
045 @SuppressWarnings("unchecked")
046 public Resource init() throws ResourceInstantiationException {
047 System.out.println("HashGazetteer is being initialized!");
048 if(listsURL == null)
049 throw new ResourceInstantiationException(
050 "No URL provided for gazetteer creation!");
051
052 try {
053 mapsList = new HashMap[1000];
054 definition = new LinearDefinition();
055 definition.setURL(listsURL);
056 definition.load();
057 int i = definition.size();
058 listsByNode = definition.loadLists();
059 mapsListSize = mapsList.length;
060 categoryList = new ArrayList<Lookup>(i + 1);
061 Iterator<LinearNode> iterator = definition.iterator();
062 int j = 0;
063 LinearNode linearnode;
064 for(; iterator.hasNext(); readList(linearnode)) {
065 linearnode = (LinearNode)iterator.next();
066 fireStatusChanged("Reading " + linearnode.toString());
067 fireProgressChanged((++j * 100) / i);
068 }
069
070 fireProcessFinished();
071 }
072 catch(Exception exception) {
073 throw new ResourceInstantiationException(exception);
074 }
075 return this;
076 }
077
078 public void execute() throws ExecutionException {
079 if(document == null) throw new ExecutionException("Document is null!");
080 annotationSet = document.getAnnotations(annotationSetName);
081
082 String s = document.getContent().toString() + " ";
083
084 int i = s.length();
085 int j = 0;
086 int k = 0;
087
088 StringBuffer stringbuffer = new StringBuffer();
089 boolean prevIsSymbol = false;
090 boolean prevIsDigit = false;
091 boolean prevIsLetter = false;
092
093 // TODO what does this do, as it is only ever set to false
094 boolean flag11 = false;
095
096 String s3 = "";
097 int i1 = 0;
098 int j1 = 0;
099
100 for(int l1 = 0; l1 < i; l1++) {
101 char c = s.charAt(l1);
102 boolean currIsWhitespace = Character.isWhitespace(c);
103 if(currIsWhitespace && stringbuffer.length() == 0) {
104 j++;
105 prevIsLetter = prevIsDigit = prevIsSymbol = flag11 = false;
106 continue;
107 }
108 if(currIsWhitespace && prevIsSymbol && stringbuffer.length() == 1) {
109 j += 2;
110 prevIsLetter = prevIsDigit = prevIsSymbol = flag11 = false;
111 stringbuffer.delete(0, stringbuffer.length());
112 continue;
113 }
114 boolean currIsLetter = Character.isLetter(c);
115 boolean currIsDigit = Character.isDigit(c);
116 boolean currIsSymbol = !currIsWhitespace && !currIsLetter && !currIsDigit;
117 boolean currIsLowerCase = Character.isLowerCase(c);
118 if(k <= j
119 && (currIsWhitespace || currIsSymbol || flag11
120 && !currIsLowerCase || !prevIsLetter && currIsLetter))
121 k = l1;
122 boolean flag13 = prevIsLetter
123 && (currIsDigit || currIsSymbol || currIsWhitespace)
124 || prevIsLetter && currIsLetter && flag11 && !currIsLowerCase
125 || prevIsDigit
126 && (currIsLetter || currIsSymbol || currIsWhitespace)
127 || prevIsSymbol;
128 if(l1 == i - 1) flag13 = true;
129 if(flag13) {
130 boolean flag16 = !currIsSymbol && !currIsDigit;
131 if(l1 == i - 1) flag16 = true;
132 String s2 = normalizeWhitespace(stringbuffer.toString());
133 int k1 = s2.length();
134 flag16 &= k1 - j1 > 1;
135 j1 = k1;
136 if(i1 != j || !s2.equals(s3)) {
137 int l = s2.length();
138 if(l > 0) {
139 boolean flag14 = annotate(s2, j, l1, l);
140 if(flag14) {
141 s3 = s2;
142 i1 = j;
143 }
144 if(!flag14 && flag16 || i - 1 == l1) {
145 if(k <= j) k = l1;
146 j = k;
147 l1 = k - 1;
148 stringbuffer.delete(0, stringbuffer.length());
149 continue;
150 }
151 }
152 }
153 }
154 stringbuffer.append(c);
155 prevIsDigit = currIsDigit;
156 prevIsLetter = currIsLetter;
157 prevIsSymbol = currIsSymbol;
158 }
159
160 fireProcessFinished();
161 fireStatusChanged("Hash Gazetteer processing finished!");
162 }
163
164 public boolean add(String s, Lookup lookup1) {
165 if(!super.caseSensitive.booleanValue()) {
166 String s1 = s.toUpperCase();
167 if(!s1.equals(s)) add(s1, lookup1);
168 }
169 String s2 = removeTrailingSymbols(s);
170 if(!s2.equals(s)) add(s2, lookup1);
171 String s3 = s + " ";
172
173 List<Lookup> arraylist = null;
174 int j = 0;
175 s3.trim();
176 j = s3.length();
177
178 boolean prevIsLetter = false;
179 boolean prevIsDigit = false;
180 boolean prevIsLowercase = false;
181
182 String s4 = "";
183 Map<String, List<Lookup>> hashmap = null;
184 for(int k = 0; k < j; k++) {
185 char c = s3.charAt(k);
186 boolean currIsWhitespace = Character.isWhitespace(c);
187 boolean currIsDigit = Character.isDigit(c);
188 boolean currIsLetter = Character.isLetter(c);
189 boolean currIsSymbol = !currIsWhitespace && !currIsDigit && !currIsLetter;
190 boolean currIsLowercase = Character.isLowerCase(c);
191 boolean flag18 = prevIsLetter
192 && (currIsDigit || currIsSymbol || currIsWhitespace)
193 || prevIsLetter && currIsLetter && prevIsLowercase
194 && !currIsLowercase || prevIsDigit
195 && (currIsLetter || currIsSymbol || currIsWhitespace);
196 if(k + 1 == j) flag18 = true;
197 if(flag18) {
198 s4 = normalizeWhitespace(s3.substring(0, k));
199 int i = s4.length();
200 if(mapsList[i] == null) {
201 hashmap = new HashMap<String, List<Lookup>>();
202 mapsList[i] = hashmap;
203 }
204 else {
205 hashmap = (Map<String, List<Lookup>>)mapsList[i];
206 }
207 if(!hashmap.containsKey(s4)) hashmap.put(s4, null);
208 }
209 prevIsDigit = currIsDigit;
210 prevIsLetter = currIsLetter;
211
212 prevIsLowercase = currIsLowercase;
213
214 }
215
216 arraylist = hashmap.get(s4);
217 if(null == arraylist) {
218 arraylist = new ArrayList<Lookup>(1);
219 arraylist.add(lookup1);
220 }
221 else if(!arraylist.contains(lookup1)) arraylist.add(lookup1);
222 hashmap.put(s4, arraylist);
223 return true;
224 }
225
226 public Set<Lookup> lookup(String s) {
227 Set<Lookup> set = null;
228 String s1 = normalizeWhitespace(s);
229 int i = s1.length();
230 if(mapsListSize < i) return set;
231 Map<String, List<Lookup>> hashmap = (HashMap<String, List<Lookup>>)mapsList[i];
232 if(hashmap == null) {
233 return set;
234 }
235 else {
236 Set<Lookup> hashset = new HashSet<Lookup>(hashmap.get(s1));
237 return hashset;
238 }
239 }
240
241 private boolean annotate(String s, int i, int j, int k) {
242 if(k >= mapsListSize) return false;
243 Map<String, List<Lookup>> hashmap = mapsList[k];
244 if(hashmap == null) return false;
245 if(!hashmap.containsKey(s)) return false;
246 List<Lookup> arraylist = hashmap.get(s);
247
248 // TODO shouldn't this return false if arraylist is null?
249
250 if(null != arraylist) {
251 for(Iterator<Lookup> iterator = arraylist.iterator(); iterator.hasNext();) {
252 Lookup lookup1 = (Lookup)iterator.next();
253 FeatureMap featuremap = Factory.newFeatureMap();
254 featuremap.put("majorType", lookup1.majorType);
255 if(null != lookup1.oClass && null != lookup1.ontology) {
256 featuremap.put("class", lookup1.oClass);
257 featuremap.put("ontology", lookup1.ontology);
258 }
259 if(null != lookup1.minorType) {
260 featuremap.put("minorType", lookup1.minorType);
261 if(null != lookup1.languages)
262 featuremap.put("language", lookup1.languages);
263 }
264 try {
265 annotationSet.add(new Long(i), new Long(j), "Lookup", featuremap);
266 }
267 catch(InvalidOffsetException invalidoffsetexception) {
268 throw new LuckyException(invalidoffsetexception.toString());
269 }
270 }
271
272 }
273
274 return true;
275 }
276
277 /**
278 * Removes a string from the gazetteer
279 *
280 * @param s the item to remove
281 * @return true if the operation was successful
282 */
283 public boolean remove(String s) {
284
285 String s1 = a(s, true);
286 int i = s1.length();
287 if(i > mapsListSize) return false;
288 Map<String, List<Lookup>> hashmap = mapsList[i];
289 if(hashmap == null) return false;
290 if(hashmap.containsKey(s1)) {
291 hashmap.remove(s1);
292 return true;
293 }
294 return false;
295 }
296
297 /**
298 * Works backwards through the String parameter removing each
299 * character until it encounters a letter, digit, or whitespace at
300 * which point it returns the truncated string.
301 *
302 * @param s the String you wish to remove trailing symbols from
303 * @return the truncated String that now ends in a letter, digit, or
304 * whitespace character
305 */
306 private String removeTrailingSymbols(String s) {
307 for(int i = s.length() - 1; i >= 0; i--) {
308 char c = s.charAt(i);
309 if(!Character.isLetter(c) && !Character.isDigit(c)
310 && !Character.isWhitespace(c))
311 s = s.substring(0, i);
312 else return s;
313 }
314
315 return s;
316 }
317
318 /**
319 * Normalizes the whitespace within the String instance by replacing
320 * any sequence of one or more whitespace characters with a single
321 * space. Not that any leading/trailing whitespace is also removed.
322 *
323 * @param s the String to normalize
324 * @return the normalized String
325 */
326 private String normalizeWhitespace(String s) {
327
328 // this seems to be the same as String.replaceAll("\\s+", " ")
329
330 StringBuffer stringbuffer = new StringBuffer();
331 s = s.trim();
332 char ac[] = s.toCharArray();
333 int i = s.length();
334 boolean prevWasWhitespace = false;
335 for(int j = 0; j < i; j++) {
336 char c = ac[j];
337
338 boolean currIsWhitespace = Character.isWhitespace(c);
339
340 if(currIsWhitespace && !prevWasWhitespace)
341 stringbuffer.append(' ');
342 else if(!currIsWhitespace) stringbuffer.append(c);
343
344 prevWasWhitespace = currIsWhitespace;
345 }
346
347 return stringbuffer.toString();
348 }
349
350 private String a(String s, boolean flag) {
351 StringBuffer stringbuffer = new StringBuffer();
352 boolean flag1 = true;
353 s = s.trim();
354 char ac[] = s.toCharArray();
355 int i = s.length();
356 if(i <= 1) return s;
357 char c = ac[0];
358 stringbuffer.append(c);
359 boolean flag2 = true;
360 boolean prevIsLetter = Character.isLetter(c);
361 boolean prevNotLetterOrDigit = !Character.isLetterOrDigit(c);
362
363 boolean flag10 = true;
364 char c2 = 'p';
365
366 for(int j = 1; j < i; j++) {
367 char c1 = ac[j];
368 boolean currNotLetterOrDigit = !Character.isLetterOrDigit(c1);
369 boolean currIsWhitespace = Character.isWhitespace(c1);
370 boolean currIsLetter = Character.isLetter(c1);
371 boolean currIsDigit = Character.isDigit(c1);
372 if(j > 0 && flag2) {
373 if(prevNotLetterOrDigit && currIsWhitespace) continue;
374 flag2 = prevIsLetter && currNotLetterOrDigit || prevNotLetterOrDigit
375 && currIsLetter;
376 if(currNotLetterOrDigit) {
377 if(c2 == 'p') c2 = c1;
378 flag2 = flag10 = c2 == c1;
379 }
380 if(j > 2 && !flag2 && stringbuffer.length() > 0) {
381 char c3 = stringbuffer.charAt(stringbuffer.length() - 1);
382 stringbuffer.deleteCharAt(stringbuffer.length() - 1);
383 stringbuffer.append(Character.toLowerCase(c3));
384 }
385 }
386 if(currIsLetter || currIsDigit) {
387 if(flag && currIsLetter) flag1 &= Character.isUpperCase(c1);
388 if(!flag10) c1 = Character.toLowerCase(c1);
389 stringbuffer.append(c1);
390 }
391 else if(!flag2) flag10 = false;
392 prevIsLetter = currIsLetter;
393 prevNotLetterOrDigit = currNotLetterOrDigit;
394 }
395
396 String s1 = stringbuffer.toString();
397 if(flag && flag1) s1 = s1.toUpperCase();
398 return s1;
399 }
400
401 private void readList(LinearNode linearnode) throws GazetteerException {
402
403 if(linearnode == null)
404 throw new GazetteerException("LinearNode node is null");
405
406 GazetteerList gazetteerlist = (GazetteerList)listsByNode.get(linearnode);
407 if(gazetteerlist == null)
408 throw new GazetteerException("gazetteer list not found by node");
409
410 String s = linearnode.getList();
411 String majorType = linearnode.getMajorType();
412 String minorType = linearnode.getMinorType();
413 String language = linearnode.getLanguage();
414
415 Lookup lookup1 = new Lookup(s, majorType, minorType, language);
416
417 if(mappingDefinition != null) {
418 MappingNode mappingnode = mappingDefinition.getNodeByList(s);
419 if(null != mappingnode) {
420 lookup1.oClass = mappingnode.getClassID();
421 lookup1.ontology = mappingnode.getOntologyID();
422 }
423 }
424
425 lookup1.list = s;
426 categoryList.add(lookup1);
427
428 @SuppressWarnings("unchecked")
429 Iterator<GazetteerNode> iterator = gazetteerlist.iterator();
430 String s6 = null;
431
432 for(; iterator.hasNext(); add(s6, lookup1)) {
433 String s4 = iterator.next().toString();
434 s4.trim();
435 int i = s4.length();
436 for(int j = 0; j < i; j++) {
437 if(j + 1 != i && !Character.isWhitespace(s4.charAt(j))) continue;
438 if(j + 1 == i) j = i;
439 s6 = s4.substring(0, j).trim();
440 }
441 }
442 }
443 }
|