001 /*
002 * LuceneDocument.java
003 *
004 * Niraj Aswani, 19/March/07
005 *
006 * $Id: LuceneDocument.html,v 1.0 2007/03/19 16:22:01 niraj Exp $
007 */
008 package gate.creole.annic.lucene;
009
010 import java.io.*;
011
012 import gate.annotation.AnnotationSetImpl;
013 import gate.creole.annic.Constants;
014 import gate.creole.annic.apache.lucene.document.Document;
015 import gate.creole.annic.apache.lucene.document.Field;
016 import java.util.ArrayList;
017 import gate.AnnotationSet;
018 import gate.util.Err;
019 import gate.util.GateRuntimeException;
020 import gate.util.InvalidOffsetException;
021 import gate.util.OffsetComparator;
022
023 import java.util.Arrays;
024 import java.util.Collections;
025 import java.util.HashSet;
026 import java.util.List;
027 import java.util.Set;
028
029 import gate.Annotation;
030 import gate.FeatureMap;
031 import gate.creole.annic.apache.lucene.analysis.Token;
032 import java.util.Iterator;
033
034 /**
035 * Given an instance of Gate Document, this class provides a method to convert
036 * it into the format that lucene can understand and can store in its indexes.
037 * This class also stores the tokenStream on the disk in order to retrieve it at
038 * the time of searching
039 *
040 * @author niraj
041 *
042 */
043 public class LuceneDocument {
044
  /**
   * Given an instance of a GATE Document, converts it into the format that
   * Lucene can understand and store in its indexes. One Lucene document is
   * created per index unit (e.g. per sentence) and per annotation set that is
   * indexed. This method also serializes each unit's token stream on disk so
   * it can be retrieved at search time.
   *
   * @param corpusPersistenceID persistence ID of the corpus; may be null
   * @param gateDoc the document to index
   * @param documentID unique ID of the document inside the index
   * @param annotSetsToInclude names of annotation sets to index; when
   *          non-empty, annotSetsToExclude is ignored
   * @param annotSetsToExclude annotation sets to skip when
   *          annotSetsToInclude is empty
   * @param featuresToInclude fully qualified ("Type.feature") features to
   *          index; when non-empty, featuresToExclude is ignored
   * @param featuresToExclude fully qualified features to skip
   * @param indexLocation folder of the index; token streams are serialized
   *          underneath it
   * @param baseTokenAnnotationType annotation type used as tokens, optionally
   *          qualified as "setName.Type"; may be null/empty
   * @param createTokensAutomatically when true, whitespace tokens are created
   *          if no base token annotations can be found
   * @param indexUnitAnnotationType annotation type delimiting index units,
   *          optionally qualified as "setName.Type"; null/empty means the
   *          whole document is one unit
   * @return the list of Lucene documents, or null when the document must be
   *         ignored (no tokens available, or a token stream could not be
   *         written to disk)
   */
  public List<Document> createDocuments(String corpusPersistenceID,
          gate.Document gateDoc, String documentID,
          ArrayList<String> annotSetsToInclude, ArrayList<String> annotSetsToExclude,
          ArrayList<String> featuresToInclude, ArrayList<String> featuresToExclude,
          String indexLocation, String baseTokenAnnotationType,
          Boolean createTokensAutomatically, String indexUnitAnnotationType) {

    if(baseTokenAnnotationType != null)
      baseTokenAnnotationType = baseTokenAnnotationType.trim();

    ArrayList<Document> toReturnBack = new ArrayList<Document>();
    ArrayList<String> annotSetsToIndex = new ArrayList<String>();

    // if the user has provided annotation sets to include, the annotation
    // sets to exclude are ignored
    if(annotSetsToInclude.size() > 0) {
      annotSetsToIndex = annotSetsToInclude;
    }
    else if(annotSetsToExclude.size() > 0) {
      // no sets to include were given: index all named annotation sets
      // except the ones listed in annotSetsToExclude

      Set<String> namedAnnotSets = new HashSet<String>();
      if(gateDoc.getNamedAnnotationSets() != null
              && gateDoc.getNamedAnnotationSets().keySet() != null) {
        namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
      }

      for(String setName : namedAnnotSets) {
        if(annotSetsToExclude.contains(setName)) continue;
        annotSetsToIndex.add(setName);
      }

      // the default (unnamed) annotation set is indexed too, unless it is
      // explicitly excluded
      if(!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
        annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
      }
    }
    else {
      // both include and exclude lists are empty: index every annotation
      // set, including the default one
      Set<String> namedAnnotSets = new HashSet<String>();
      if(gateDoc.getNamedAnnotationSets() != null
              && gateDoc.getNamedAnnotationSets().keySet() != null) {
        namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
      }

      for(String setName : namedAnnotSets) {
        annotSetsToIndex.add(setName);
      }
      annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
    }

    // the annotation set that contains the base token annotations
    AnnotationSet baseTokenAnnotationSet = null;

    // set below when the user-specified set does not contain base tokens /
    // index units, in which case all sets to index are searched instead
    boolean searchBaseTokensInAllAnnotationSets = false;
    boolean searchIndexUnitInAllAnnotationSets = false;

    // whether whitespace tokens must be created manually
    boolean createManualTokens = false;

    // the base token type may be qualified as "setName.Type"
    int index = -1;
    if(baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0)
      index = baseTokenAnnotationType.lastIndexOf('.');

    // a qualified type: split it into set name and annotation type
    if(index >= 0) {

      // set name
      String setName = baseTokenAnnotationType.substring(0, index);

      // token type
      baseTokenAnnotationType =
              baseTokenAnnotationType.substring(index + 1, baseTokenAnnotationType
                      .length());

      // the tokens may come from the default set or from a named set
      if(setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
        baseTokenAnnotationSet =
                gateDoc.getAnnotations().get(baseTokenAnnotationType);
      else baseTokenAnnotationSet =
              gateDoc.getAnnotations(setName).get(baseTokenAnnotationType);

      // nothing found in the specified set: fall back to searching all
      // annotation sets to index
      if(baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
        System.err.println("Base Tokens " + baseTokenAnnotationType
                + " counldn't be found under the specified annotation set " + setName
                + "\n searching them in other annotation sets");
        searchBaseTokensInAllAnnotationSets = true;
      }
    }
    else {

      // either baseTokenAnnotationType is null or the user has not
      // provided a set name, so we search in all annotation sets
      searchBaseTokensInAllAnnotationSets = true;
    }

    if(baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0
            && searchBaseTokensInAllAnnotationSets) {
      // assume manual tokens are needed; reset as soon as base tokens are
      // found in any of the annotation sets to index
      createManualTokens = true;

      for(String aSet : annotSetsToIndex) {
        if(aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
          AnnotationSet tempSet =
                  gateDoc.getAnnotations().get(baseTokenAnnotationType);
          if(tempSet.size() > 0) {
            baseTokenAnnotationSet = tempSet;
            createManualTokens = false;
            break;
          }
        }
        else {
          AnnotationSet tempSet =
                  gateDoc.getAnnotations(aSet).get(baseTokenAnnotationType);
          if(tempSet.size() > 0) {
            baseTokenAnnotationSet = tempSet;
            createManualTokens = false;
            break;
          }
        }
      }
    }

    // no base token type at all: tokens must be created manually
    if(baseTokenAnnotationType == null || baseTokenAnnotationType.length() == 0)
      createManualTokens = true;

    // create whitespace tokens, if allowed; otherwise ignore the document
    if(createManualTokens) {
      if(!createTokensAutomatically.booleanValue()) {
        System.out
                .println("Tokens couldn't be found in the document - Ignoring the document "
                        + gateDoc.getName());
        return null;
      }

      baseTokenAnnotationType = Constants.ANNIC_TOKEN;

      if(baseTokenAnnotationSet == null) {
        baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
      }

      if(!createTokens(gateDoc, baseTokenAnnotationSet)) {
        System.out
                .println("Tokens couldn't be created manually - Ignoring the document "
                        + gateDoc.getName());
        return null;
      }
    }
    // from here on baseTokenAnnotationSet is non-null and
    // baseTokenAnnotationType is known

    // now locate the annotation set that contains the index unit
    // annotations
    AnnotationSet indexUnitAnnotationSet = null;

    // the index unit type may also be qualified as "setName.Type"
    index = -1;
    if(indexUnitAnnotationType != null
            && indexUnitAnnotationType.trim().length() > 0)
      index = indexUnitAnnotationType.lastIndexOf('.');

    // a qualified type: split it into set name and annotation type
    if(index >= 0) {
      // setName
      String setName = indexUnitAnnotationType.substring(0, index);

      // indexUnitAnnotationType
      indexUnitAnnotationType =
              indexUnitAnnotationType.substring(index + 1, indexUnitAnnotationType
                      .length());

      if(setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME))
        indexUnitAnnotationSet =
                gateDoc.getAnnotations().get(indexUnitAnnotationType);
      else indexUnitAnnotationSet =
              gateDoc.getAnnotations(setName).get(indexUnitAnnotationType);

      // nothing found in the specified set: fall back to searching the
      // other annotation sets
      if(indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
        System.err.println("Index Unit " + indexUnitAnnotationType
                + " counldn't be found under the specified annotation set " + setName
                + "\n searching them in other annotation sets");
        searchIndexUnitInAllAnnotationSets = true;
      }
    }
    else {

      // either indexUnitAnnotationType is null or the user has not
      // provided the set name
      searchIndexUnitInAllAnnotationSets = true;
    }

    // searching in all annotation sets to index
    if(indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0
            && searchIndexUnitInAllAnnotationSets) {
      for(String aSet : annotSetsToIndex) {
        if(aSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
          AnnotationSet tempSet =
                  gateDoc.getAnnotations().get(indexUnitAnnotationType);
          if(tempSet.size() > 0) {
            indexUnitAnnotationSet = tempSet;
            break;
          }
        }
        else {
          AnnotationSet tempSet =
                  gateDoc.getAnnotations(aSet).get(indexUnitAnnotationType);
          if(tempSet.size() > 0) {
            indexUnitAnnotationSet = tempSet;
            break;
          }
        }
      }
    }

    // no index unit annotations anywhere: treat the whole document as a
    // single unit (getTokens interprets a null type this way)
    if(indexUnitAnnotationSet == null) {
      indexUnitAnnotationType = null;
    }

    // running counter over all serialized token streams of this document;
    // used as the suffix of the serialized file names
    int j = 0;

    // NOTE(review): the commented-out "merged set" indexing (one combined
    // index over all annotation sets) that used to live in this method has
    // been removed as dead code; see version control history to restore it.
    AnnotationSet mergedSet = null;

    for(String annotSet : annotSetsToIndex) {

      // generate the token streams (one per index unit) for this set and
      // hand them to the GateLuceneReader
      AnnotationSet aSetToIndex =
              annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc
                      .getAnnotations() : gateDoc.getAnnotations(annotSet);

      Set<String> indexedFeatures = new HashSet<String>();
      // baseTokenAnnotationSet is guaranteed non-null here
      ArrayList<Token>[] tokenStreams =
              getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude,
                      baseTokenAnnotationType, baseTokenAnnotationSet,
                      indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);

      // getTokens signals a problem by returning null
      if(tokenStreams == null) return null;

      // semicolon-separated list of all indexed feature names
      StringBuffer indexedFeaturesString = new StringBuffer();
      for(String aFeat : indexedFeatures) {
        indexedFeaturesString.append(aFeat + ";");
      }

      Document[] toReturn = new Document[tokenStreams.length];

      for(int i = 0; i < tokenStreams.length; i++, j++) {
        // make a new, empty document
        Document doc = new Document();

        // and then populate it
        LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
        doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
        doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE,
                documentID + "-" + j));
        // NOTE(review): this substring throws StringIndexOutOfBoundsException
        // when indexedFeatures is empty - confirm that getTokens always
        // records at least one feature
        doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesString
                .substring(0, indexedFeaturesString.length() - 1)));

        if(corpusPersistenceID != null)
          doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
        doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));

        doc.add(Field.Text("contents", reader));
        // serialize the token stream on the file system
        try {
          writeOnDisk(tokenStreams[i], documentID, documentID + "-" + j,
                  indexLocation);
        }
        catch(Exception e) {
          Err.println("\nIgnoring the document : " + gateDoc.getName()
                  + " since its token stream cannot be written on the disk");
          Err.println("Reason: " + e.getMessage());
          return null;
        }

        // collect the document
        toReturn[i] = doc;
      }

      toReturnBack.addAll(Arrays.asList(toReturn));
    }

    return toReturnBack;
  }
475
476 private boolean createTokens(gate.Document gateDocument, AnnotationSet set) {
477 String gateContent = gateDocument.getContent().toString();
478 int start = -1;
479 for(int i = 0; i < gateContent.length(); i++) {
480 char c = gateContent.charAt(i);
481 if(Character.isWhitespace(c)) {
482 if(start != -1) {
483 FeatureMap features = gate.Factory.newFeatureMap();
484 String string = gateContent.substring(start, i);
485 if(string.trim().length() > 0) {
486 features.put("string", string);
487 try {
488 set.add(new Long(start), new Long(i), Constants.ANNIC_TOKEN,
489 features);
490 }
491 catch(InvalidOffsetException ioe) {
492 ioe.printStackTrace();
493 return false;
494 }
495 }
496 start = i + 1;
497 }
498 }
499 else {
500 if(start == -1) start = i;
501 }
502 }
503 if(start == -1) return false;
504 if(start < gateContent.length()) {
505 FeatureMap features = gate.Factory.newFeatureMap();
506 String string = gateContent.substring(start, gateContent.length());
507 if(string.trim().length() > 0) {
508 features.put("string", string);
509 try {
510 set.add(new Long(start), new Long(gateContent.length()),
511 Constants.ANNIC_TOKEN, features);
512 }
513 catch(InvalidOffsetException ioe) {
514 ioe.printStackTrace();
515 return false;
516 }
517 }
518 }
519 return true;
520 }
521
522 /**
523 * Some file names are not compatible to the underlying file system. This
524 * method replaces all those incompatible characters with '_'.
525 *
526 * @param name
527 * @return
528 */
529 private String getCompatibleName(String name) {
530 return name.replaceAll("[\\/:\\*\\?\"<>|]", "_");
531 }
532
533 /**
534 * This method, given a tokenstream and file name, writes the tokenstream on
535 * the provided location.
536 *
537 * @param tokenStream
538 * @param fileName
539 * @param location
540 * @throws Exception
541 */
542 private void writeOnDisk(ArrayList tokenStream, String folderName,
543 String fileName, String location) throws Exception {
544
545 // before we write it on a disk, we need to change its name to
546 // underlying file system name
547 fileName = getCompatibleName(fileName);
548 folderName = getCompatibleName(folderName);
549
550 if(location.startsWith("file:/"))
551 location = location.substring(6, location.length());
552
553 if(location.charAt(1) != ':') location = "/" + location;
554
555 File locationFile = new File(location);
556 File folder = new File(locationFile, Constants.SERIALIZED_FOLDER_NAME);
557 if(!folder.exists()) {
558 folder.mkdirs();
559 }
560 if(!folder.exists()) { throw new IOException(
561 "Directory could not be created :" + folder.getAbsolutePath()); }
562
563 folder = new File(folder, folderName);
564 if(!folder.exists()) {
565 folder.mkdirs();
566 }
567
568 if(!folder.exists()) { throw new IOException(
569 "Directory could not be created :" + folder.getAbsolutePath()); }
570
571 File outputFile = new File(folder, fileName + ".annic");
572 ObjectOutput output = null;
573 OutputStream file = new FileOutputStream(outputFile);
574 OutputStream buffer = new BufferedOutputStream(file);
575 output = new ObjectOutputStream(buffer);
576 output.writeObject(tokenStream);
577 if(output != null) {
578 output.close();
579 }
580 }
581
582 /**
583 * Internal class used for storing the offsets of annotations.
584 *
585 * @author niraj
586 *
587 */
588 private class OffsetGroup {
589 Long startOffset;
590
591 Long endOffset;
592 }
593
  /**
   * Given a GATE document and the indexing parameters, builds one token
   * stream (a list of Lucene Tokens) per annotation of type
   * indexUnitAnnotationType, or a single stream covering the whole document
   * when no index unit is available. Each annotation contributes a "*" token
   * for its type plus one token per indexed feature; tokens belonging to the
   * same annotation (and co-starting annotations) share a Lucene position
   * via a zero position increment.
   *
   * @param document the document being indexed
   * @param inputAs the annotation set whose annotations are indexed
   * @param featuresToInclude fully qualified ("Type.feature") features to
   *          index; when non-empty, featuresToExclude is ignored
   * @param featuresToExclude fully qualified features to skip
   * @param baseTokenAnnotationType type of the base token annotations
   * @param baseTokenSet set containing the base token annotations; may be
   *          null or empty
   * @param indexUnitAnnotationType type of the index unit annotations; may
   *          be null
   * @param indexUnitSet set containing the index unit annotations; may be
   *          null or empty
   * @param indexedFeatures out-parameter collecting the names of all
   *          features that were actually indexed
   * @return one token list per index unit, or null when a unit contains no
   *         tokens at all
   */
  private ArrayList<Token>[] getTokens(gate.Document document,
          AnnotationSet inputAs, ArrayList<String> featuresToInclude,
          ArrayList<String> featuresToExclude, String baseTokenAnnotationType,
          AnnotationSet baseTokenSet, String indexUnitAnnotationType,
          AnnotationSet indexUnitSet, Set<String> indexedFeatures) {

    boolean excludeFeatures = false;
    boolean includeFeatures = false;

    // if include features are provided, the exclude list is ignored
    if(!featuresToInclude.isEmpty()) {
      includeFeatures = true;
    }
    else if(!featuresToExclude.isEmpty()) {
      excludeFeatures = true;
    }

    // collect the [start, end] offsets of every index unit
    HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
    if(indexUnitAnnotationType == null
            || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null
            || indexUnitSet.size() == 0) {
      // no index unit annotation type specified: consider the entire
      // document as a single unit
      OffsetGroup group = new OffsetGroup();
      group.startOffset = new Long(0);
      group.endOffset = document.getContent().size();
      unitOffsetsSet.add(group);
    }
    else {
      // one unit per index unit annotation (e.g. per sentence)
      Iterator<Annotation> iter = indexUnitSet.iterator();
      while(iter.hasNext()) {
        Annotation annotation = iter.next();
        OffsetGroup group = new OffsetGroup();
        group.startOffset = annotation.getStartNode().getOffset();
        group.endOffset = annotation.getEndNode().getOffset();
        unitOffsetsSet.add(group);
      }
    }

    Set<String> allTypes = new HashSet<String>();

    // type names containing '.', '=', ';' or ',' would clash with the ANNIC
    // query syntax, so annotations of such types cannot be indexed
    for(String aType : inputAs.getAllTypes()) {
      if(aType.indexOf(".") > -1 || aType.indexOf("=") > -1
              || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
        System.err
                .println("Annotations of type "
                        + aType
                        + " cannot be indexed as the type name contains one of the ., =, or ; character");
        continue;
      }
      allTypes.add(aType);
    }

    // base tokens and index units are handled separately, so drop them from
    // the generic annotation types
    if(baseTokenSet != null && baseTokenSet.size() > 0) {
      allTypes.remove(baseTokenAnnotationType);
    }

    if(indexUnitSet != null && indexUnitSet.size() > 0)
      allTypes.remove(indexUnitAnnotationType);

    // copy all remaining annotations into a scratch set so they can be
    // queried by containment per unit below
    AnnotationSet toUseSet = new AnnotationSetImpl(document);
    for(String type : allTypes) {
      for(Annotation a : inputAs.get(type)) {
        try {
          toUseSet.add(a.getStartNode().getOffset(),
                  a.getEndNode().getOffset(), a.getType(), a.getFeatures());
        }
        catch(InvalidOffsetException ioe) {
          throw new GateRuntimeException(ioe);
        }
      }
    }

    // one token stream per index unit
    ArrayList<Token> toReturn[] = new ArrayList[unitOffsetsSet.size()];
    Iterator<OffsetGroup> iter = unitOffsetsSet.iterator();
    int counter = 0;
    while(iter.hasNext()) {
      OffsetGroup group = iter.next();
      ArrayList<Token> newTokens = new ArrayList<Token>();
      ArrayList<Annotation> tokens =
              new ArrayList<Annotation>(toUseSet.getContained(group.startOffset,
                      group.endOffset));

      // add tokens from the baseTokenSet
      if(baseTokenSet != null && baseTokenSet.size() != 0) {
        tokens.addAll(baseTokenSet.getContained(group.startOffset,
                group.endOffset));
      }

      // NOTE(review): a single unit without any tokens aborts indexing of
      // the whole document - confirm this is intended rather than skipping
      // just the empty unit
      if(tokens == null || tokens.size() == 0) return null;

      Collections.sort(tokens, new OffsetComparator());

      // Lucene position of the current token; annotations starting at the
      // same offset share a position via a zero position increment
      int position = -1;
      for(int i = 0; i < tokens.size(); i++) {
        byte inc = 1;
        Annotation annot = tokens.get(i);
        String type = annot.getType();
        // if the type is listed in featuresToExclude - exclude it
        if(excludeFeatures && featuresToExclude.contains(type)) continue;

        // if the type is not listed in the include features - exclude it
        if(includeFeatures && !featuresToInclude.contains(type)) continue;

        int startOffset = annot.getStartNode().getOffset().intValue();
        int endOffset = annot.getEndNode().getOffset().intValue();
        // document text covered by the annotation
        String text =
                document.getContent().toString().substring(startOffset, endOffset);
        if(text == null) {
          continue;
        }

        // the annotation itself is indexed as type -> "*"
        Token token1 = new Token(type, startOffset, endOffset, "*");

        // each token carries four values (String, int, int, String) plus
        // the extra position info; co-starting annotations are stacked at
        // the same position
        if(i > 0) {
          if(annot.getStartNode().getOffset().longValue() == tokens.get(i - 1)
                  .getStartNode().getOffset().longValue()) {
            token1.setPositionIncrement(0);
            inc = 0;
          }
        }

        position += inc;
        token1.setPosition(position);
        newTokens.add(token1);

        // for non-base-token annotations, and for base tokens lacking a
        // "string" feature, index the covered text as Type.string
        if(!type.equals(baseTokenAnnotationType)
                || (annot.getFeatures().get("string") == null)) {
          Token tk1 = new Token(text, startOffset, endOffset, type + ".string");
          indexedFeatures.add(type + ".string");
          tk1.setPositionIncrement(0);
          tk1.setPosition(position);
          newTokens.add(tk1);
        }

        // now index the annotation's features
        FeatureMap features = annot.getFeatures();
        Iterator fIter = features.keySet().iterator();
        while(fIter.hasNext()) {
          String type1 = (String)fIter.next();
          // if the feature is listed in featuresToExclude - exclude it
          if(excludeFeatures && featuresToExclude.contains(type + "." + type1)) {
            continue;
          }

          // if the feature is not listed in the include features -
          // exclude it
          if(includeFeatures && !featuresToInclude.contains(type + "." + type1))
            continue;

          Object tempText = features.get(type1);
          if(tempText == null) continue;

          String text1 = (String)tempText.toString();
          // qualified token "Type.feature" -> value, used when searching
          // on feature values
          Token tempToken =
                  new Token(text1, startOffset, endOffset, type + "." + type1);
          indexedFeatures.add(type + "." + type1);
          tempToken.setPositionIncrement(0);
          tempToken.setPosition(position);
          newTokens.add(tempToken);

          // additionally index "Type.feature" -> "**" so queries and stats
          // can find annotations that merely carry the feature
          Token onlyATFeature =
                  new Token(type + "." + type1, startOffset, endOffset, "**");
          onlyATFeature.setPosition(position);
          onlyATFeature.setPositionIncrement(0);
          newTokens.add(onlyATFeature);

        }
      }
      toReturn[counter] = newTokens;
      counter++;
    }
    return toReturn;
  }
793 }
|