001 package gate.creole;
002
003 import java.io.*;
004 import java.util.*;
005
006 import gate.Annotation;
007 import gate.AnnotationSet;
008 import gate.creole.gazetteer.*;
009 import gate.util.*;
010
011 public class GazetteerListsCollector extends AbstractLanguageAnalyser {
012 private static String PERSON_ANNOT_NAME = "PER";
013
014 public void execute() throws gate.creole.ExecutionException {
015 //reinitialise the stats
016 statsPerType = new HashMap();
017
018 //check the input
019 if(document == null) {
020 throw new ExecutionException(
021 "No document to process!"
022 );
023 }
024
025 if (gazetteer == null) {
026 throw new ExecutionException(
027 "No gazetteer set!"
028 );
029 }
030
031 //if no annotation types given, then exit
032 if ((this.annotationTypes == null) || annotationTypes.isEmpty()) {
033 Out.prln("Gazetteer Lists Collector Warning: No annotation types given for processing");
034 return;
035 }
036
037 // get the annotations from document
038 if ((markupSetName == null)|| (markupSetName.equals("")))
039 allAnnots = document.getAnnotations();
040 else
041 allAnnots = document.getAnnotations(markupSetName);
042
043 //if none found, print warning and exit
044 if ((allAnnots == null) || allAnnots.isEmpty()) {
045 Out.prln("Gazetteer Lists Collector Warning: No annotations found for processing");
046 return;
047 }
048
049 //collect the stats for each annotation type
050 for (int i = 0; i < annotationTypes.size(); i++) {
051 AnnotationSet annots = allAnnots.get((String) annotationTypes.get(i));
052 if (annots == null || annots.isEmpty())
053 continue;
054 statsPerType.put(annotationTypes.get(i), new HashMap());
055 collectLists(annots, (String) annotationTypes.get(i));
056 }
057
058 //print out the stats in log files
059 printStats();
060
061 //save the updated gazetteer lists now
062 Map theLists = gazetteer.getLinearDefinition().getListsByNode();
063 Iterator iter1 = theLists.keySet().iterator();
064 while (iter1.hasNext()) {
065 GazetteerList theList = (GazetteerList) theLists.get(iter1.next());
066 try {
067 if (theList.isModified())
068 theList.store();
069 } catch (ResourceInstantiationException ex) {
070 throw new GateRuntimeException(ex.getMessage());
071 }
072 }
073
074 }
075
076 public void setMarkupASName(String newMarkupASName) {
077 markupSetName = newMarkupASName;
078 }
079
080 public String getMarkupASName() {
081 return markupSetName;
082 }
083
084 /** get the types of the annotation
085 * @return type of the annotation
086 */
087 public List getAnnotationTypes() {
088 return annotationTypes;
089 }//getAnnotationTypes
090
091 /** set the types of the annotations
092 * @param newType
093 */
094 public void setAnnotationTypes(List newType) {
095 annotationTypes = newType;
096 }//setAnnotationTypes
097
098 public Gazetteer getGazetteer() {
099 return gazetteer;
100 }
101
102 public void setGazetteer(Gazetteer theGaz) {
103 gazetteer = theGaz;
104 }
105
106 public void setTheLanguage(String language) {
107 theLanguage = language;
108 }
109
110 public String getTheLanguage() {
111 return theLanguage;
112 }
113
114 protected void collectLists(AnnotationSet annots, String annotType) {
115 Iterator<Annotation> iter = annots.iterator();
116 String listName = "";
117 GazetteerList theList = null;
118 Iterator theListsIter =
119 gazetteer.getLinearDefinition().getListsByNode().values().iterator();
120 while (theListsIter.hasNext() && listName.equals("")) {
121 theList = (GazetteerList) theListsIter.next();
122 if (theList.getURL().toExternalForm().endsWith(annotType + ".lst"))
123 listName = theList.getURL().toExternalForm();
124 }
125 while (iter.hasNext()) {
126 Annotation annot = iter.next();
127 String text = "";
128 List strings = new ArrayList();
129 try {
130 text = document.getContent().getContent(
131 annot.getStartNode().getOffset(),
132 annot.getEndNode().getOffset()
133 ).toString();
134 //tokenise the text and save for the future if we need it
135 StringTokenizer tok = new StringTokenizer(text, "\n\r.|();-?!\t", false);
136 while (tok.hasMoreTokens())
137 strings.add(tok.nextToken());
138 //then replace the line breaks with spaces for the gazetteer
139 text = text.replace('\r', ' ');
140 text = text.replace('\n', ' ');
141 text = text.replace('\t', ' ');
142
143 } catch (InvalidOffsetException ex) {
144 throw new GateRuntimeException(ex.getMessage());
145 }
146
147 //collect stats for the string
148 if (((HashMap) statsPerType.get(annotType)).containsKey(text))
149 ((HashMap) statsPerType.get(annotType)).put(text,
150 new Integer(((Integer)
151 ((HashMap) statsPerType.get(annotType)).get(text)).intValue()+1));
152 else
153 ((HashMap) statsPerType.get(annotType)).put(text, new Integer(1));
154
155 //also collect stats for the individual tokens in the name to identify the most
156 //frequent tokens across names
157 if (strings.size() > 1) {
158 for (int i=0; i < strings.size(); i++) {
159 String theString = (String) strings.get(i);
160 //collect stats for the string
161 if ( ( (HashMap) statsPerType.get(annotType)).containsKey(theString))
162 ( (HashMap) statsPerType.get(annotType)).put(theString,
163 new Integer( ( (Integer)
164 ( (HashMap) statsPerType.get(annotType)).get(
165 theString)).intValue() + 1));
166 else
167 ( (HashMap) statsPerType.get(annotType)).put(theString,
168 new Integer(1));
169 }
170 }
171
172 //first we check whether the text is already in the gazetteer
173 Set lookupResult = gazetteer.lookup(text);
174 if (lookupResult != null && lookupResult.size() > 0)
175 continue;
176 //if not, then we add it
177 gazetteer.add(text,
178 new Lookup(listName, annotType, "inferred", theLanguage));
179 // theList.add(text + document.getSourceUrl().toString());
180 theList.add(text);
181
182
183 //for persons we want also to add their individual names to the list
184 if (annotType.equals(PERSON_ANNOT_NAME) && strings.size() > 1) {
185 for (int i=0; i < strings.size(); i++) {
186 String theString = (String) strings.get(i);
187 Set lookupResult1 = gazetteer.lookup(theString);
188 if (lookupResult1 != null && lookupResult1.size() > 0)
189 continue;
190 if (theString.length() < 3)
191 continue;
192 gazetteer.add(theString,
193 new Lookup(listName, annotType, "inferred", theLanguage));
194 theList.add(theString);
195 }
196 }
197 }
198 }
199
200 protected void printStats() {
201 try {
202 for (int i=0; i < annotationTypes.size(); i++) {
203 if (! statsPerType.containsKey(annotationTypes.get(i)))
204 continue;
205 BufferedWriter writer = new BufferedWriter(
206 new OutputStreamWriter(new FileOutputStream(
207 annotationTypes.get(i) + ".stats.lst"),
208 "UTF-8"));
209 HashMap stats = (HashMap) statsPerType.get(annotationTypes.get(i));
210 Iterator stringsIter = stats.keySet().iterator();
211 while (stringsIter.hasNext()) {
212 String string = (String) stringsIter.next();
213 writer.write(string);
214 writer.write("$");
215 writer.write( ((Integer)stats.get(string)).toString());
216 writer.newLine();
217 }
218 writer.close();
219 }
220 } catch(IOException ioe){
221 throw new RuntimeException(ioe.getMessage());
222 }//try
223
224 }
225
226 /**
227 * The idea is to have this method check if an item
228 * is already present in the gazetteer under this type,
229 * and if so, not to add it. It is not implemented for now.
230 */
231 protected boolean alreadyPresentInGazetteer(String token) {
232 return false;
233 }
234
235 private String markupSetName = "";
236 private AnnotationSet allAnnots;
237 private List annotationTypes;
238 private Gazetteer gazetteer;
239 private String theLanguage = "";
240 private HashMap statsPerType = new HashMap();
241 }
|