001 package gate.creole.orthomatcher;
002
003 import java.io.BufferedReader;
004 import java.io.IOException;
005 import java.io.InputStreamReader;
006 import java.net.URL;
007 import java.util.ArrayList;
008 import java.util.Arrays;
009 import java.util.Collections;
010 import java.util.HashMap;
011 import java.util.HashSet;
012 import java.util.Iterator;
013 import java.util.List;
014 import java.util.Map;
015 import java.util.Set;
016 import java.util.regex.Pattern;
017
018 import org.apache.log4j.Logger;
019
020 import gate.Annotation;
021 import gate.AnnotationSet;
022 import gate.Document;
023 import gate.Factory;
024 import gate.FeatureMap;
025 import gate.creole.ExecutionException;
026 import gate.util.BomStrippingInputStreamReader;
027 import gate.util.Err;
028 import gate.util.InvalidOffsetException;
029
030 import static gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME;
031 import static gate.creole.ANNIEConstants.LOOKUP_ANNOTATION_TYPE;
032 import static gate.creole.ANNIEConstants.PERSON_GENDER_FEATURE_NAME;
033
034 import static gate.creole.orthomatcher.OrthoMatcherHelper.*;
035
036 /*
037 * This class defines an orthography which defines the primary behavior of the Orthomatcher processing
038 * resource in GATE.
039 */
040 public class BasicAnnotationOrthography implements AnnotationOrthography {
041
042 private final boolean extLists;
043 private final String personType;
044 private final String unknownType;
045 private Map<String,HashSet<String>> nicknameMap = new HashMap<String, HashSet<String>>();
046 private final Double minimumNicknameLikelihood;
047
048 public BasicAnnotationOrthography(String personType, boolean extLists,
049 String unknownType, URL nicknameFile, Double minimumNicknameLikelihood, String encoding) {
050 this.personType = personType;
051 this.extLists = extLists;
052 this.unknownType=unknownType;
053 this.minimumNicknameLikelihood = minimumNicknameLikelihood;
054 try {
055 if (nicknameFile != null)
056 this.initNicknames(encoding, nicknameFile);
057 }
058 catch(IOException e) {
059 log.warn("Could not load nickname map.", e);
060 }
061 }
062
063 protected static final Logger log = Logger.getLogger(BasicAnnotationOrthography.class);
064
065
066 public String getStringForAnnotation(Annotation a, gate.Document d) throws ExecutionException {
067 String annotString = getStringForSpan(a.getStartNode().getOffset(),a.getEndNode().getOffset(), d);
068 // now do the reg. exp. substitutions
069 annotString = annotString.replaceAll("\\s+", " ");
070
071 return annotString;
072 }
073
074 public boolean fuzzyMatch (String s1, String s2) {
075
076 String s1Lower = s1.toLowerCase();
077 String s2Lower = s2.toLowerCase();
078 if (s1Lower.equals(s2Lower)) {
079 return true;
080 }
081 // System.out.println("Now comparing " + s1 + " | " + s2) ;
082 Set<String> formalNameSet = nicknameMap.get(s1Lower);
083 if (formalNameSet != null) {
084 if (formalNameSet.contains(s2Lower)) {
085 return true;
086 }
087 }
088 formalNameSet = nicknameMap.get(s2Lower);
089 if (formalNameSet != null) {
090 if (formalNameSet.contains(s1Lower)) {
091 return true;
092 }
093 }
094 return false;
095 }
096
097 /**
098 * @return true if all of the tokens in firstName are either found in second name or are stop words
099 */
100 public boolean allNonStopTokensInOtherAnnot(ArrayList<Annotation> firstName,ArrayList<Annotation> secondName,String TOKEN_STRING_FEATURE_NAME,boolean caseSensitive) {
101 for (Annotation a : firstName) {
102 if (!a.getFeatures().containsKey("ortho_stop")) {
103 String aString = (String) a.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
104 boolean foundAMatchInSecond = false;
105 for (Annotation b: secondName) {
106 if (OrthoMatcherHelper.straightCompare(aString,(String) b.getFeatures().get(TOKEN_STRING_FEATURE_NAME),caseSensitive)) {
107 foundAMatchInSecond = true;
108 break;
109 }
110 }
111 if (!foundAMatchInSecond) {
112 return false;
113 }
114 }
115 }
116 return true;
117 }
118
119 /**
120 * Return a person name without a title. Also remove title from global variable
121 * tokensMap
122 */
123 public String stripPersonTitle (String annotString, Annotation annot, Document doc, Map<Integer, List<Annotation>> tokensMap, HashMap normalizedTokensMap,AnnotationSet nameAllAnnots)
124 throws ExecutionException {
125
126 FeatureMap queryFM = Factory.newFeatureMap();
127
128 // get the offsets
129 Long startAnnot = annot.getStartNode().getOffset();
130 Long endAnnot = annot.getEndNode().getOffset();
131
132 // determine "Lookup" annotation set
133 queryFM.clear();
134 queryFM.put("majorType", "title");
135 AnnotationSet as1 = nameAllAnnots.getContained(startAnnot,endAnnot);
136 if (as1 == null || as1.isEmpty())
137 return annotString;
138 AnnotationSet as =
139 as1.get("Lookup", queryFM);
140 if (as !=null && ! as.isEmpty()) {
141 List<Annotation> titles = new ArrayList<Annotation>(as);
142 Collections.sort(titles, new gate.util.OffsetComparator());
143
144 Iterator<Annotation> iter = titles.iterator();
145 while (iter.hasNext()) {
146 Annotation titleAnn = iter.next();
147
148 //we've not found a title at the start offset,
149 //there's no point in looking further
150 //coz titles come first
151 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
152 return annotString;
153
154 try {
155 // the title from the current annotation
156 String annotTitle =
157 doc.getContent().getContent(
158 titleAnn.getStartNode().getOffset(),
159 titleAnn.getEndNode().getOffset()
160 ).toString();
161
162 // eliminate the title from annotation string and return the result
163 if (annotTitle.length()<annotString.length()) {
164 //remove from the array of tokens, so then we can compare properly
165 //the remaining tokens
166 // log.debug("Removing title from: " + annot + " with string " + annotString);
167 // log.debug("Tokens are " + tokensMap.get(annot.getId()));
168 // log.debug("Title is " + annotTitle);
169 ((ArrayList) tokensMap.get(annot.getId())).remove(0);
170 ((ArrayList) normalizedTokensMap.get(annot.getId())).remove(0);
171 return annotString.substring(
172 annotTitle.length()+1,annotString.length());
173 }
174 } catch (InvalidOffsetException ioe) {
175 throw new ExecutionException
176 ("Invalid offset of the annotation");
177 }//try
178 }// while
179 }//if
180 return annotString;
181
182 }
183
184 public boolean matchedAlready(Annotation annot1, Annotation annot2,List matchesDocFeature,AnnotationSet nameAllAnnots) {
185 //the two annotations are already matched if the matches list of the first
186 //contains the id of the second
187 List matchesList = (List) annot1.getFeatures().
188 get(ANNOTATION_COREF_FEATURE_NAME);
189 if ((matchesList == null) || matchesList.isEmpty())
190 return false;
191 else if (matchesList.contains(annot2.getId()))
192 return true;
193 return false;
194 }
195
196 public Annotation updateMatches(Annotation newAnnot, String annotString,HashMap processedAnnots,AnnotationSet nameAllAnnots,List matchesDocFeature) {
197 Annotation matchedAnnot = null;
198 Integer id;
199
200 //first find a processed annotation with the same string
201 // TODO: Andrew Borthwick 7/26/08: The below is very inefficient. We should be doing a lookup into a hash
202 // which is indexed on string rather than testing every id. Need to have the index be String + Type
203 // for safety
204 Iterator iter = processedAnnots.keySet().iterator();
205 // System.out.println("ID's examined: ");
206 while (iter.hasNext()) {
207 id = (Integer) iter.next();
208 String oldString = (String) processedAnnots.get(id);
209 // System.out.print(id + " ");
210 if (annotString.equals(oldString)) {
211 Annotation tempAnnot = nameAllAnnots.get(id);
212 if (tempAnnot == null) {
213 log.warn("Orthomatcher: TempAnnot is null when looking at " + annotString
214 + " | " + oldString + " | old id: " + id);
215 return null;
216 }
217 // Below is a new Spock addition to prevent unpredictable behavior when
218 // the same string is given more than one type. We want to return null
219 // if there is no match on name + type (other than Unknown)
220 if (newAnnot.getType().equals(unknownType) ||
221 tempAnnot.getType().equals(newAnnot.getType())) {
222 matchedAnnot = tempAnnot;
223 break;
224 }
225 }
226 }//while
227 // System.out.println();
228
229 if (matchedAnnot == null) return null;
230
231 List matchesList = (List) matchedAnnot.getFeatures().
232 get(ANNOTATION_COREF_FEATURE_NAME);
233 if ((matchesList == null) || matchesList.isEmpty()) {
234 //no previous matches, so need to add
235 if (matchesList == null) {
236 matchesList = new ArrayList();
237 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
238 matchesList);
239 matchesDocFeature.add(matchesList);
240 }//if
241 matchesList.add(matchedAnnot.getId());
242 matchesList.add(newAnnot.getId());
243 } else {
244 //just add the new annotation
245 matchesList.add(newAnnot.getId());
246 }//if
247 //add the matches list to the new annotation
248 newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
249 return matchedAnnot;
250 }
251
252 public void updateMatches(Annotation newAnnot, Annotation prevAnnot,List matchesDocFeature,AnnotationSet nameAllAnnots) {
253
254 List matchesList = (List) prevAnnot.getFeatures().
255 get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
256 if ((matchesList == null) || matchesList.isEmpty()) {
257 //no previous matches, so need to add
258 if (matchesList == null) {
259 matchesList = new ArrayList();
260 prevAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
261 matchesDocFeature.add(matchesList);
262 }//if
263 matchesList.add(prevAnnot.getId());
264 matchesList.add(newAnnot.getId());
265 } else {
266 //just add the new annotation
267 matchesList.add(newAnnot.getId());
268 }//if
269 //add the matches list to the new annotation
270 newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
271 //propagate the gender if two persons are matched
272 if (prevAnnot.getType().equals(this.personType)) {
273 String prevGender =
274 (String) prevAnnot.getFeatures().get(OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
275 String newGender =
276 (String) newAnnot.getFeatures().get(OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
277 boolean unknownPrevGender = isUnknownGender(prevGender);
278 boolean unknownNewGender = isUnknownGender(newGender);
279 if (unknownPrevGender && !unknownNewGender)
280 prevAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME, newGender);
281 else if (unknownNewGender && !unknownPrevGender)
282 newAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME, prevGender);
283 }//if
284 }
285
286 /** Tables for namematch info
287 * (used by the namematch rules)
288 * @return
289 */
290 public HashSet buildTables(AnnotationSet nameAllAnnots) {
291
292 FeatureMap tempMap = Factory.newFeatureMap();
293 //reset the tables first
294 HashSet cdg = new HashSet();
295
296 if (!extLists) {
297 // i.e. get cdg from Lookup annotations
298 // get all Lookup annotations
299 tempMap.clear();
300 tempMap.put(gate.creole.ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
301 //now get all lookup annotations which are cdg
302 AnnotationSet nameAnnots =
303 nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
304
305 if ((nameAnnots ==null) || nameAnnots.isEmpty())
306 return cdg;
307
308 Iterator<Annotation> iter = nameAnnots.iterator();
309 while (iter.hasNext()) {
310 Annotation annot = iter.next();
311 // get the actual string
312 Long offsetStartAnnot = annot.getStartNode().getOffset();
313 Long offsetEndAnnot = annot.getEndNode().getOffset();
314 try {
315 gate.Document doc = nameAllAnnots.getDocument();
316 String annotString =
317 doc.getContent().getContent(
318 offsetStartAnnot,offsetEndAnnot
319 ).toString();
320 cdg.add(annotString);
321 } catch (InvalidOffsetException ioe) {
322 ioe.printStackTrace(Err.getPrintWriter());
323 }
324 }// while
325 }//if
326
327 return cdg;
328 }//buildTables
329
330
331 public boolean isUnknownGender(String gender) {
332 if (gender == null)
333 return true;
334 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
335 return false;
336 return true;
337
338 } //isUnknownGender
339
340 protected Map<String,HashSet<String>> initNicknames(
341 String nicknameFileEncoding,
342 java.net.URL fileURL
343 ) throws IOException {
344
345
346
347 Pattern spacePat = Pattern.compile("(\\s+)");
348 nicknameMap = new HashMap<String,HashSet<String>>();
349
350 //create the relative URL
351 BufferedReader reader = new BomStrippingInputStreamReader(fileURL.openStream(),
352 nicknameFileEncoding);
353 String lineRead = null;
354 int ctr = 0;
355 while ((lineRead = reader.readLine()) != null){
356 if (lineRead.length() == 0 || lineRead.charAt(0) == '#') {
357 continue;
358 }
359
360 ArrayList<String> nickNameLine =
361 new ArrayList<String>(Arrays.asList(spacePat.split(lineRead.toLowerCase().trim())));
362 if (nickNameLine.size() != 3 &&
363 (nickNameLine.size() != 4 && ((nickNameLine.get(3) != "M") || nickNameLine.get(3) != "F"))) {
364 continue;
365 }
366 if (round2Places(Double.valueOf(nickNameLine.get(2))) < OrthoMatcherHelper.round2Places(minimumNicknameLikelihood)) {
367 continue;
368 }
369 if (nicknameMap.containsKey(nickNameLine.get(0))) {
370 /* System.out.println("Adding to existing nickname of " + nickNameLine.get(0) + " "
371 + nickNameLine.get(1));*/
372 nicknameMap.get(nickNameLine.get(0)).add(nickNameLine.get(1));
373 }
374 else {
375 /* System.out.println("Adding new nickname of " + nickNameLine.get(0) + " "
376 + nickNameLine.get(1));*/
377 nicknameMap.put(nickNameLine.get(0),
378 new HashSet<String>(Collections.singleton(nickNameLine.get(1))));
379 }
380
381 }
382
383 return nicknameMap;
384 }
385
386 }
|