BasicAnnotationOrthography.java
001 package gate.creole.orthomatcher;
002 
003 import java.io.BufferedReader;
004 import java.io.IOException;
005 import java.io.InputStreamReader;
006 import java.net.URL;
007 import java.util.ArrayList;
008 import java.util.Arrays;
009 import java.util.Collections;
010 import java.util.HashMap;
011 import java.util.HashSet;
012 import java.util.Iterator;
013 import java.util.List;
014 import java.util.Map;
015 import java.util.Set;
016 import java.util.regex.Pattern;
017 
018 import org.apache.log4j.Logger;
019 
020 import gate.Annotation;
021 import gate.AnnotationSet;
022 import gate.Document;
023 import gate.Factory;
024 import gate.FeatureMap;
025 import gate.creole.ExecutionException;
026 import gate.util.BomStrippingInputStreamReader;
027 import gate.util.Err;
028 import gate.util.InvalidOffsetException;
029 
030 import static gate.creole.ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME;
031 import static gate.creole.ANNIEConstants.LOOKUP_ANNOTATION_TYPE;
032 import static gate.creole.ANNIEConstants.PERSON_GENDER_FEATURE_NAME;
033 
034 import static gate.creole.orthomatcher.OrthoMatcherHelper.*;
035 
/**
 * The default {@link AnnotationOrthography}: it implements the primary
 * behaviour of the OrthoMatcher processing resource in GATE.
 */
040 public class BasicAnnotationOrthography implements AnnotationOrthography {
041 
042   private final boolean extLists;
043   private final String personType;
044   private final String unknownType;
045   private Map<String,HashSet<String>> nicknameMap = new HashMap<String, HashSet<String>>();
046   private final Double minimumNicknameLikelihood;
047 
048   public BasicAnnotationOrthography(String personType, boolean extLists,
049           String unknownType, URL nicknameFile, Double minimumNicknameLikelihood, String encoding) {
050     this.personType = personType;
051     this.extLists = extLists;
052     this.unknownType=unknownType;
053     this.minimumNicknameLikelihood = minimumNicknameLikelihood;
054     try {
055       if (nicknameFile != null)
056         this.initNicknames(encoding, nicknameFile);
057     }
058     catch(IOException e) {
059       log.warn("Could not load nickname map.", e);
060     }
061   }
062 
063   protected static final Logger log = Logger.getLogger(BasicAnnotationOrthography.class);
064 
065 
066   public String getStringForAnnotation(Annotation a, gate.Document dthrows ExecutionException {
067     String annotString = getStringForSpan(a.getStartNode().getOffset(),a.getEndNode().getOffset(), d);
068     // now do the reg. exp. substitutions
069     annotString = annotString.replaceAll("\\s+"" ");
070 
071     return annotString;
072   }
073 
074   public boolean fuzzyMatch (String s1, String s2) {
075 
076     String s1Lower = s1.toLowerCase();
077     String s2Lower = s2.toLowerCase();
078     if (s1Lower.equals(s2Lower)) {
079       return true;
080     }
081     // System.out.println("Now comparing " + s1 + " | " + s2) ;
082     Set<String> formalNameSet = nicknameMap.get(s1Lower);
083     if (formalNameSet != null) {
084       if (formalNameSet.contains(s2Lower)) {
085         return true;
086       }
087     }
088     formalNameSet = nicknameMap.get(s2Lower);
089     if (formalNameSet != null) {
090       if (formalNameSet.contains(s1Lower)) {
091         return true;
092       }
093     }
094     return false;
095   }
096 
097   /**
098    @return true if all of the tokens in firstName are either found in second name or are stop words
099    */
100   public boolean allNonStopTokensInOtherAnnot(ArrayList<Annotation> firstName,ArrayList<Annotation> secondName,String TOKEN_STRING_FEATURE_NAME,boolean caseSensitive) {
101     for (Annotation a : firstName) {
102       if (!a.getFeatures().containsKey("ortho_stop")) {
103         String aString = (Stringa.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
104         boolean foundAMatchInSecond = false;
105         for (Annotation b: secondName) {
106           if (OrthoMatcherHelper.straightCompare(aString,(Stringb.getFeatures().get(TOKEN_STRING_FEATURE_NAME),caseSensitive)) {
107             foundAMatchInSecond = true;
108             break;
109           }
110         }
111         if (!foundAMatchInSecond) {
112           return false;
113         }
114       }
115     }
116     return true;
117   }
118 
119   /**
120    * Return a person name without a title.  Also remove title from global variable
121    * tokensMap
122    */
123   public String stripPersonTitle (String annotString, Annotation annot, Document doc, Map<Integer, List<Annotation>> tokensMap, HashMap normalizedTokensMap,AnnotationSet nameAllAnnots)
124   throws ExecutionException {
125 
126     FeatureMap queryFM = Factory.newFeatureMap();
127 
128     // get the offsets
129     Long startAnnot = annot.getStartNode().getOffset();
130     Long endAnnot = annot.getEndNode().getOffset();
131 
132     // determine "Lookup" annotation set
133     queryFM.clear();
134     queryFM.put("majorType""title");
135     AnnotationSet as1 = nameAllAnnots.getContained(startAnnot,endAnnot);
136     if (as1 == null || as1.isEmpty())
137       return annotString;
138     AnnotationSet as =
139       as1.get("Lookup", queryFM);
140     if (as !=null && ! as.isEmpty()) {
141       List<Annotation> titles = new ArrayList<Annotation>(as);
142       Collections.sort(titles, new gate.util.OffsetComparator());
143 
144       Iterator<Annotation> iter = titles.iterator();
145       while (iter.hasNext()) {
146         Annotation titleAnn = iter.next();
147 
148         //we've not found a title at the start offset,
149         //there's no point in looking further
150         //coz titles come first
151         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot!= 0)
152           return annotString;
153 
154         try {
155           // the title from the current annotation
156           String annotTitle =
157             doc.getContent().getContent(
158                     titleAnn.getStartNode().getOffset(),
159                     titleAnn.getEndNode().getOffset()
160             ).toString();
161 
162           // eliminate the title from annotation string and return the result
163           if (annotTitle.length()<annotString.length()) {
164             //remove from the array of tokens, so then we can compare properly
165             //the remaining tokens
166             //            log.debug("Removing title from: " + annot + " with string " + annotString);
167             //            log.debug("Tokens are " + tokensMap.get(annot.getId()));
168             //            log.debug("Title is " + annotTitle);
169             ((ArrayListtokensMap.get(annot.getId())).remove(0);
170             ((ArrayListnormalizedTokensMap.get(annot.getId())).remove(0);
171             return annotString.substring(
172                     annotTitle.length()+1,annotString.length());
173           }
174         catch (InvalidOffsetException ioe) {
175           throw new ExecutionException
176           ("Invalid offset of the annotation");
177         }//try
178       }// while
179     }//if
180     return annotString;
181 
182   }
183 
184   public boolean matchedAlready(Annotation annot1, Annotation annot2,List matchesDocFeature,AnnotationSet nameAllAnnots) {
185     //the two annotations are already matched if the matches list of the first
186     //contains the id of the second
187     List matchesList = (Listannot1.getFeatures().
188     get(ANNOTATION_COREF_FEATURE_NAME);
189     if ((matchesList == null|| matchesList.isEmpty())
190       return false;
191     else if (matchesList.contains(annot2.getId()))
192       return true;
193     return false;
194   }
195 
196   public Annotation updateMatches(Annotation newAnnot, String annotString,HashMap processedAnnots,AnnotationSet nameAllAnnots,List matchesDocFeature) {
197     Annotation matchedAnnot = null;
198     Integer id;
199 
200     //first find a processed annotation with the same string
201     // TODO: Andrew Borthwick 7/26/08:  The below is very inefficient.  We should be doing a lookup into a hash
202     // which is indexed on string rather than testing every id. Need to have the index be String + Type
203     // for safety
204     Iterator iter = processedAnnots.keySet().iterator();
205     // System.out.println("ID's examined: ");
206     while (iter.hasNext()) {
207       id = (Integeriter.next();
208       String oldString = (StringprocessedAnnots.get(id);
209       // System.out.print(id + " ");
210       if (annotString.equals(oldString)) {
211         Annotation tempAnnot = nameAllAnnots.get(id);
212         if (tempAnnot == null) {
213           log.warn("Orthomatcher: TempAnnot is null when looking at " + annotString
214                   " | " + oldString + " | old id: " + id);
215           return null;
216         }
217         // Below is a new Spock addition to prevent unpredictable behavior when
218         // the same string is given more than one type.  We want to return null
219         // if there is no match on name + type (other than Unknown)
220         if (newAnnot.getType().equals(unknownType||
221                 tempAnnot.getType().equals(newAnnot.getType())) {
222           matchedAnnot = tempAnnot;
223           break;
224         }
225       }
226     }//while
227     // System.out.println();
228 
229     if (matchedAnnot == nullreturn null;
230 
231     List matchesList = (ListmatchedAnnot.getFeatures().
232     get(ANNOTATION_COREF_FEATURE_NAME);
233     if ((matchesList == null|| matchesList.isEmpty()) {
234       //no previous matches, so need to add
235       if (matchesList == null) {
236         matchesList = new ArrayList();
237         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
238                 matchesList);
239         matchesDocFeature.add(matchesList);
240       }//if
241       matchesList.add(matchedAnnot.getId());
242       matchesList.add(newAnnot.getId());
243     else {
244       //just add the new annotation
245       matchesList.add(newAnnot.getId());
246     }//if
247     //add the matches list to the new annotation
248     newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
249     return matchedAnnot;
250   }
251 
252   public void updateMatches(Annotation newAnnot, Annotation prevAnnot,List matchesDocFeature,AnnotationSet nameAllAnnots) {
253 
254     List matchesList = (ListprevAnnot.getFeatures().
255     get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
256     if ((matchesList == null|| matchesList.isEmpty()) {
257       //no previous matches, so need to add
258       if (matchesList == null) {
259         matchesList = new ArrayList();
260         prevAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
261         matchesDocFeature.add(matchesList);
262       }//if
263       matchesList.add(prevAnnot.getId());
264       matchesList.add(newAnnot.getId());
265     else {
266       //just add the new annotation
267       matchesList.add(newAnnot.getId());
268     }//if
269     //add the matches list to the new annotation
270     newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME, matchesList);
271     //propagate the gender if two persons are matched
272     if (prevAnnot.getType().equals(this.personType)) {
273       String prevGender =
274         (StringprevAnnot.getFeatures().get(OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
275       String newGender =
276         (StringnewAnnot.getFeatures().get(OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
277       boolean unknownPrevGender = isUnknownGender(prevGender);
278       boolean unknownNewGender = isUnknownGender(newGender);
279       if (unknownPrevGender && !unknownNewGender)
280         prevAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME, newGender);
281       else if (unknownNewGender && !unknownPrevGender)
282         newAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME, prevGender);
283     }//if
284   }
285 
286   /** Tables for namematch info
287    * (used by the namematch rules)
288    @return
289    */
290   public HashSet buildTables(AnnotationSet nameAllAnnots) {
291 
292     FeatureMap tempMap = Factory.newFeatureMap();
293     //reset the tables first
294     HashSet cdg = new HashSet();
295 
296     if (!extLists) {
297       // i.e. get cdg from Lookup annotations
298       // get all Lookup annotations
299       tempMap.clear();
300       tempMap.put(gate.creole.ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
301       //now get all lookup annotations which are cdg
302       AnnotationSet nameAnnots =
303         nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
304 
305       if ((nameAnnots ==null|| nameAnnots.isEmpty())
306         return cdg;
307 
308       Iterator<Annotation> iter = nameAnnots.iterator();
309       while (iter.hasNext()) {
310         Annotation annot = iter.next();
311         // get the actual string
312         Long offsetStartAnnot = annot.getStartNode().getOffset();
313         Long offsetEndAnnot = annot.getEndNode().getOffset();
314         try {
315           gate.Document doc = nameAllAnnots.getDocument();
316           String annotString =
317             doc.getContent().getContent(
318                     offsetStartAnnot,offsetEndAnnot
319             ).toString();
320           cdg.add(annotString);
321         catch (InvalidOffsetException ioe) {
322           ioe.printStackTrace(Err.getPrintWriter());
323         }
324       }// while
325     }//if
326 
327     return cdg;
328   }//buildTables
329 
330 
331   public boolean isUnknownGender(String gender) {
332     if (gender == null)
333       return true;
334     if (gender.equalsIgnoreCase("male"|| gender.equalsIgnoreCase("female"))
335       return false;
336     return true;
337 
338   //isUnknownGender
339 
340   protected Map<String,HashSet<String>> initNicknames(
341           String nicknameFileEncoding,
342           java.net.URL fileURL
343   throws IOException {
344 
345 
346 
347     Pattern spacePat = Pattern.compile("(\\s+)");
348     nicknameMap = new HashMap<String,HashSet<String>>();
349 
350     //create the relative URL
351     BufferedReader reader = new BomStrippingInputStreamReader(fileURL.openStream(),
352                     nicknameFileEncoding);
353     String lineRead = null;
354     int ctr = 0;
355     while ((lineRead = reader.readLine()) != null){
356       if (lineRead.length() == || lineRead.charAt(0== '#') {
357         continue;
358       }
359 
360       ArrayList<String> nickNameLine =
361         new ArrayList<String>(Arrays.asList(spacePat.split(lineRead.toLowerCase().trim())));
362       if (nickNameLine.size() != &&
363               (nickNameLine.size() != && ((nickNameLine.get(3!= "M"|| nickNameLine.get(3!= "F"))) {
364         continue;
365       }
366       if (round2Places(Double.valueOf(nickNameLine.get(2))) < OrthoMatcherHelper.round2Places(minimumNicknameLikelihood)) {
367         continue;
368       }
369       if (nicknameMap.containsKey(nickNameLine.get(0))) {
370         /*        System.out.println("Adding to existing nickname of " + nickNameLine.get(0) + " "
371               + nickNameLine.get(1));*/
372         nicknameMap.get(nickNameLine.get(0)).add(nickNameLine.get(1));
373       }
374       else {
375         /*          System.out.println("Adding new nickname of " + nickNameLine.get(0) + " "
376                 + nickNameLine.get(1));*/
377         nicknameMap.put(nickNameLine.get(0),
378                 new HashSet<String>(Collections.singleton(nickNameLine.get(1))));
379       }
380 
381     }
382 
383     return nicknameMap;
384   }
385 
386 }