0001 /*
0002 * OrthoMatcher.java
0003 *
0004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
0005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
0006 *
0007 * This file is part of GATE (see http://gate.ac.uk/), and is free
0008 * software, licenced under the GNU Library General Public License,
0009 * Version 2, June 1991 (in the distribution as file licence.html,
0010 * and also available at http://gate.ac.uk/gate/licence.html).
0011 *
0012 * Kalina Bontcheva, 24/August/2001
0013 *
0014 * Major update by Andrew Borthwick of Spock Networks, 11/13/2007 - 8/3/2008:
0015 * 1. matchWithPrevious now searches for matching annotations in order, starting from current and working backwards
0016 * until it finds a match. This compares with the previous behavior, which searched randomly among previous annotations
0017 * for a match (because it used an iterator across an AnnotationSet, whereas now we iterate across an ArrayList<Annotation>)
0018 * 2. We no longer require that identical strings always refer to the same entity. We can correctly match
0019 * the sequence "David Jones ... David ... David Smith ... David" as referring to two people, tying the first
0020 * David to "David Jones" and the second David to "David Smith". Ditto with David Jones .. Mr. Jones ..
0021 * Richard Jones .. Mr. Jones
0022 * 3. We now allow for nickname matches for Persons (David = Dave) via the "fuzzyMatch" method which is referenced
0023 * in some of the matching rules.
0024 * 4. Optional parameter highPrecisionOrgs only allows high precision matches for organizations and
0025 * turns off the riskier rules. Under this option, need to match on something like IBM = IBM Corp.
0026 * 5. Various fixes to a number of rules
0027 *
0028 * $Id: OrthoMatcher.java 8929 2007-07-12 16:49:55Z ian_roberts $
0029 */
0030
0031 package gate.creole.orthomatcher;
0032
0033 import java.io.*;
0034 import java.net.URL;
0035 import java.util.*;
0036 import java.util.regex.Matcher;
0037 import java.util.regex.Pattern;
0038
0039 import org.apache.log4j.Logger;
0040
0041 import gate.*;
0042 import gate.creole.*;
0043 import gate.util.*;
0044
0045 public class OrthoMatcher extends AbstractLanguageAnalyser {
/** Shared log4j logger for this class. */
protected static final Logger log = Logger.getLogger(OrthoMatcher.class);

/** Compile-time switch that guards the extra debug log statements in this class; off by default. */
public static final boolean DEBUG = false;

// Parameter names exposed as constants.
// NOTE(review): presumably these mirror the property names used in the CREOLE
// configuration of this resource - confirm against creole.xml.
public static final String
  OM_DOCUMENT_PARAMETER_NAME = "document";

public static final String
  OM_ANN_SET_PARAMETER_NAME = "annotationSetName";

public static final String
  OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

public static final String
  OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";

public static final String
  OM_ORG_TYPE_PARAMETER_NAME = "organizationType";

public static final String
  OM_PERSON_TYPE_PARAMETER_NAME = "personType";

public static final String
  OM_EXT_LISTS_PARAMETER_NAME = "extLists";

// Identifiers of the lookup lists; each one matches the name of a lookup-table
// field declared in this class (cdg, alias, def_art, prepos, connector, spur_match).
// NOTE(review): presumably compared against the list names read from the
// definition file in createAnnotList() - that method is not visible here, confirm.
protected static final String CDGLISTNAME = "cdg";
protected static final String ALIASLISTNAME = "alias";
protected static final String ARTLISTNAME = "def_art";
protected static final String PREPLISTNAME = "prepos";
protected static final String CONNECTORLISTNAME = "connector";
protected static final String SPURLISTNAME = "spur_match";

// Literal values used by code outside this section.
// NOTE(review): PUNCTUATION_VALUE looks like a token "kind" value and THE_VALUE
// like the definite article stripped from names - confirm at the use sites.
protected static final String PUNCTUATION_VALUE = "punctuation";
protected static final String THE_VALUE = "The";
0080
0081
/**
 * Name of the annotation set to process. When null or empty, execute() uses
 * the document's default annotation set.
 */
protected String annotationSetName;

/**
 * Annotation types to match, processed in list order by matchNameAnnotations().
 * The constructor seeds it with the organization type, the person type,
 * "Location" and "Date".
 */
protected List annotationTypes = new ArrayList(10);

/** Annotation type treated as an organization (gets organization-name normalization). */
protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;

/** Annotation type treated as a person (gets title stripping, name normalization and the gender check). */
protected String personType = PERSON_ANNOTATION_TYPE;

/** Annotation type of not-yet-classified names that matchUnknown() tries to re-type. */
protected String unknownType = "Unknown";

/**
 * Internal or external lists. When false, execute() rebuilds the cdg table
 * from the document's annotations on every run.
 */
protected boolean extLists = true;

/** Use only high precision rules for Organizations */
protected Boolean highPrecisionOrgs = false;

/** Whether matchUnknown() is run after each annotation type has been processed. */
protected boolean matchingUnknowns = true;

/** This is an internal variable to indicate whether
 * we matched using a rule that requires that
 * the newly matched annotation matches all the others
 * This is needed, because organizations can share
 * first/last tokens like News and be different
 */
protected boolean allMatchingNeeded = false;

/** Orthomatching is not case-sensitive by default: when false, annotation strings are lower-cased before matching. */
protected boolean caseSensitive = false;

//protected FeatureMap queryFM = Factory.newFeatureMap();

// name lookup tables (used for namematch)
// given a bigger default size, because rehashing is expensive
protected HashMap alias = new HashMap(100);
protected HashSet cdg = new HashSet();
protected HashMap spur_match = new HashMap(100);
protected HashMap def_art = new HashMap(20);
protected HashMap connector = new HashMap(20);
protected HashMap prepos = new HashMap(30);


/** Annotation set being processed by the current execute() call; reset to null when the run ends. */
protected AnnotationSet nameAllAnnots = null;

/** Maps the id of every annotation handled so far in this run to its (normalized) string. */
protected HashMap processedAnnots = new HashMap(150);
/** Maps the id of an unknown annotation to the annotation type it should be re-created with. */
protected HashMap annots2Remove = new HashMap(75);
/** Match data collected during a run; execute() stores it on the document under the annotation set's name. */
protected List matchesDocFeature = new ArrayList();
//maps annotation ids to array lists of tokens
protected HashMap tokensMap = new HashMap(150);
0135 public HashMap getTokensMap() {
0136 return tokensMap;
0137 }
0138
/**
 * Maps annotation ids to the token lists used after normalization.
 * matchNameAnnotations() stores a separate copy of the raw token list here.
 */
protected HashMap normalizedTokensMap = new HashMap(150);

// Scratch state describing the pair of annotations currently being compared.
// It is (re)assigned by matchAnnotations() and matchOtherAnnots() before the
// rules are applied: "long"/"short" refer to the lengths of the two strings.
protected Annotation shortAnnot;
protected Annotation longAnnot;

protected ArrayList<Annotation> tokensLongAnnot;
protected ArrayList<Annotation> tokensShortAnnot;

protected ArrayList<Annotation> normalizedTokensLongAnnot, normalizedTokensShortAnnot;

/**
 * URL to the file containing the definition for this orthomatcher.
 * init() fails if it has not been set.
 */
private java.net.URL definitionFileURL;

/**
 * Threshold handed to the annotation orthography created in init(). It must be
 * set whenever the definition file declares a "nickname" list.
 */
private Double minimumNicknameLikelihood;

/** The encoding used for the definition file and associated lists.*/
private String encoding;

/** Matching rules keyed by rule number; filled by initRules() and open to change in modifyRules(). */
private Map<Integer,OrthoMatcherRule> rules=new HashMap<Integer,OrthoMatcherRule>();

/** Helper for annotation strings, titles and match bookkeeping; to be initialized in init(). */
private AnnotationOrthography orthoAnnotation;
0163
0164 /** @link dependency */
0165 /*#OrthoMatcher lnkOrthoMatcher;*/
0166
/**
 * Creates an orthomatcher whose default annotation types are, in this order,
 * the organization type, the person type, "Location" and "Date".
 */
public OrthoMatcher () {
  annotationTypes.addAll(
      Arrays.asList(organizationType, personType, "Location", "Date"));
}
0173
/** Initialise the rules. The orthomatcher loads its built-in rules. */
0175 private void initRules(){
0176 //this line should be executed after spur_match is loaded
0177 rules.put(0, new MatchRule0(this));
0178 rules.put(1, new MatchRule1(this));
0179 rules.put(2, new MatchRule2(this));
0180 rules.put(3, new MatchRule3(this));
0181 rules.put(4, new MatchRule4(this));
0182 rules.put(5, new MatchRule5(this));
0183 rules.put(6, new MatchRule6(this));
0184 rules.put(7, new MatchRule7(this));
0185 rules.put(8, new MatchRule8(this));
0186 rules.put(9, new MatchRule9(this));
0187 rules.put(10, new MatchRule10(this));
0188 rules.put(11, new MatchRule11(this));
0189 rules.put(12, new MatchRule12(this));
0190 rules.put(13, new MatchRule13(this));
0191 rules.put(14, new MatchRule14(this));
0192 rules.put(15, new MatchRule15(this));
0193 rules.put(16, new MatchRule16(this));
0194 rules.put(17, new MatchRule17(this));
0195
0196 }
0197
0198 /** Override this method to add, replace, remove rules */
protected void modifyRules(Map<Integer,OrthoMatcherRule> rules) {
  // Intentionally empty extension hook: init() calls it right after
  // initRules() and passes the rule map, so a subclass can edit the map in place.

}
0202
0203 /** Initialise this resource, and return it. */
0204 public Resource init() throws ResourceInstantiationException {
0205 //initialise the list of annotations which we will match
0206 if(definitionFileURL == null){
0207 throw new ResourceInstantiationException(
0208 "No URL provided for the definition file!");
0209 }
0210 String nicknameFile = null;
0211
0212 //at this point we have the definition file
0213 try{
0214 BufferedReader reader = new BomStrippingInputStreamReader(definitionFileURL.openStream(),
0215 encoding);
0216 String lineRead = null;
0217 //boolean foundANickname = false;
0218 while ((lineRead = reader.readLine()) != null){
0219 int index = lineRead.indexOf(":");
0220 if (index != -1){
0221 String nameFile = lineRead.substring(0,index);
0222 String nameList = lineRead.substring(index+1,lineRead.length());
0223 if (nameList.equals("nickname")) {
0224 if (minimumNicknameLikelihood == null) {
0225 throw new ResourceInstantiationException("No value for the required parameter minimumNicknameLikelihood!");
0226 }
0227 nicknameFile = nameFile;
0228 }
0229 else {
0230 createAnnotList(nameFile,nameList);
0231 }
0232 }// if
0233 }//while
0234 reader.close();
0235
0236 URL nicknameURL = null;
0237 if (nicknameFile != null)
0238 nicknameURL = new URL(definitionFileURL, nicknameFile);
0239 this.orthoAnnotation = new BasicAnnotationOrthography(
0240 personType,extLists,unknownType,nicknameURL,
0241 minimumNicknameLikelihood, encoding);
0242 initRules();
0243 modifyRules(rules);
0244
0245 }catch(IOException ioe){
0246 throw new ResourceInstantiationException(ioe);
0247 }
0248
0249
0250 return this;
0251 } // init()
0252
0253
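/** Run the resource on the current document: name annotations in the
* configured annotation set are matched against each other and the matches
* found are stored in the document's coreference feature. The internal
* per-document state is always cleared afterwards, even if an error occurs.
*/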
0258 public void execute() throws ExecutionException{
0259 try{
0260 //check the input
0261 if(document == null) {
0262 throw new ExecutionException(
0263 "No document for namematch!"
0264 );
0265 }
0266 fireStatusChanged("OrthoMatcher processing: " + document.getName());
0267
0268 // get the annotations from document
0269 if ((annotationSetName == null)|| (annotationSetName.equals("")))
0270 nameAllAnnots = document.getAnnotations();
0271 else
0272 nameAllAnnots = document.getAnnotations(annotationSetName);
0273
0274 //if none found, print warning and exit
0275 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
0276 Out.prln("OrthoMatcher Warning: No annotations found for processing");
0277 return;
0278 }
0279
0280 //check if we've been run on this document before
0281 //and clean the doc if needed
0282 docCleanup();
0283 Map matchesMap = (Map)document.getFeatures().
0284 get(DOCUMENT_COREF_FEATURE_NAME);
0285
0286
0287 // creates the cdg list from the document
0288 //no need to create otherwise, coz already done in init()
0289 if (!extLists)
0290 cdg=orthoAnnotation.buildTables(nameAllAnnots);
0291
0292
0293 //Match all name annotations and unknown annotations
0294 matchNameAnnotations();
0295
0296 //used to check if the Orthomatcher works properly
0297 //OrthoMatcherHelper.setMatchesPositions(nameAllAnnots);
0298
0299 // set the matches of the document
0300 // determineMatchesDocument();
0301 if (! matchesDocFeature.isEmpty()) {
0302 if(matchesMap == null){
0303 matchesMap = new HashMap();
0304 }
0305 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
0306 // System.out.println("matchesMap is: " + matchesMap);
0307 //we need to put it even if it was already present in order to triger
0308 //the update events
0309 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
0310
0311 //cannot do clear() as this has already been put on the document
0312 //so I need a new one for the next run of matcher
0313 matchesDocFeature = new ArrayList();
0314
0315
0316 fireStatusChanged("OrthoMatcher completed");
0317 }
0318 }finally{
0319 //make sure the cleanup happens even if there are errors.
0320 // Out.prln("Processed strings" + processedAnnots.values());
0321 //clean-up the internal data structures for next run
0322 nameAllAnnots = null;
0323 processedAnnots.clear();
0324 annots2Remove.clear();
0325 tokensMap.clear();
0326 normalizedTokensMap.clear();
0327 matchesDocFeature = new ArrayList();
0328 longAnnot = null;
0329 shortAnnot = null;
0330 tokensLongAnnot = null;
0331 tokensShortAnnot = null;
0332
0333 //if (log.isDebugEnabled()) OrthoMatcherHelper.saveUsedTable();
0334 }
0335 } // run()
0336
0337 protected void matchNameAnnotations() throws ExecutionException{
0338 // go through all the annotation types
0339 Iterator iterAnnotationTypes = annotationTypes.iterator();
0340 while (iterAnnotationTypes.hasNext()) {
0341 String annotationType = (String)iterAnnotationTypes.next();
0342
0343 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
0344
0345 // continue if no such annotations exist
0346 if (nameAnnots.isEmpty()) continue;
0347
0348 AnnotationSet tokensNameAS = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
0349 if (tokensNameAS.isEmpty()) continue;
0350
0351 ArrayList<Annotation> sortedNameAnnots = new ArrayList<Annotation>(nameAnnots);
0352 Collections.<Annotation>sort(sortedNameAnnots,new OffsetComparator());
0353 for (int snaIndex = 0;snaIndex < sortedNameAnnots.size();snaIndex++) {
0354 Annotation tempAnnot = sortedNameAnnots.get(snaIndex);
0355 Annotation nameAnnot = nameAllAnnots.get(tempAnnot.getId()); // Not sure if this matters
0356 Integer id = nameAnnot.getId();
0357
0358 // get string and value
0359 String annotString = orthoAnnotation.getStringForAnnotation(nameAnnot, document);
0360
0361 //convert to lower case if we are not doing a case sensitive match
0362 if (!caseSensitive)
0363 annotString = annotString.toLowerCase();
0364
0365 if (DEBUG) {
0366 if (log.isDebugEnabled()) {
0367 log.debug("Now processing the annotation: "
0368 + orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId()
0369 + " Type: " + nameAnnot.getType() + " Offset: " + nameAnnot.getStartNode().getOffset());
0370 }
0371 }
0372
0373 // get the tokens
0374 List tokens = new ArrayList(tokensNameAS.getContained(nameAnnot.getStartNode().getOffset(),
0375 nameAnnot.getEndNode().getOffset()));
0376
0377 //if no tokens to match, do nothing
0378 if (tokens.isEmpty()) {
0379 if (log.isDebugEnabled()) {
0380 log.debug("Didn't find any tokens for the following annotation. We will be unable to perform coref on this annotation. \n String: "
0381 + orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId() + " Type: " + nameAnnot.getType());
0382 }
0383 continue;
0384 }
0385 Collections.sort(tokens, new gate.util.OffsetComparator());
0386 //check if these actually do not end after the name
0387 //needed coz new tokeniser conflates
0388 //strings with dashes. So British Gas-style is two tokens
0389 //instead of three. So cannot match properly British Gas
0390 // tokens = checkTokens(tokens);
0391 tokensMap.put(nameAnnot.getId(), tokens);
0392 normalizedTokensMap.put(nameAnnot.getId(), new ArrayList<Annotation>(tokens));
0393
0394 //first check whether we have not matched such a string already
0395 //if so, just consider it matched, don't bother calling the rules
0396 // Exception: AB, Spock:
0397 // Note that we require one-token Person annotations to be matched even if an identical string
0398 // has been matched earlier because there could be multiple people named "David", for instance,
0399 // on a page.
0400 if (processedAnnots.containsValue(annotString) &&
0401 (! (nameAnnot.getType().equals(personType) && (tokens.size() == 1)))) {
0402 Annotation returnAnnot = orthoAnnotation.updateMatches(nameAnnot, annotString,processedAnnots,nameAllAnnots,matchesDocFeature);
0403 if (returnAnnot != null) {
0404 if (DEBUG) {
0405 if (log.isDebugEnabled()) {
0406 log.debug("Exact match criteria matched " + annotString + " from (id: " + nameAnnot.getId() + ", offset: " + nameAnnot.getStartNode().getOffset() + ") to " +
0407 "(id: " + returnAnnot.getId() + ", offset: " + returnAnnot.getStartNode().getOffset() + ")");
0408 }
0409 }
0410 processedAnnots.put(nameAnnot.getId(), annotString);
0411 continue;
0412 }
0413 } else if (processedAnnots.isEmpty()) {
0414 // System.out.println("First item put in processedAnnots: " + annotString);
0415 processedAnnots.put(nameAnnot.getId(), annotString);
0416 continue;
0417 }
0418
0419 //if a person, then remove their title before matching
0420 if (nameAnnot.getType().equals(personType)) {
0421 annotString = orthoAnnotation.stripPersonTitle(annotString, nameAnnot,document,tokensMap,normalizedTokensMap,nameAllAnnots);
0422 normalizePersonName(nameAnnot);
0423 }
0424 else if (nameAnnot.getType().equals(organizationType))
0425 annotString = normalizeOrganizationName(annotString, nameAnnot);
0426
0427 if(null == annotString || "".equals(annotString) || tokens.isEmpty()) {
0428 if (log.isDebugEnabled()) {
0429 log.debug("Annotation ID " + nameAnnot.getId() + " of type" + nameAnnot.getType() +
0430 " refers to a null or empty string or one with no tokens after normalization. Unable to process further.");
0431 }
0432 continue;
0433 }
0434 //otherwise try matching with previous annotations
0435 matchWithPrevious(nameAnnot, annotString,sortedNameAnnots,snaIndex);
0436
0437 // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
0438 //finally add the current annotations to the processed map
0439 processedAnnots.put(nameAnnot.getId(), annotString);
0440 }//while through name annotations
0441 if (matchingUnknowns) {
0442 matchUnknown(sortedNameAnnots);
0443 }
0444 }//while through annotation types
0445
0446 }
0447
0448 protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType) throws ExecutionException {
0449 //get all Unknown annotations
0450 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
0451 annots2Remove.clear();
0452 if (unknownAnnots.isEmpty()) return;
0453
0454 AnnotationSet nameAllTokens = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
0455 if (nameAllTokens.isEmpty()) return;
0456
0457 Iterator<Annotation> iter = unknownAnnots.iterator();
0458 //loop through the unknown annots
0459 while (iter.hasNext()) {
0460 Annotation unknown = iter.next();
0461
0462 // get string and value
0463 String unknownString = orthoAnnotation.getStringForAnnotation(unknown, document);
0464 //convert to lower case if we are not doing a case sensitive match
0465 if (!caseSensitive)
0466 unknownString = unknownString.toLowerCase();
0467
0468 // System.out.println("Now trying to match the unknown string: " + unknownString);
0469 //get the tokens
0470 List tokens = new ArrayList((Set)
0471 nameAllTokens.getContained(
0472 unknown.getStartNode().getOffset(),
0473 unknown.getEndNode().getOffset()
0474 ));
0475 if (tokens.isEmpty())
0476 continue;
0477 Collections.sort(tokens, new gate.util.OffsetComparator());
0478 tokensMap.put(unknown.getId(), tokens);
0479 normalizedTokensMap.put(unknown.getId(), tokens);
0480
0481
0482 //first check whether we have not matched such a string already
0483 //if so, just consider it matched, don't bother calling the rules
0484 if (processedAnnots.containsValue(unknownString)) {
0485 Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
0486 if (matchedAnnot == null) {
0487 log.info("Orthomatcher: Unable to find the annotation: " +
0488 orthoAnnotation.getStringForAnnotation(unknown, document) +
0489 " in matchUnknown");
0490 }
0491 else {
0492 if (matchedAnnot.getType().equals(unknownType)) {
0493 annots2Remove.put(unknown.getId(),
0494 annots2Remove.get(matchedAnnot.getId()));
0495 }
0496 else
0497 annots2Remove.put(unknown.getId(), matchedAnnot.getType());
0498 processedAnnots.put(unknown.getId(), unknownString);
0499 unknown.getFeatures().put("NMRule", unknownType);
0500 continue;
0501 }
0502 }
0503
0504 //check if we should do sub-string matching in case it's hyphenated
0505 //for example US-led
0506 if (tokens.size() == 1
0507 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
0508 if (matchHyphenatedUnknowns(unknown, unknownString, iter))
0509 continue;
0510 }//if
0511
0512 // TODO: The below results in a assigning the unknown's to the last annotation that it matches in a document.
0513 // It would probably be better to first start with things which precede the current unknown and then do
0514 // annotations after
0515 matchWithPrevious(unknown, unknownString,sortedAnnotationsForAType,sortedAnnotationsForAType.size() - 1);
0516
0517 } //while though unknowns
0518
0519 if (! annots2Remove.isEmpty()) {
0520 Iterator unknownIter = annots2Remove.keySet().iterator();
0521 while (unknownIter.hasNext()) {
0522 Integer unknId = (Integer) unknownIter.next();
0523 Annotation unknown = nameAllAnnots.get(unknId);
0524 Integer newID = nameAllAnnots.add(
0525 unknown.getStartNode(),
0526 unknown.getEndNode(),
0527 (String) annots2Remove.get(unknId),
0528 unknown.getFeatures()
0529 );
0530 nameAllAnnots.remove(unknown);
0531
0532 //change the id in the matches list
0533 List mList = (List)unknown.getFeatures().
0534 get(ANNOTATION_COREF_FEATURE_NAME);
0535 mList.remove(unknId);
0536 mList.add(newID);
0537 }//while
0538 }//if
0539 }
0540
0541 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
0542 Iterator iter){
0543 boolean matched = false;
0544
0545 //only take the substring before the hyphen
0546 int stringEnd = unknownString.indexOf("-");
0547 unknownString = unknownString.substring(0, stringEnd);
0548 //check if we've already matched this string
0549 //because only exact match of the substring are considered
0550 if (processedAnnots.containsValue(unknownString)) {
0551 matched = true;
0552 Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
0553 //only do the matching if not a person, because we do not match
0554 //those on sub-strings
0555 iter.remove();
0556 String newType;
0557 if (matchedAnnot.getType().equals(unknownType))
0558 newType = (String)annots2Remove.get(matchedAnnot.getId());
0559 else
0560 newType = matchedAnnot.getType();
0561
0562 Integer newID = new Integer(-1);
0563 try {
0564 newID = nameAllAnnots.add(
0565 unknown.getStartNode().getOffset(),
0566 new Long(unknown.getStartNode().getOffset().longValue()
0567 + stringEnd),
0568 newType,
0569 unknown.getFeatures()
0570 );
0571 } catch (InvalidOffsetException ex) {
0572 throw new GateRuntimeException(ex.getMessage());
0573 }
0574 nameAllAnnots.remove(unknown);
0575
0576 //change the id in the matches list
0577 List mList = (List)unknown.getFeatures().
0578 get(ANNOTATION_COREF_FEATURE_NAME);
0579 mList.remove(unknown.getId());
0580 mList.add(newID);
0581
0582 }
0583 return matched;
0584 }
0585
/**
* Attempt to match nameAnnot against the previous annotations which are passed down
* in listOfThisType. Matches are tested in order from most recent to oldest, and the search
* stops at the first match.
* @param nameAnnot Annotation we are trying to match
* @param annotString Normalized string representation of annotation
* @param listOfThisType ArrayList of Annotations to match against (of the same type as nameAnnot, unless nameAnnot is an unknown)
* @param startIndex Exclusive upper index in listOfThisType: the candidates examined are those at startIndex - 1 down to 0
*/
0594 protected void matchWithPrevious(Annotation nameAnnot, String annotString,
0595 ArrayList<Annotation> listOfThisType,
0596 int startIndex) {
0597 boolean matchedUnknown = false;
0598 // Out.prln("matchWithPrevious now processing: " + annotString);
0599
0600 for (int curIndex = startIndex - 1;curIndex >= 0;curIndex--) {
0601 Integer prevId = listOfThisType.get(curIndex).getId();
0602 Annotation prevAnnot = nameAllAnnots.get(prevId); // Note that this line probably isn't necessary anymore
0603
0604 //check if the two are from the same type or the new one is unknown
0605 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
0606 && ! nameAnnot.getType().equals(unknownType))
0607 )
0608 continue;
0609 //do not compare two unknown annotations either
0610 //they are only matched to those of known types
0611 if ( nameAnnot.getType().equals(unknownType)
0612 && prevAnnot.getType().equals(unknownType))
0613 continue;
0614
0615 //check if we have already matched this annotation to the new one
0616 if (orthoAnnotation.matchedAlready(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots) )
0617 continue;
0618
0619 //now changed to a rule, here we just match by gender
0620 if (prevAnnot.getType().equals(personType)) {
0621 String prevGender =
0622 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
0623 String nameGender =
0624 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
0625 if ( prevGender != null
0626 && nameGender != null
0627 && ( (nameGender.equalsIgnoreCase("female")
0628 &&
0629 prevGender.equalsIgnoreCase("male")
0630 )
0631 ||
0632 (prevGender.equalsIgnoreCase("female")
0633 && nameGender.equalsIgnoreCase("male")
0634 )
0635 )
0636 ) //if condition
0637 continue; //we don't have a match if the two genders are different
0638
0639 }//if
0640
0641 //if the two annotations match
0642 //
0643 // A. Borthwick, Spock: If the earlier annotation is shorter than the current annotation and it
0644 // has already been matched with a longer annotations, then don't match it with the current annotation.
0645 // Reasoning is that with the sequence David Jones . . . David . . . David Smith, we don't want to match
0646 // David Smith with David. However, with the sequence, David . . . David Jones, it's okay to match the
0647 // shorter version with the longer, because it hasn't already been matched with a longer.
0648 boolean prevAnnotUsedToMatchWithLonger = prevAnnot.getFeatures().containsKey("matchedWithLonger");
0649 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) {
0650 orthoAnnotation.updateMatches(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots);
0651 if (DEBUG) {
0652 log.debug("Just matched nameAnnot " + nameAnnot.getId() + " with prevAnnot " + prevAnnot.getId());
0653 }
0654
0655 if (!prevAnnotUsedToMatchWithLonger && prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
0656 // We have just matched the previous annotation with a longer annotation for the first time. We need
0657 // to propagate the matchedWithLonger property to all other annotations which coreffed with the previous annotation
0658 // so that we don't match them with a longer annotation
0659 propagatePropertyToExactMatchingMatches(prevAnnot,"matchedWithLonger",true);
0660 }
0661 //if unknown annotation, we need to change to the new type
0662 if (nameAnnot.getType().equals(unknownType)) {
0663 matchedUnknown = true;
0664 if (prevAnnot.getType().equals(unknownType))
0665 annots2Remove.put(nameAnnot.getId(),
0666 annots2Remove.get(prevAnnot.getId()));
0667 else
0668 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
0669 //also put an attribute to indicate that
0670 nameAnnot.getFeatures().put("NMRule", unknownType);
0671 }//if unknown
0672 break; //no need to match further
0673 }//if annotations matched
0674
0675 }//while through previous annotations
0676
0677 if (matchedUnknown)
0678 processedAnnots.put(nameAnnot.getId(), annotString);
0679
0680
0681 }//matchWithPrevious
0682
0683 protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot,String featureName,Object value) {
0684 try {
0685 List<Integer> matchesList = (List<Integer>) updateAnnot.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
0686 if ((matchesList == null) || matchesList.isEmpty()) {
0687 return;
0688 }
0689 else {
0690 String updateAnnotString = orthoAnnotation.getStringForAnnotation(updateAnnot, document).toLowerCase();
0691 for (Integer nextId : matchesList) {
0692 Annotation a = nameAllAnnots.get(nextId);
0693
0694 if (orthoAnnotation.fuzzyMatch(orthoAnnotation.getStringForAnnotation(a, document),updateAnnotString)) {
0695 if (DEBUG) {
0696 log.debug("propogateProperty: " + featureName + " " + value + " from: " + updateAnnot.getId() + " to: " + a.getId());
0697 }
0698 a.getFeatures().put(featureName, value);
0699 }
0700 }
0701 }
0702 }
0703 catch (Exception e) {
0704 log.error("Error in propogatePropertyToExactMatchingMatches", e);
0705 }
0706 }
0707
0708 protected boolean matchAnnotations(Annotation newAnnot, String annotString,
0709 Annotation prevAnnot) {
0710 //do not match two annotations that overlap
0711 if (newAnnot.overlaps(prevAnnot))
0712 return false;
0713
0714 // find which annotation string of the two is longer
0715 // this is useful for some of the matching rules
0716 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
0717 // Out.prln("matchAnnotations processing " + annotString + " and " + prevAnnotString);
0718 if (prevAnnotString == null) {
0719 // Out.prln("We discovered that the following string is null!: " + prevAnnot.getId() +
0720 // " For the previous annotation " + getStringForAnnotation(prevAnnot, document) +
0721 // " which has annotation type " + prevAnnot.getType() +
0722 // " Tried to compared it to the annotation string " + annotString);
0723 return false;
0724 }
0725
0726 String longName = prevAnnotString;
0727 String shortName = annotString;
0728 longAnnot = prevAnnot;
0729 shortAnnot = newAnnot;
0730 boolean longerPrevious = true;
0731
0732 if (shortName.length()>longName.length()) {
0733 String temp = longName;
0734 longName = shortName;
0735 shortName = temp;
0736 Annotation tempAnn = longAnnot;
0737 longAnnot = shortAnnot;
0738 shortAnnot = tempAnn;
0739 longerPrevious = false;
0740 }//if
0741
0742 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
0743 normalizedTokensLongAnnot = (ArrayList) normalizedTokensMap.get(longAnnot.getId());
0744 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
0745 normalizedTokensShortAnnot = (ArrayList) normalizedTokensMap.get(shortAnnot.getId());
0746
0747 List matchesList = (List) prevAnnot.getFeatures().
0748 get(ANNOTATION_COREF_FEATURE_NAME);
0749 if (matchesList == null || matchesList.isEmpty())
0750 return apply_rules_namematch(prevAnnot.getType(), shortName,longName,
0751 prevAnnot,newAnnot,longerPrevious);
0752
0753 //if these two match, then let's see if all the other matching one will too
0754 //that's needed, because sometimes names can share a token (e.g., first or
0755 //last but not be the same
0756 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName,prevAnnot,newAnnot,
0757 longerPrevious)) {
0758 /**
0759 * Check whether we need to ensure that there is a match with the rest
0760 * of the matching annotations, because the rule requires that
0761 * transtivity is not assummed.
0762 */
0763 if (allMatchingNeeded) {
0764 allMatchingNeeded = false;
0765
0766 List toMatchList = new ArrayList(matchesList);
0767 // if (newAnnot.getType().equals(unknownType))
0768 // Out.prln("Matching new " + annotString + " with annots " + toMatchList);
0769 toMatchList.remove(prevAnnot.getId());
0770
0771 return matchOtherAnnots(toMatchList, newAnnot, annotString);
0772 } else
0773 return true;
0774 }
0775 return false;
0776 }
0777
/** This method checks whether the new annotation matches
* all annotations given in the toMatchList (it contains ids).
* The idea is that the new annotation needs to match all those,
* because assuming transitivity does not always work when
* two different entities share a common token: e.g., BT Cellnet
* and BT and British Telecom.
*/
0785 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
0786 String annotString) {
0787
0788 //if the list is empty, then we're matching all right :-)
0789 if (toMatchList.isEmpty())
0790 return true;
0791
0792 boolean matchedAll = true;
0793 int i = 0;
0794
0795 while (matchedAll && i < toMatchList.size()) {
0796 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
0797
0798 // find which annotation string of the two is longer
0799 // this is useful for some of the matching rules
0800 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
0801 if (prevAnnotString == null)
0802 try {
0803 prevAnnotString = document.getContent().getContent(
0804 prevAnnot.getStartNode().getOffset(),
0805 prevAnnot.getEndNode().getOffset()
0806 ).toString();
0807 } catch (InvalidOffsetException ioe) {
0808 return false;
0809 }//try
0810
0811
0812 String longName = prevAnnotString;
0813 String shortName = annotString;
0814 longAnnot = prevAnnot;
0815 shortAnnot = newAnnot;
0816 boolean longerPrevious = true;
0817 if (shortName.length()>=longName.length()) {
0818 String temp = longName;
0819 longName = shortName;
0820 shortName = temp;
0821 Annotation tempAnn = longAnnot;
0822 longAnnot = shortAnnot;
0823 shortAnnot = tempAnn;
0824 longerPrevious = false;
0825 }//if
0826
0827 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
0828 normalizedTokensLongAnnot = (ArrayList) normalizedTokensMap.get(longAnnot.getId());
0829 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
0830 normalizedTokensShortAnnot = (ArrayList) normalizedTokensMap.get(shortAnnot.getId());
0831
0832 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName,prevAnnot,newAnnot,
0833 longerPrevious);
0834 // if (newAnnot.getType().equals(unknownType))
0835 // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
0836
0837 i++;
0838 }//while
0839 return matchedAll;
0840 }
0841
0842 protected void docCleanup() {
0843 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
0844 if (matchesValue != null && (matchesValue instanceof Map))
0845 ((Map)matchesValue).remove(nameAllAnnots.getName());
0846 else if (matchesValue != null) {
0847 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
0848 }
0849
0850 //get all annotations that have a matches feature
0851 HashSet fNames = new HashSet();
0852 fNames.add(ANNOTATION_COREF_FEATURE_NAME);
0853 AnnotationSet annots =
0854 nameAllAnnots.get(null, fNames);
0855
0856 // Out.prln("Annots to cleanup" + annots);
0857
0858 if (annots == null || annots.isEmpty())
0859 return;
0860
0861 Iterator<Annotation> iter = annots.iterator();
0862 while (iter.hasNext()) {
0863 while (iter.hasNext())
0864 iter.next().getFeatures().remove(ANNOTATION_COREF_FEATURE_NAME);
0865 } //while
0866 }//cleanup
0867
0868
0869 static Pattern periodPat = Pattern.compile("[\\.]+");
0870
0871 protected void normalizePersonName (Annotation annot) throws ExecutionException {
0872 ArrayList<Annotation> tokens = (ArrayList) normalizedTokensMap.get(annot.getId());
0873 for (int i = tokens.size() - 1; i >= 0; i--) {
0874 String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
0875 String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME);
0876 String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
0877 if (!caseSensitive) {
0878 tokenString = tokenString.toLowerCase();
0879 }
0880 // log.debug("tokenString: " + tokenString + " kind: " + kind + " category: " + category);
0881 if (kind.equals(PUNCTUATION_VALUE) ) {
0882 // log.debug("Now tagging it!");
0883 tokens.get(i).getFeatures().put("ortho_stop", true);
0884 }
0885 }
0886
0887 ArrayList<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens);
0888 for (int j = normalizedTokens.size() - 1; j >= 0;j--) {
0889 if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) {
0890 // log.debug("Now removing " + normalizedTokens.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
0891 normalizedTokens.remove(j);
0892 }
0893 }
0894 // log.debug("normalizedTokens size is: " + normalizedTokens.size());
0895 normalizedTokensMap.put(annot.getId(), normalizedTokens);
0896 }
0897
0898 /** return an organization without a designator and starting The*/
0899 protected String normalizeOrganizationName (String annotString, Annotation annot){
0900
0901 ArrayList<Annotation> tokens = (ArrayList) tokensMap.get(annot.getId());
0902
0903 //strip starting The first
0904 if ( ((String) ((Annotation) tokens.get(0)
0905 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
0906 .equalsIgnoreCase(THE_VALUE))
0907 tokens.remove(0);
0908
0909 if (tokens.size() > 0) {
0910
0911 // New code by A. Borthwick of Spock Networks
0912 // June 13, 2008
0913 // Strip everything on the cdg list, which now encompasses not just cdg's, but also other stopwords
0914 // Start from the right side so we don't mess up the arraylist
0915 for (int i = tokens.size() - 1; i >= 0; i--) {
0916 String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
0917 String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME);
0918 String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
0919 if (!caseSensitive) {
0920 tokenString = tokenString.toLowerCase();
0921 }
0922 // Out.prln("tokenString: " + tokenString + " kind: " + kind + " category: " + category);
0923 if (kind.equals(PUNCTUATION_VALUE) ||
0924 ( (category != null) && (category.equals("DT") || category.equals("IN")) )
0925 || cdg.contains(tokenString)) {
0926 // Out.prln("Now tagging it!");
0927 tokens.get(i).getFeatures().put("ortho_stop", true);
0928 }
0929 }
0930
0931 // AB, Spock: Need to check for CDG even for 1 token so we don't automatically match
0932 // a one-token annotation called "Company", for instance
0933 String compareString = (String) tokens.get(tokens.size()-1).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
0934 if (!caseSensitive) {
0935 compareString = compareString.toLowerCase();
0936 }
0937 if (cdg.contains(compareString)) {
0938 tokens.remove(tokens.size()-1);
0939 }
0940
0941 }
0942
0943 ArrayList<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens);
0944 for (int j = normalizedTokens.size() - 1; j >= 0;j--) {
0945 if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) {
0946 normalizedTokens.remove(j);
0947 }
0948 }
0949
0950 normalizedTokensMap.put(annot.getId(), normalizedTokens);
0951
0952 StringBuffer newString = new StringBuffer(50);
0953 for (int i = 0; i < tokens.size(); i++){
0954 newString.append((String) ((Annotation) tokens.get(i)
0955 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
0956 if (i != tokens.size()-1)
0957 newString.append(" ");
0958 }
0959 // Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
0960
0961 if (caseSensitive)
0962 return newString.toString();
0963
0964 return newString.toString().toLowerCase();
0965 }
0966
0967 /** creates the lookup tables */
0968 protected void createAnnotList(String nameFile,String nameList)
0969 throws IOException{
0970 //create the relative URL
0971 URL fileURL = new URL(definitionFileURL, nameFile);
0972 BufferedReader bufferedReader =
0973 new BomStrippingInputStreamReader(fileURL.openStream(),
0974 encoding);
0975
0976 String lineRead = null;
0977 while ((lineRead = bufferedReader.readLine()) != null){
0978 if (nameList.compareTo(CDGLISTNAME)==0){
0979 Matcher matcher = punctPat.matcher(lineRead.toLowerCase().trim());
0980 lineRead = matcher.replaceAll(" ").trim();
0981 if (caseSensitive)
0982 cdg.add(lineRead);
0983 else
0984 cdg.add(lineRead.toLowerCase());
0985 }// if
0986 else {
0987 int index = lineRead.indexOf("£");
0988 if (index != -1){
0989 String expr = lineRead.substring(0,index);
0990 //if not case-sensitive, we need to downcase all strings
0991 if (!caseSensitive)
0992 expr = expr.toLowerCase();
0993 String code = lineRead.substring(index+1,lineRead.length());
0994 if (nameList.equals(ALIASLISTNAME))
0995 alias.put(expr, code);
0996 else
0997 if (nameList.equals(ARTLISTNAME))
0998 def_art.put(expr, code);
0999 else
1000 if (nameList.equals(PREPLISTNAME))
1001 prepos.put(expr, code);
1002 else
1003 if (nameList.equals(CONNECTORLISTNAME))
1004 connector.put(expr, code);
1005 else
1006 if (nameList.equals(SPURLISTNAME))
1007 spur_match.put(expr, code);
1008
1009 }//if
1010 }// else
1011
1012 }//while
1013 }//createAnnotList
1014
1015
1016 /**
1017 * This is the skeleton of a function which should be available in OrthoMatcher to allow a pairwise comparison of two name strings
1018 * It should eventually be made public. It is private here (and thus non-functional) because OrthoMatcher is currently reliant
1019 * on the tokenization of the names, which are held in the global variables tokensShortAnnot and tokensLongAnnot
1020 *
1021 * @param name1
1022 * @param name2
1023 * @return true if the two names indicate the same person
1024 */
1025 private boolean pairwise_person_name_match(String name1, String name2) {
1026 String shortName,longName;
1027 if (name1.length() > name2.length()) {
1028 longName = name1;
1029 shortName = name2;
1030 }
1031 else {
1032 longName = name2;
1033 shortName = name1;
1034 }
1035 if (rules.get(0).value(longName,shortName)) {//matchRule0(longName,shortName)
1036 return false;
1037 }
1038 else {
1039 if (longName.equals(shortName) || rules.get(2).value(longName, shortName) ||
1040 rules.get(3).value(longName, shortName)) {
1041 return true;
1042 }
1043 else {
1044 return (rules.get(0).value(longName, shortName));
1045 // boolean throwAway[] = new boolean[17];
1046 // return basic_person_match_criteria(shortName,longName,throwAway);
1047 // The above doesn't work because basic_person_match_criteria is reliant on the global
1048 // variables tokensShortAnnot and tokensLongAnnot so I just call what I can directly
1049 }
1050 }
1051 }
1052
1053 /**
1054 * basic_person_match_criteria
1055 * Note that this function relies on various global variables in some other match rules.
1056 * @param shortName
1057 * @param longName
1058 * @param mr
1059 * @return
1060 */
1061 private boolean basic_person_match_criteria(String shortName,
1062 String longName, boolean mr[]) {
1063
1064 if ( // For 4, 5, 14, and 15, need to mark shorter annot
1065 //kalina: added 16, so it matches names when contain more than one first and one last name
1066 OrthoMatcherHelper.executeDisjunction(rules, new int[] {1,5,6,13,15,16},longName,shortName,mr)
1067 ) {
1068 return true;
1069 }
1070 return false;
1071 }
1072
1073
1074 /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
1075 private boolean apply_rules_namematch(String annotationType, String shortName,
1076 String longName,Annotation prevAnnot,
1077 Annotation followAnnot,
1078 boolean longerPrevious) {
1079 boolean mr[] = new boolean[rules.size()];
1080 // first apply rule for spurious matches i.e. rule0
1081 if (DEBUG) {
1082 log.debug("Now matching " + longName + "(id: " + longAnnot.getId() + ") to "
1083 + shortName + "(id: " + shortAnnot.getId() + ")");
1084 }
1085
1086 if (rules.get(0).value(longName,shortName))
1087 return false;
1088 if (
1089 (// rules for all annotations
1090 //no longer use rule1, coz I do the check for same string via the hash table
1091 OrthoMatcherHelper.executeDisjunction(rules, new int[] {2,3},longName,shortName,mr)
1092
1093 ) // rules for all annotations
1094 ||
1095 (// rules for organisation annotations
1096 (annotationType.equals(organizationType)
1097 //ACE addition
1098 || annotationType.equals("Facility")
1099 )
1100 &&
1101 // Should basically only match when you have a match of all tokens other than
1102 // CDG's and function words
1103 (
1104 (!highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules,new int[] {4,6,7,8,9,10,11,12,14},longName,shortName,mr))
1105 ||
1106 (highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules,new int[] {7,8,10,11,17},longName,shortName,mr))
1107 )
1108 )
1109 ) {// rules for organisation annotations
1110 return true;
1111 }
1112
1113 if (// rules for person annotations
1114 ( annotationType.equals(personType))) {
1115 if (noMatchRule1(longName, shortName,prevAnnot, longerPrevious) ||
1116 noMatchRule2(longName, shortName)) {
1117 // Out.prln("noMatchRule1 rejected match between " + longName + " and " + shortName);
1118 return false;
1119 }
1120 else {
1121 if ( basic_person_match_criteria(shortName,longName,mr))
1122 {
1123 if ((longName.length() != shortName.length()) && (mr[4] || mr[5] || mr[14] || mr[15])) {
1124 if (longerPrevious) {
1125 followAnnot.getFeatures().put("matchedWithLonger", true);
1126 }
1127 else {
1128 prevAnnot.getFeatures().put("matchedWithLonger", true);
1129 }
1130 }
1131 else if ((longName.length() == shortName.length()) && (mr[1])) {
1132 if (prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
1133 followAnnot.getFeatures().put("matchedWithLonger", true);
1134 }
1135 }
1136 return true;
1137 }
1138 return false;
1139 }
1140 }
1141 return false;
1142 }//apply_rules
1143
1144
1145 /** set the extLists flag */
1146 public void setExtLists(Boolean newExtLists) {
1147 extLists = newExtLists.booleanValue();
1148 }//setextLists
1149
1150 /** set the caseSensitive flag */
1151 public void setCaseSensitive(Boolean newCase) {
1152 caseSensitive = newCase.booleanValue();
1153 }//setextLists
1154
1155 /** set the annotation set name*/
1156 public void setAnnotationSetName(String newAnnotationSetName) {
1157 annotationSetName = newAnnotationSetName;
1158 }//setAnnotationSetName
1159
1160 /** set the types of the annotations*/
1161 public void setAnnotationTypes(List newType) {
1162 annotationTypes = newType;
1163 }//setAnnotationTypes
1164
1165 /** set whether to process the Unknown annotations*/
1166 public void setProcessUnknown(Boolean processOrNot) {
1167 this.matchingUnknowns = processOrNot.booleanValue();
1168 }//setAnnotationTypes
1169
1170 public void setOrganizationType(String newOrganizationType) {
1171 organizationType = newOrganizationType;
1172 }//setOrganizationType
1173
1174 public void setPersonType(String newPersonType) {
1175 personType = newPersonType;
1176 }//setPersonType
1177
1178 /**get the name of the annotation set*/
1179 public String getAnnotationSetName() {
1180 return annotationSetName;
1181 }//getAnnotationSetName
1182
1183 /** get the types of the annotation*/
1184 public List getAnnotationTypes() {
1185 return annotationTypes;
1186 }//getAnnotationTypes
1187
1188 public String getOrganizationType() {
1189 return organizationType;
1190 }
1191
1192 public String getPersonType() {
1193 return personType;
1194 }
1195
1196 public Boolean getExtLists() {
1197 return new Boolean(extLists);
1198 }
1199
1200 /** Are we running in a case-sensitive mode?*/
1201 public Boolean getCaseSensitive() {
1202 return new Boolean(caseSensitive);
1203 }
1204
1205 /** Return whether or not we're processing the Unknown annots*/
1206 public Boolean getProcessUnknown() {
1207 return new Boolean(matchingUnknowns);
1208 }
1209
1210
1211
1212 /**
1213 No Match Rule 1:
1214 Avoids the problem of matching
1215 David Jones ...
1216 David ...
1217 David Smith
1218 Since "David" was matched with David Jones, we don't match David with David Smith.
1219 */
1220 public boolean noMatchRule1(String s1,
1221 String s2,Annotation previousAnnot, boolean longerPrevious) {
1222 // if (DEBUG) {
1223 // try {
1224 // String annotString = getStringForAnnotation(previousAnnot, document );
1225
1226 // log.debug("Previous annotation was " + annotString + "(id: " + previousAnnot.getId() + ")" + " features are " + previousAnnot.getFeatures());
1227 // }
1228 // catch (ExecutionException e) {}
1229 // }
1230
1231 if (longerPrevious || !previousAnnot.getFeatures().containsKey("matchedWithLonger")) {
1232 return false;
1233 }
1234 else {
1235 return true;
1236 }
1237 }//noMatchRule1
1238
1239 /***
1240 * returns true if it detects a middle name which indicates that the name string contains a nickname or a
1241 * compound last name
1242 */
1243 private boolean detectBadMiddleTokens(ArrayList<Annotation> tokArray) {
1244 for (int j = 1;j < tokArray.size() - 1;j++) {
1245 String currentToken = (String) tokArray.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1246 Matcher matcher = badMiddleTokens.matcher(currentToken.toLowerCase().trim());
1247 if (matcher.find()) {
1248 // We have found a case of a ", ',
1249 return true;
1250 }
1251 }
1252 return false;
1253 }
1254
1255 /**
1256 * NoMatch Rule #2: Do we have a mismatch of middle initial?
1257 * Condition(s): Only applies to person names with more than two tokens in the name
1258 *
1259 * Want George W. Bush != George H. W. Bush and George Walker Bush != George Herbert Walker Bush
1260 * and
1261 * John T. Smith != John Q. Smith
1262 * however
1263 * John T. Smith == John Thomas Smith
1264 * be careful about
1265 * Hillary Rodham Clinton == Hillary Rodham-Clinton
1266 * be careful about
1267 * Carlos Bueno de Lopez == Bueno de Lopez
1268 * and
1269 * Cynthia Morgan de Rothschild == Cynthia de Rothschild
1270 */
1271 public boolean noMatchRule2(String s1,String s2) {
1272 if (normalizedTokensLongAnnot.size()>2 && normalizedTokensShortAnnot.size()>2) {
1273 boolean retval = false;
1274 if (normalizedTokensLongAnnot.size() != normalizedTokensShortAnnot.size()) {
1275 String firstNameLong = (String) normalizedTokensLongAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1276 String firstNameShort = (String) normalizedTokensShortAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1277 String lastNameLong = (String) normalizedTokensLongAnnot.get(normalizedTokensLongAnnot.size() - 1).
1278 getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1279 String lastNameShort = (String) normalizedTokensShortAnnot.get(normalizedTokensShortAnnot.size() - 1).
1280 getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1281 if (rules.get(1).value(firstNameLong,firstNameShort) &&
1282 (rules.get(1).value(lastNameLong,lastNameShort))) {
1283 // Must have a match on first and last name for this non-match rule to take effect when the number of tokens differs
1284 if (detectBadMiddleTokens(tokensLongAnnot) || detectBadMiddleTokens(tokensShortAnnot)) {
1285 // Exclude the William (Bill) H. Gates vs. William H. Gates case and the
1286 // Cynthia Morgan de Rothschild vs. Cynthia de Rothschild case
1287 if (DEBUG && log.isDebugEnabled()) {
1288 log.debug("noMatchRule2Name did not non-match because of bad middle tokens " + s1 + "(id: " + longAnnot.getId() + ") to "
1289 + s2+ "(id: " + shortAnnot.getId() + ")");
1290 }
1291 return false;
1292 }
1293 else {
1294 // Covers the George W. Bush vs George H. W. Bush and George Walker Bush vs. George Herbert Walker Bush cases
1295 retval = true;
1296 }
1297 }
1298 }
1299 else {
1300 for (int i = 1; i < normalizedTokensLongAnnot.size() - 1;i++) {
1301 String s1_middle = (String) ((Annotation) normalizedTokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1302 String s2_middle = (String) ((Annotation) normalizedTokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1303 if (!caseSensitive) {
1304 s1_middle = s1_middle.toLowerCase();
1305 s2_middle = s2_middle.toLowerCase();
1306 }
1307 // log.debug("noMatchRule2 comparing substring " + s1_middle + " to " + s2_middle);
1308 if (!(rules.get(1).value(s1_middle,s2_middle) ||
1309 OrthoMatcherHelper.initialMatch(s1_middle, s2_middle))) {
1310 // We found a mismatching middle name
1311 retval = true;
1312 break;
1313 }
1314 }
1315 }
1316 if (retval && log.isDebugEnabled() && DEBUG) {
1317 log.debug("noMatchRule2Name non-matched " + s1 + "(id: " + longAnnot.getId() + ") to "
1318 + s2+ "(id: " + shortAnnot.getId() + ")");
1319 }
1320 return retval;
1321 } // if (normalizedTokensLongAnnot.size()>2 && normalizedTokensShortAnnot.size()>2)
1322 return false;
1323 }//noMatchRule2
1324
1325 public void setDefinitionFileURL(java.net.URL definitionFileURL) {
1326 this.definitionFileURL = definitionFileURL;
1327 }
1328
1329 public java.net.URL getDefinitionFileURL() {
1330 return definitionFileURL;
1331 }
1332 public void setEncoding(String encoding) {
1333 this.encoding = encoding;
1334 }
1335 public String getEncoding() {
1336 return encoding;
1337 }
1338
1339
1340 public Double getMinimumNicknameLikelihood() {
1341 return minimumNicknameLikelihood;
1342 }
1343
1344 public void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood) {
1345 this.minimumNicknameLikelihood = minimumNicknameLikelihood;
1346 }
1347
1348 /**
1349 * @return the highPrecisionOrgs
1350 */
1351 public Boolean getHighPrecisionOrgs() {
1352 return highPrecisionOrgs;
1353 }
1354
1355 /**
1356 * @param highPrecisionOrgs the highPrecisionOrgs to set
1357 */
1358 public void setHighPrecisionOrgs(Boolean highPrecisionOrgs) {
1359 this.highPrecisionOrgs = highPrecisionOrgs;
1360 }
1361
1362 public void setOrthography(AnnotationOrthography orthography) {
1363 this.orthoAnnotation = orthography;
1364 }
1365
1366 public AnnotationOrthography getOrthography() {
1367 return orthoAnnotation;
1368 }
1369
1370 static Pattern punctPat = Pattern.compile("[\\p{Punct}]+");
1371 // The UTF characters are right and left double and single curly quotes
1372 static Pattern badMiddleTokens = Pattern.compile("[\u201c\u201d\u2018\u2019\'\\(\\)\"]+|^de$|^von$");
1373 }
|