001 /*
002 * NominalCoref.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * $Id: NominalCoref.java 12006 2009-12-01 17:24:28Z thomas_heitz $
013 */
014
015 package gate.creole.coref;
016
017 import java.util.*;
018
019 import gate.*;
020 import gate.creole.*;
021 import gate.util.*;
022
023 public class NominalCoref extends AbstractCoreferencer
024 implements ProcessingResource, ANNIEConstants {
025
026 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";
027
028 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";
029
030 /** --- */
031 private static final boolean DEBUG = false;
032
033 //annotation features
034 private static final String PERSON_CATEGORY = "Person";
035 private static final String JOBTITLE_CATEGORY = "JobTitle";
036 private static final String ORGANIZATION_CATEGORY = "Organization";
037 private static final String LOOKUP_CATEGORY = "Lookup";
038 private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun";
039
040
041 //scope
042 /** --- */
043 //private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
044 /** --- */
045 private String annotationSetName;
046 /** --- */
047 private AnnotationSet defaultAnnotations;
048 /** --- */
049 private HashMap anaphor2antecedent;
050
051 /* static {
052 ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
053 }*/
054
055 /** --- */
056 public NominalCoref() {
057 super("NOMINAL");
058 this.anaphor2antecedent = new HashMap();
059 }
060
061 /** Initialise this resource, and return it. */
062 public Resource init() throws ResourceInstantiationException {
063 return super.init();
064 } // init()
065
066 /**
067 * Reinitialises the processing resource. After calling this method the
068 * resource should be in the state it is after calling init.
069 * If the resource depends on external resources (such as rules files) then
070 * the resource will re-read those resources. If the data used to create
071 * the resource has changed since the resource has been created then the
072 * resource will change too after calling reInit().
073 */
074 public void reInit() throws ResourceInstantiationException {
075 this.anaphor2antecedent = new HashMap();
076 init();
077 } // reInit()
078
079
080 /** Set the document to run on. */
081 public void setDocument(Document newDocument) {
082
083 //0. precondition
084 // Assert.assertNotNull(newDocument);
085
086 super.setDocument(newDocument);
087 }
088
089 /** --- */
090 public void setAnnotationSetName(String annotationSetName) {
091 this.annotationSetName = annotationSetName;
092 }
093
094 /** --- */
095 public String getAnnotationSetName() {
096 return annotationSetName;
097 }
098
099 /**
100 * This method runs the coreferencer. It assumes that all the needed parameters
101 * are set. If they are not, an exception will be fired.
102 *
103 * The process goes like this:
104 * - Create a sorted list of Person and JobTitle annotations.
105 * - Loop through the annotations
106 * If it is a Person, we add it to the top of a stack.
107 * If it is a job title, we subject it to a series of tests. If it
108 * passes, we associate it with the Person annotation at the top
109 * of the stack
110 */
111 public void execute() throws ExecutionException{
112
113 HashMap anaphorToAntecedent = new HashMap();
114 Annotation[] nominalArray;
115
116 //0. preconditions
117 if (null == this.document) {
118 throw new ExecutionException("[coreference] Document is not set!");
119 }
120
121 //1. preprocess
122 preprocess();
123
124 // Out.println("Total annotations: " + defaultAnnotations.size());
125
126 // Get a sorted array of Tokens.
127 // The tests for job titles often require getting previous and subsequent
128 // tokens, so to save work, we create a single, sorted list of
129 // tokens.
130 Annotation[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).
131 toArray(new Annotation[0]);
132 java.util.Arrays.sort(tokens, new OffsetComparator());
133
134 // The current token is the token at the start of the current annotation.
135 int currentToken = 0;
136
137 // get Person entities
138 //FeatureMap personConstraint = new SimpleFeatureMapImpl();
139 //personConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
140 // PERSON_CATEGORY);
141 HashSet personConstraint = new HashSet();
142 personConstraint.add(PERSON_CATEGORY);
143 AnnotationSet people =
144 this.defaultAnnotations.get(personConstraint);
145
146 // get all JobTitle entities
147 //FeatureMap constraintJobTitle = new SimpleFeatureMapImpl();
148 //constraintJobTitle.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, JOBTITLE_CATEGORY);
149 HashSet jobTitleConstraint = new HashSet();
150 jobTitleConstraint.add(JOBTITLE_CATEGORY);
151
152 AnnotationSet jobTitles =
153 this.defaultAnnotations.get(jobTitleConstraint);
154
155 FeatureMap orgNounConstraint = new SimpleFeatureMapImpl();
156 orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
157 ORGANIZATION_NOUN_CATEGORY);
158 AnnotationSet orgNouns =
159 this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint);
160
161 HashSet orgConstraint = new HashSet();
162 orgConstraint.add(ORGANIZATION_CATEGORY);
163
164 AnnotationSet organizations =
165 this.defaultAnnotations.get(orgConstraint);
166
167 // combine them into a list of nominals
168 Set<Annotation> nominals = new HashSet();
169 if (people != null) {
170 nominals.addAll(people);
171 }
172 if (jobTitles != null) {
173 nominals.addAll(jobTitles);
174 }
175 if (orgNouns != null) {
176 nominals.addAll(orgNouns);
177 }
178 if (organizations != null) {
179 nominals.addAll(organizations);
180 }
181
182 // Out.println("total nominals: " + nominals.size());
183
184 // sort them according to offset
185 nominalArray = nominals.toArray(new Annotation[0]);
186 java.util.Arrays.sort(nominalArray, new OffsetComparator());
187
188 ArrayList<Annotation> previousPeople = new ArrayList<Annotation>();
189 ArrayList<Annotation> previousOrgs = new ArrayList<Annotation>();
190
191
192 // process all nominals
193 for (int i=0; i<nominalArray.length; i++) {
194 Annotation nominal = (Annotation)nominalArray[i];
195
196 // Find the current place in the tokens array
197 currentToken = advanceTokenPosition(nominal, currentToken, tokens);
198
199 //Out.print("processing nominal [" + stringValue(nominal) + "] ");
200
201 if (nominal.getType().equals(PERSON_CATEGORY)) {
202 // Add each Person entity to the beginning of the people list
203 // but don't add pronouns
204 Object[] personTokens = getSortedTokens(nominal);
205
206 if (personTokens.length == 1) {
207 Annotation personToken = (Annotation) personTokens[0];
208
209 String personCategory = (String)
210 personToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
211 if (personCategory.equals("PP") ||
212 personCategory.equals("PRP") ||
213 personCategory.equals("PRP$") ||
214 personCategory.equals("PRPR$")) {
215 //Out.println("ignoring personal pronoun");
216 continue;
217 }
218 }
219
220 previousPeople.add(0, nominal);
221 //Out.println("added person");
222 }
223 else if (nominal.getType().equals(JOBTITLE_CATEGORY)) {
224
225 // Look into the tokens to get some info about POS.
226 Object[] jobTitleTokens = getSortedTokens(nominal);
227
228 Annotation lastToken = (Annotation)
229 jobTitleTokens[jobTitleTokens.length - 1];
230
231 // Don't associate if the job title is not a singular noun
232 String tokenCategory = (String)
233 lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
234 // UNCOMMENT FOR SINGULAR PROPER NOUNS (The President, the Pope)
235 //if (! tokenCategory.equals("NN") &&
236 //! tokenCategory.equals("NNP")) {
237 if (! tokenCategory.equals("NN")) {
238 // Out.println("Not a singular noun");
239 continue;
240 }
241
242 // Don't associate it if it's part of a Person (eg President Bush)
243 if (overlapsAnnotations(nominal, people)) {
244 //Out.println("overlapping annotation");
245 continue;
246 }
247
248 Annotation previousToken;
249 String previousValue;
250
251 // Don't associate it if it's proceeded by a generic marker
252 if (currentToken != 0) {
253 previousToken = (Annotation) tokens[currentToken - 1];
254 previousValue = (String)
255 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
256 if (previousValue.equalsIgnoreCase("a") ||
257 previousValue.equalsIgnoreCase("an") ||
258 previousValue.equalsIgnoreCase("other") ||
259 previousValue.equalsIgnoreCase("another")) {
260 //Out.println("indefinite");
261 continue;
262 }
263 }
264
265 // nominals immediately followed by Person annotations:
266 // BAD:
267 // Chairman Bill Gates (title)
268 // GOOD:
269 // secretary of state, Colin Powell (inverted appositive)
270 // the home secretary David Blunkett (same but no comma,
271 // possible in transcriptions)
272 // "the" is a good indicator for apposition
273
274 // Luckily we have an array of all Person annotations in order...
275 if (i < nominalArray.length - 1) {
276 Annotation nextAnnotation = (Annotation) nominalArray[i+1];
277 if (nextAnnotation.getType().equals(PERSON_CATEGORY)) {
278 // is it preceded by a definite article?
279 previousToken = (Annotation) tokens[currentToken - 1];
280 previousValue = (String)
281 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
282
283 // Get all tokens between this and the next person
284 int interveningTokens =
285 countInterveningTokens(nominal, nextAnnotation,
286 currentToken, tokens);
287 if (interveningTokens == 0 &&
288 ! previousValue.equalsIgnoreCase("the")) {
289
290 // There is nothing between the job title and the person,
291 // like "Chairman Gates" -- do nothing.
292 //Out.println("immediately followed by Person");
293 continue;
294 }
295 else if (interveningTokens == 1) {
296 String tokenString =
297 (String) getFollowingToken(nominal,
298 currentToken, tokens)
299 .getFeatures().get(TOKEN_STRING_FEATURE_NAME);
300 //Out.print("STRING VALUE [" + tokenString + "] ");
301 if (! tokenString.equals(",") &&
302 ! tokenString.equals("-")) {
303 //Out.println("nominal and person separated by NOT [,-]");
304 continue;
305 }
306 }
307
308 // Did we get through all that? Then we must have an
309 // apposition.
310
311 anaphor2antecedent.put(nominal, nextAnnotation);
312 //Out.println("associating with " +
313 // stringValue(nextAnnotation));
314 continue;
315
316 }
317 }
318
319 // If we have no possible antecedents, create a new Person
320 // annotation.
321 if (previousPeople.size() == 0) {
322 FeatureMap personFeatures = new SimpleFeatureMapImpl();
323 personFeatures.put("ENTITY_MENTION_TYPE", "NOMINAL");
324 this.defaultAnnotations.add(nominal.getStartNode(),
325 nominal.getEndNode(),
326 PERSON_CATEGORY,
327 personFeatures);
328 //Out.println("creating as new Person");
329 continue;
330 }
331
332 // Associate this entity with the most recent Person
333 int personIndex = 0;
334
335 Annotation previousPerson =
336 (Annotation) previousPeople.get(personIndex);
337
338 // Don't associate if the two nominals are not the same gender
339 String personGender = (String)
340 previousPerson.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
341 String jobTitleGender = (String)
342 nominal.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
343 if (personGender != null && jobTitleGender != null) {
344 if (! personGender.equals(jobTitleGender)) {
345 //Out.println("wrong gender: " + personGender + " " +
346 // jobTitleGender);
347 continue;
348 }
349 }
350
351 //Out.println("associating with " +
352 // previousPerson.getFeatures()
353 // .get(TOKEN_STRING_FEATURE_NAME));
354
355 anaphor2antecedent.put(nominal, previousPerson);
356 }
357 else if (nominal.getType().equals(ORGANIZATION_CATEGORY)) {
358 // Add each organization entity to the beginning of
359 // the organization list
360 previousOrgs.add(0, nominal);
361 //Out.println("added organization");
362 }
363 else if (nominal.getType().equals(LOOKUP_CATEGORY)) {
364 // Don't associate it if we have no organizations
365 if (previousOrgs.size() == 0) {
366 //Out.println("no orgs");
367 continue;
368 }
369
370 // Look into the tokens to get some info about POS.
371 Annotation[] orgNounTokens =
372 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
373 nominal.getStartNode().getOffset(),
374 nominal.getEndNode().getOffset()).toArray(new Annotation[0]);
375 java.util.Arrays.sort(orgNounTokens, new OffsetComparator());
376 Annotation lastToken = (Annotation)
377 orgNounTokens[orgNounTokens.length - 1];
378
379 // Don't associate if the org noun is not a singular noun
380 if (! lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)
381 .equals("NN")) {
382 //Out.println("Not a singular noun");
383 continue;
384 }
385
386 //Out.println("organization noun");
387 // Associate this entity with the most recent Person
388 anaphor2antecedent.put(nominal, previousOrgs.get(0));
389 }
390 }
391
392 // This method does the dirty work of actually adding new annotations and
393 // coreferring.
394 generateCorefChains(anaphor2antecedent);
395 }
396
397 /**
398 * This method specifies whether a given annotation overlaps any of a
399 * set of annotations. For instance, JobTitles occasionally are
400 * part of Person annotations.
401 *
402 */
403 private boolean overlapsAnnotations(Annotation a,
404 AnnotationSet annotations) {
405 Iterator<Annotation> iter = annotations.iterator();
406 while (iter.hasNext()) {
407 Annotation current = iter.next();
408 if (a.overlaps(current)) {
409 return true;
410 }
411 }
412
413 return false;
414 }
415
416 /** Use this method to keep the current token pointer at the right point
417 * in the token list */
418 private int advanceTokenPosition(Annotation target, int currentPosition,
419 Object[] tokens) {
420 long targetOffset = target.getStartNode().getOffset().longValue();
421 long currentOffset = ((Annotation) tokens[currentPosition])
422 .getStartNode().getOffset().longValue();
423
424 if (targetOffset > currentOffset) {
425 while (targetOffset > currentOffset) {
426 currentPosition++;
427 currentOffset = ((Annotation) tokens[currentPosition])
428 .getStartNode().getOffset().longValue();
429 }
430 }
431 else if (targetOffset < currentOffset) {
432 while (targetOffset < currentOffset) {
433 currentPosition--;
434 currentOffset = ((Annotation) tokens[currentPosition])
435 .getStartNode().getOffset().longValue();
436 }
437 }
438
439 return currentPosition;
440 }
441
442 /** Return the number of tokens between the end of annotation 1 and the
443 * beginning of annotation 2. Will return 0 if they are not in order */
444 private int countInterveningTokens(Annotation first, Annotation second,
445 int currentPosition, Object[] tokens) {
446 int interveningTokens = 0;
447
448 long startOffset = first.getEndNode().getOffset().longValue();
449 long endOffset = second.getStartNode().getOffset().longValue();
450
451 long currentOffset = ((Annotation) tokens[currentPosition])
452 .getStartNode().getOffset().longValue();
453
454 while (currentOffset < endOffset) {
455 if (currentOffset >= startOffset) {
456 interveningTokens++;
457 }
458 currentPosition++;
459 currentOffset = ((Annotation) tokens[currentPosition])
460 .getStartNode().getOffset().longValue();
461 }
462 return interveningTokens;
463 }
464
465 /** Get the next token after an annotation */
466 private Annotation getFollowingToken(Annotation current, int currentPosition,
467 Object[] tokens) {
468 long endOffset = current.getEndNode().getOffset().longValue();
469 long currentOffset = ((Annotation) tokens[currentPosition])
470 .getStartNode().getOffset().longValue();
471 while (currentOffset < endOffset) {
472 currentPosition++;
473 currentOffset = ((Annotation) tokens[currentPosition])
474 .getStartNode().getOffset().longValue();
475 }
476 return (Annotation) tokens[currentPosition];
477 }
478
479 /** Get the text of an annotation */
480 private String stringValue(Annotation ann) {
481 Object[] tokens = getSortedTokens(ann);
482
483 StringBuffer output = new StringBuffer();
484 for (int i=0;i<tokens.length;i++) {
485 Annotation token = (Annotation) tokens[i];
486 output.append(token.getFeatures().get(TOKEN_STRING_FEATURE_NAME));
487 if (i < tokens.length - 1) {
488 output.append(" ");
489 }
490 }
491 return output.toString();
492 }
493
494 /** Get a sorted array of the tokens that make up a given annotation. */
495 private Annotation[] getSortedTokens(Annotation a) {
496 Annotation[] annotationTokens =
497 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
498 a.getStartNode().getOffset(),
499 a.getEndNode().getOffset()).toArray(new Annotation[0]);
500 java.util.Arrays.sort(annotationTokens, new OffsetComparator());
501 return annotationTokens;
502 }
503
504 /** --- */
505 public HashMap getResolvedAnaphora() {
506 return this.anaphor2antecedent;
507 }
508
509 /** --- */
510 private void preprocess() throws ExecutionException {
511
512 //0.5 cleanup
513 this.anaphor2antecedent.clear();
514
515 //1.get all annotation in the input set
516 if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
517 this.defaultAnnotations = this.document.getAnnotations();
518 }
519 else {
520 this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
521 }
522
523 //if none found, print warning and exit
524 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
525 Err.prln("Coref Warning: No annotations found for processing!");
526 return;
527 }
528
529 /*
530 // initialise the quoted text fragments
531 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
532
533 //if none then return
534 if (null == sentQuotes) {
535 this.quotedText = new Quote[0];
536 }
537 else {
538 this.quotedText = new Quote[sentQuotes.size()];
539
540 Object[] quotesArray = sentQuotes.toArray();
541 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
542
543 for (int i =0; i < quotesArray.length; i++) {
544 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
545 }
546 }
547 */
548 }
549
550 }
|