001 /*
002 * TestXml.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 8/May/2000
013 *
014 * $Id: TestXml.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.xml;
018
019 import java.io.File;
020 import java.io.FileOutputStream;
021 import java.io.OutputStreamWriter;
022 import java.io.Writer;
023 import java.net.URL;
024 import java.util.*;
025 import java.text.NumberFormat;
026
027 import junit.framework.*;
028
029 import gate.*;
030 import gate.corpora.DocumentImpl;
031 import gate.creole.SerialAnalyserController;
032 import gate.util.Files;
033 import gate.util.Err;
034 import gate.util.persistence.PersistenceManager;
035 import gate.creole.ANNIEConstants;
036
037 //import org.w3c.www.mime.*;
038
039
040 /** Test class for XML facilities
041 *
042 */
043 public class TestXml extends TestCase
044 {
045 /** Debug flag */
046 private static final boolean DEBUG = false;
047
048 /** The encoding used in our tests*/
049 private static String workingEncoding="UTF-8";
050
/**
 * Creates a test case with the given name.
 *
 * @param name the test name, passed straight to the superclass constructor
 */
public TestXml(String name) {
  super(name);
}
053
054 /** Fixture set up */
055 public void setUp() {
056 } // setUp
057
058 public void testGateDocumentToAndFromXmlWithDifferentKindOfFormats()
059 throws Exception{
060 List urlList = new LinkedList();
061 List urlDescription = new LinkedList();
062 URL url = null;
063
064 url = Gate.getUrl("tests/xml/xces.xml");
065 assertTrue("Coudn't create a URL object for tests/xml/xces.xml ", url != null);
066 urlList.add(url);
067 urlDescription.add(" an XML document ");
068
069 url = Gate.getUrl("tests/xml/Sentence.xml");
070 assertTrue("Coudn't create a URL object for tests/xml/Sentence.xml",
071 url != null);
072 urlList.add(url);
073 urlDescription.add(" an XML document ");
074
075 url = Gate.getUrl("tests/html/test1.htm");
076 assertTrue("Coudn't create a URL object for tests/html/test.htm",url != null);
077 urlList.add(url);
078 urlDescription.add(" an HTML document ");
079
080 url = Gate.getUrl("tests/rtf/Sample.rtf");
081 assertTrue("Coudn't create a URL object for defg ",url != null);
082 urlList.add(url);
083 urlDescription.add(" a RTF document ");
084
085
086 url = Gate.getUrl("tests/email/test2.eml");
087 assertTrue("Coudn't create a URL object for defg ",url != null);
088 urlList.add(url);
089 urlDescription.add(" an EMAIL document ");
090
091 Iterator iter = urlList.iterator();
092 Iterator descrIter = urlDescription.iterator();
093 while(iter.hasNext()){
094 runCompleteTestWithAFormat((URL) iter.next(),(String)descrIter.next());
095 }// End While
096
097
098 }// testGateDocumentToAndFromXmlWithDifferentKindOfFormats
099
100 private void runCompleteTestWithAFormat(URL url, String urlDescription)
101 throws Exception{
102 // Load the xml Key Document and unpack it
103 gate.Document keyDocument = null;
104
105 FeatureMap params = Factory.newFeatureMap();
106 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, url);
107 params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, "false");
108 keyDocument = (Document)Factory.createResource("gate.corpora.DocumentImpl",
109 params);
110
111 assertTrue("Coudn't create a GATE document instance for " +
112 url.toString() +
113 " Can't continue." , keyDocument != null);
114
115 gate.DocumentFormat keyDocFormat = null;
116 keyDocFormat = gate.DocumentFormat.getDocumentFormat(
117 keyDocument, keyDocument.getSourceUrl()
118 );
119
120 assertTrue("Fail to recognize " +
121 url.toString() +
122 " as being " + urlDescription + " !", keyDocFormat != null);
123
124 // Unpack the markup
125 keyDocFormat.unpackMarkup(keyDocument);
126 // Verfy if all annotations from the default annotation set are consistent
127 gate.corpora.TestDocument.verifyNodeIdConsistency(keyDocument);
128
129 // Verifies if the maximum annotation ID on the GATE doc is less than the
130 // Annotation ID generator of the document.
131 verifyAnnotationIDGenerator(keyDocument);
132
133 // Save the size of the document and the number of annotations
134 long keyDocumentSize = keyDocument.getContent().size().longValue();
135 int keyDocumentAnnotationSetSize = keyDocument.getAnnotations().size();
136
137
138 // Export the Gate document called keyDocument as XML, into a temp file,
139 // using the working encoding
140 File xmlFile = null;
141 xmlFile = Files.writeTempFile(keyDocument.toXml(), workingEncoding );
142 assertTrue("The temp GATE XML file is null. Can't continue.",xmlFile != null);
143
144 // Load the XML Gate document form the tmp file into memory
145 gate.Document gateDoc = null;
146 gateDoc = gate.Factory.newDocument(xmlFile.toURI().toURL(), workingEncoding);
147
148 assertTrue("Coudn't create a GATE document instance for " +
149 xmlFile.toURI().toURL().toString() +
150 " Can't continue." , gateDoc != null);
151
152 gate.DocumentFormat gateDocFormat = null;
153 gateDocFormat =
154 DocumentFormat.getDocumentFormat(gateDoc,gateDoc.getSourceUrl());
155
156 assertTrue("Fail to recognize " +
157 xmlFile.toURI().toURL().toString() +
158 " as being a GATE XML document !", gateDocFormat != null);
159
160 gateDocFormat.unpackMarkup(gateDoc);
161 // Verfy if all annotations from the default annotation set are consistent
162 gate.corpora.TestDocument.verifyNodeIdConsistency(gateDoc);
163
164 // Save the size of the document snd the number of annotations
165 long gateDocSize = keyDocument.getContent().size().longValue();
166 int gateDocAnnotationSetSize = keyDocument.getAnnotations().size();
167
168 assertTrue("Exporting as GATE XML resulted in document content size lost." +
169 " Something went wrong.", keyDocumentSize == gateDocSize);
170
171 assertTrue("Exporting as GATE XML resulted in annotation lost." +
172 " No. of annotations missing = " +
173 Math.abs(keyDocumentAnnotationSetSize - gateDocAnnotationSetSize),
174 keyDocumentAnnotationSetSize == gateDocAnnotationSetSize);
175
176 // Verifies if the maximum annotation ID on the GATE doc is less than the
177 // Annotation ID generator of the document.
178 verifyAnnotationIDGenerator(gateDoc);
179
180 //Don't need tmp Gate XML file.
181 xmlFile.delete();
182 }//runCompleteTestWithAFormat
183
184 /** A test */
185 public void testUnpackMarkup() throws Exception{
186 // create the markupElementsMap map
187 Map markupElementsMap = null;
188 gate.Document doc = null;
189 /*
190 markupElementsMap = new HashMap();
191 // populate it
192 markupElementsMap.put ("S","Sentence");
193 markupElementsMap.put ("s","Sentence");
194 */
195 // Create the element2String map
196 Map anElement2StringMap = null;
197 anElement2StringMap = new HashMap();
198 // Populate it
199 anElement2StringMap.put("S","\n");
200 anElement2StringMap.put("s","\n");
201
202 doc = gate.Factory.newDocument(Gate.getUrl("tests/xml/xces.xml"), workingEncoding);
203
204 AnnotationSet annotSet = doc.getAnnotations(
205 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
206 assertEquals("For "+doc.getSourceUrl()+" the number of annotations"+
207 " should be:758",758,annotSet.size());
208
209 gate.corpora.TestDocument.verifyNodeIdConsistency(doc);
210
211 // Verifies if the maximum annotation ID on the GATE doc is less than the
212 // Annotation ID generator of the document.
213 verifyAnnotationIDGenerator(doc);
214
215 } // testUnpackMarkup()
216
217 /*
218 * This method runs ANNIE with defaults on a document, then saves
219 * it as a GATE XML document and loads it back. All the annotations on the
220 * loaded document should be the same as the original ones.
221 *
222 * It also verifies if the matches feature still holds after an export/import to XML
223 */
224 public void testAnnotationConsistencyForSaveAsXml()throws Exception{
225 // Load a document from the test repository
226 //Document origDoc = gate.Factory.newDocument(Gate.getUrl("tests/xml/gateTestSaveAsXML.xml"));
227 String testDoc = gate.util.Files.getGateResourceAsString("gate.ac.uk/tests/xml/gateTestSaveAsXML.xml");
228 Document origDoc = gate.Factory.newDocument(testDoc);
229
230 // Verifies if the maximum annotation ID on the origDoc is less than the
231 // Annotation ID generator of the document.
232 verifyAnnotationIDGenerator(origDoc);
233
234 // Load ANNIE with defaults and run it on the document
235 SerialAnalyserController annie = (SerialAnalyserController)
236 PersistenceManager.loadObjectFromFile(new File(new File(
237 Gate.getPluginsHome(), ANNIEConstants.PLUGIN_DIR),
238 ANNIEConstants.DEFAULT_FILE));
239 assertTrue("ANNIE not loaded!", annie != null);
240 Corpus c = Factory.newCorpus("test");
241 c.add(origDoc);
242 annie.setCorpus(c);
243 annie.execute();
244
245 // SaveAS XML and reload the document into another GATE doc
246 // Export the Gate document called origDoc as XML, into a temp file,
247 // using the working encoding
248 File xmlFile = Files.writeTempFile(origDoc.toXml(),workingEncoding);
249 System.out.println("Saved to temp file :" + xmlFile.toURI().toURL());
250
251 Document reloadedDoc = gate.Factory.newDocument(xmlFile.toURI().toURL(), workingEncoding);
252 // Verifies if the maximum annotation ID on the origDoc is less than the
253 // Annotation ID generator of the document.
254 verifyAnnotationIDGenerator(reloadedDoc);
255
256 // Verify if the annotations are identical in the two docs.
257 Map origAnnotMap = buildID2AnnotMap(origDoc);
258 Map reloadedAnnMap = buildID2AnnotMap(reloadedDoc);
259
260 //Verifies if the reloaded annotations are the same as the original ones
261 verifyIDConsistency(origAnnotMap, reloadedAnnMap);
262
263 // Build the original Matches map
264 // ID -> List of IDs
265 Map origMatchesMap = buildMatchesMap(origDoc);
266 // Verify the consistency of matches
267 // Compare every orig annotation pointed by the MatchesMap with the reloadedAnnot
268 // extracted from the reloadedMAp
269 for(Iterator it = origMatchesMap.keySet().iterator(); it.hasNext();){
270 Integer id = (Integer)it.next();
271 Annotation origAnnot = (Annotation) origAnnotMap.get(id);
272 assertTrue("Couldn't find an original annot with ID=" + id, origAnnot != null);
273 Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
274 assertTrue("Couldn't find a reloaded annot with ID=" + id, reloadedAnnot != null);
275 compareAnnot(origAnnot,reloadedAnnot);
276 // Iterate through the matches list and repeat the comparison
277 List matchesList = (List) origMatchesMap.get(id);
278 for (Iterator itList = matchesList.iterator(); itList.hasNext();){
279 Integer matchId = (Integer) itList.next();
280 Annotation origA = (Annotation) origAnnotMap.get(matchId);
281 assertTrue("Couldn't find an original annot with ID=" + matchId, origA != null);
282 Annotation reloadedA = (Annotation) reloadedAnnMap.get(matchId);
283 assertTrue("Couldn't find a reloaded annot with ID=" + matchId, reloadedA != null);
284 compareAnnot(origA, reloadedA);
285 }// End for
286 }// End for
287 // Clean up the XMl file
288 xmlFile.delete();
289 }// End testAnnotationIDConsistencyForSaveAsXml
290
291 /**
292 * Builds a Map based on the matches feature of some annotations. The goal is to
293 * use this map to validate the annotations from the reloaded document.
294 * In case no Annot has the matches feat, will return an Empty MAP
295 * @param doc The document of which annotations will be used to construct the map
296 * @return A Map from Annot ID -> Lists of Annot IDs
297 */
298 private Map buildMatchesMap(Document doc){
299 Map matchesMap = new HashMap();
300 // Scan the default annotation set
301 AnnotationSet annotSet = doc.getAnnotations();
302
303 helperBuildMatchesMap(annotSet, matchesMap);
304 // Scan all named annotation sets
305 if (doc.getNamedAnnotationSets() != null){
306 for ( Iterator namedAnnotSetsIter = doc.getNamedAnnotationSets().values().iterator();
307 namedAnnotSetsIter.hasNext(); ){
308 helperBuildMatchesMap((gate.AnnotationSet) namedAnnotSetsIter.next(), matchesMap);
309 }// End while
310 }// End if
311 return matchesMap;
312 }// End of buildMatchesMap()
313
314 /**
315 * This is a helper metod. It scans an annotation set and adds the ID of the annotations
316 * which have the matches feature to the map.
317 * @param sourceAnnotSet The annotation set investigated
318 * @param aMap
319 */
320 private void helperBuildMatchesMap(AnnotationSet sourceAnnotSet, Map aMap ){
321
322 for (Iterator it = sourceAnnotSet.iterator(); it.hasNext();){
323 Annotation a = (Annotation) it.next();
324 FeatureMap aFeatMap = a.getFeatures();
325 // Skip those annotations who don't have features
326 if (aFeatMap == null) continue;
327 // Extract the matches feat
328 List matchesVal = (List) aFeatMap.get("matches");
329 if (matchesVal == null) continue;
330 Integer id = a.getId();
331 aMap.put(id,matchesVal);
332 }//End for
333
334 }// End of helperBuildMatchesMap()
335
336 /**
337 * This method tests if the generator for new Annotation IDs is greather than the
338 * maximum Annotation ID present in the GATE document. In oter words, it ensures that
339 * new Annotations will receive an UNIQUE ID.
340 *
341 * @param aDoc The GATE document being tested
342 */
343 protected void verifyAnnotationIDGenerator(gate.Document aDoc){
344 // Creates a MAP containing all the annotations of the document.
345 // In doing so, it also tests if there are annotations with the same ID.
346 Map id2AnnotationMap = buildID2AnnotMap(aDoc);
347
348 if (id2AnnotationMap == null || id2AnnotationMap.isEmpty()){
349 //System.out.println("No annotations found on the document! Nothing to test.");
350 return;
351 }
352
353 // Get the key set of the Map and sort them
354 Set keysSet = id2AnnotationMap.keySet();
355 TreeSet sortedSet = new TreeSet(keysSet);
356 // Get the highest Annotation ID
357 Integer maxAnnotId = (Integer) sortedSet.last();
358 // Compare its value to the one hold by the document's ID generator
359 Integer generatorId = ((DocumentImpl)aDoc).getNextAnnotationId();
360
361 // System.out.println("maxAnnotid = " + maxAnnotId + " generatorID = " + generatorId);
362
363 assertTrue("Annotation ID generator["+generatorId+"] on document [" + aDoc.getSourceUrl() +
364 "] was equal or less than the MAX Annotation ID["+maxAnnotId+"] on the document."+
365 " This may lead to Annotation ID conflicts.", generatorId.intValue() > maxAnnotId.intValue());
366
367
368 }// End of verifyAnnotationIDGenerator()
369
370 /**
371 * Verifies if the two maps hold annotations with the same ID. The only thing not checked
372 * are the features, as some of them could be lost in the serialization/deserialization process
373 * @param origAnnotMap A map by ID, containing the original annotations
374 * @param reloadedAnnMap A map by ID, containing the recreated annotations
375 */
376 private void verifyIDConsistency(Map origAnnotMap, Map reloadedAnnMap) {
377 assertEquals("Found a different number of annot in both documents.",
378 origAnnotMap.keySet().size(), reloadedAnnMap.keySet().size());
379
380 // List orig = new ArrayList(origAnnotMap.keySet());
381 // Collections.sort(orig);
382 // System.out.println("ORIG SET =" + orig);
383 //
384 // List rel = new ArrayList(reloadedAnnMap.keySet());
385 // Collections.sort(rel);
386 // System.out.println("REL SET =" + rel);
387 //
388
389 for (Iterator it = origAnnotMap.keySet().iterator(); it.hasNext();){
390 Integer id = (Integer) it.next();
391 Annotation origAnn = (Annotation) origAnnotMap.get(id);
392 Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
393
394 assertTrue("Annotation with ID="+ id +" was not found in the reloaded document.", reloadedAnnot != null);
395 compareAnnot(origAnn, reloadedAnnot);
396
397 }// End for
398 }// End of verifyIDConsistency()
399
400 /**
401 * Thes if two annotatiosn are the same, except their features.
402 * @param origAnn
403 * @param reloadedAnnot
404 */
405 private void compareAnnot(Annotation origAnn, Annotation reloadedAnnot) {
406 assertTrue("Found original and reloaded annot without the same ID!",
407 origAnn.getId().equals(reloadedAnnot.getId()));
408 assertTrue("Found original and reloaded annot without the same TYPE!\n"+
409 "Original was ["+origAnn.getType()+"] and reloaded was ["+reloadedAnnot.getType()+"].",
410 origAnn.getType().equals(reloadedAnnot.getType()));
411 assertTrue("Found original and reloaded annot without the same START offset!",
412 origAnn.getStartNode().getOffset().equals(reloadedAnnot.getStartNode().getOffset()));
413 assertTrue("Found original and reloaded annot without the same END offset!",
414 origAnn.getEndNode().getOffset().equals(reloadedAnnot.getEndNode().getOffset()));
415 }// End of compareAnnot()
416
417
418 private Map addAnnotSet2Map(AnnotationSet annotSet, Map id2AnnMap){
419 for (Iterator it = annotSet.iterator(); it.hasNext();){
420 Annotation a = (Annotation) it.next();
421 Integer id = a.getId();
422
423 assertTrue("Found two annotations(one with type = " + a.getType() +
424 ")with the same ID=" + id, !id2AnnMap.keySet().contains(id));
425
426 id2AnnMap.put(id, a);
427 }// End for
428 return id2AnnMap;
429 }
430
431 /**
432 * Scans a target Doc for all Annotations and builds a map (from anot ID to annot) in the process
433 * I also checks to see if there are two annotations with the same ID.
434 * @param aDoc The GATE doc to be scaned
435 * @return a Map ID2Annot
436 */
437 private Map buildID2AnnotMap(Document aDoc){
438 Map id2AnnMap = new HashMap();
439 // Scan the default annotation set
440 AnnotationSet annotSet = aDoc.getAnnotations();
441 addAnnotSet2Map(annotSet, id2AnnMap);
442 // Scan all named annotation sets
443 if (aDoc.getNamedAnnotationSets() != null){
444 for ( Iterator namedAnnotSetsIter = aDoc.getNamedAnnotationSets().values().iterator();
445 namedAnnotSetsIter.hasNext(); ){
446
447 addAnnotSet2Map((gate.AnnotationSet) namedAnnotSetsIter.next(), id2AnnMap);
448 }// End while
449 }// End if
450 return id2AnnMap;
451 }// End of buildID2AnnotMap()
452
453 /** Test suite routine for the test runner */
454 public static Test suite() {
455 return new TestSuite(TestXml.class);
456 } // suite
457
458 } // class TestXml
|