001 /*
002 * TestDocument.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Hamish Cunningham, 21/Jan/00
013 *
014 * $Id: TestDocument.java 12919 2010-08-03 10:31:37Z valyt $
015 */
016
017 package gate.corpora;
018
019 import java.io.*;
020 import java.net.URL;
021 import java.net.UnknownHostException;
022 import java.util.*;
023
024 import junit.framework.*;
025
026 import gate.*;
027 import gate.util.BomStrippingInputStreamReader;
028 import gate.util.Err;
029 import gate.util.GateException;
030 import gate.util.SimpleFeatureMapImpl;
031
032 /** Tests for the Document classes
033 */
034 public class TestDocument extends TestCase
035 {
036
037 /** Debug flag */
038 private static final boolean DEBUG = false;
039
/**
 * Creates the named test case and immediately runs {@link #setUp()}, so the
 * fixture fields are assigned as soon as an instance exists.
 *
 * @param name the test name handed on to the {@code TestCase} constructor
 */
public TestDocument(String name) {
  super(name);
  setUp();
}
042
043 /** Base of the test server URL */
044 protected static String testServer = null;
045
046 /** Name of test document 1 */
047 protected String testDocument1;
048
049 /** Fixture set up */
050 public void setUp() {
051
052 try{
053 // Gate.init();
054 testServer = Gate.getUrl().toExternalForm();
055 } catch (GateException e){
056 e.printStackTrace(Err.getPrintWriter());
057 }
058
059 testDocument1 = "tests/html/test2.htm";
060 } // setUp
061
062 /** Get the name of the test server */
063 public static String getTestServerName() {
064 if(testServer != null) return testServer;
065 else{
066 try { testServer = Gate.getUrl().toExternalForm(); }
067 catch(Exception e) { }
068 return testServer;
069 }
070 }
071
072 /** Test ordering */
073 public void testCompareTo() throws Exception{
074 Document doc1 = null;
075 Document doc2 = null;
076 Document doc3 = null;
077
078
079 doc1 = Factory.newDocument(new URL(testServer + "tests/def"));
080 doc2 = Factory.newDocument(new URL(testServer + "tests/defg"));
081 doc3 = Factory.newDocument(new URL(testServer + "tests/abc"));
082
083 assertTrue(doc1.compareTo(doc2) < 0);
084 assertTrue(doc1.compareTo(doc1) == 0);
085 assertTrue(doc1.compareTo(doc3) > 0);
086
087 } // testCompareTo()
088
089 /** Test loading of the original document content */
090
091 public void testOriginalContentPreserving() throws Exception {
092 Document doc = null;
093 FeatureMap params;
094 String encoding = "UTF-8";
095 String origContent;
096
097 // test the default value of preserve content flag
098 params = Factory.newFeatureMap();
099 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, new URL(testServer + testDocument1));
100 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
101 doc =
102 (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
103
104 origContent = (String) doc.getFeatures().get(
105 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
106
107 assertNull(
108 "The original content should not be preserved without demand.",
109 origContent);
110
111 params = Factory.newFeatureMap();
112 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
113 new URL(testServer + testDocument1));
114 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
115 params.put(Document.DOCUMENT_PRESERVE_CONTENT_PARAMETER_NAME, new Boolean(true));
116 doc =
117 (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
118
119 origContent = (String) doc.getFeatures().get(
120 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
121
122 assertNotNull("The original content is not preserved on demand.",
123 origContent);
124
125 assertTrue("The original content size is zerro.", origContent.length()>0);
126 } // testOriginalContentPreserving()
127
128 /** A comprehensive test */
129 public void testLotsOfThings() {
130
131 // check that the test URL is available
132 URL u = null;
133 try{
134 u = new URL(testServer + testDocument1);
135 } catch (Exception e){
136 e.printStackTrace(Err.getPrintWriter());
137 }
138
139 // get some text out of the test URL
140 BufferedReader uReader = null;
141 try {
142 uReader = new BomStrippingInputStreamReader(u.openStream());
143 assertEquals(uReader.readLine(), "<HTML>");
144 } catch(UnknownHostException e) { // no network connection
145 return;
146 } catch(IOException e) {
147 fail(e.toString());
148 }
149 /*
150 Document doc = new TextualDocument(testServer + testDocument1);
151 AnnotationGraph ag = new AnnotationGraphImpl();
152
153 Tokeniser t = ... doc.getContent()
154 tokenise doc using java stream tokeniser
155
156 add several thousand token annotation
157 select a subset
158 */
159 } // testLotsOfThings
160
161
162 public void testDocRender() throws Exception
163 {
164 Document doc = Factory.newDocument("Hi Mom");
165 doc.getAnnotations().add(new Long(0), new Long(2),
166 "Foo", new SimpleFeatureMapImpl());
167 String content = doc.toXml(doc.getAnnotations(), false);
168
169 // Will fail, content is "<Foo>Hi Mom</Foo>"
170 assertEquals("<Foo>Hi</Foo> Mom", content);
171 }
172
173
174 /** The reason this is method begins with verify and not with test is that it
175 * gets called by various other test methods. It is somehow a utility test
176 * method. It should be called on all gate documents having annotation sets.
177 */
178 public static void verifyNodeIdConsistency(gate.Document doc)throws Exception{
179 if (doc == null) return;
180 Map offests2NodeId = new HashMap();
181 // Test the default annotation set
182 AnnotationSet annotSet = doc.getAnnotations();
183 verifyNodeIdConsistency(annotSet,offests2NodeId, doc);
184 // Test all named annotation sets
185 if (doc.getNamedAnnotationSets() != null){
186 Iterator namedAnnotSetsIter =
187 doc.getNamedAnnotationSets().values().iterator();
188 while(namedAnnotSetsIter.hasNext()){
189 verifyNodeIdConsistency((gate.AnnotationSet) namedAnnotSetsIter.next(),
190 offests2NodeId,
191 doc);
192 }// End while
193 }// End if
194 // Test suceeded. The map is not needed anymore.
195 offests2NodeId = null;
196 }// verifyNodeIdConsistency();
197
198 /** This metod runs the test over an annotation Set. It is called from her
199 * older sister. Se above.
200 * @param annotSet is the annotation set being tested.
201 * @param offests2NodeId is the Map used to test the consistency.
202 * @param doc is used in composing the assert error messsage.
203 */
204 public static void verifyNodeIdConsistency(gate.AnnotationSet annotSet,
205 Map offests2NodeId,
206 gate.Document doc)
207 throws Exception{
208
209 if (annotSet == null || offests2NodeId == null) return;
210
211 Iterator<Annotation> iter = annotSet.iterator();
212 while(iter.hasNext()){
213 Annotation annot = iter.next();
214 String annotSetName = (annotSet.getName() == null)? "Default":
215 annotSet.getName();
216 // check the Start node
217 if (offests2NodeId.containsKey(annot.getStartNode().getOffset())){
218 assertEquals("Found two different node IDs for the same offset( "+
219 annot.getStartNode().getOffset()+ " ).\n" +
220 "START NODE is buggy for annotation(" + annot +
221 ") from annotation set " + annotSetName + " of GATE document :" +
222 doc.getSourceUrl(),
223 annot.getStartNode().getId(),
224 (Integer) offests2NodeId.get(annot.getStartNode().getOffset()));
225 }// End if
226 // Check the End node
227 if (offests2NodeId.containsKey(annot.getEndNode().getOffset())){
228 assertEquals("Found two different node IDs for the same offset("+
229 annot.getEndNode().getOffset()+ ").\n" +
230 "END NODE is buggy for annotation(" + annot+ ") from annotation"+
231 " set " + annotSetName +" of GATE document :" + doc.getSourceUrl(),
232 annot.getEndNode().getId(),
233 (Integer) offests2NodeId.get(annot.getEndNode().getOffset()));
234 }// End if
235 offests2NodeId.put(annot.getStartNode().getOffset(),
236 annot.getStartNode().getId());
237 offests2NodeId.put(annot.getEndNode().getOffset(),
238 annot.getEndNode().getId());
239 }// End while
240 }//verifyNodeIdConsistency();
241
242 /**
243 * Test to verify behaviour of the mimeType init parameter.
244 */
245 public void testExplicitMimeType() throws Exception {
246 // override the user config to make sure we DON'T add extra space on
247 // unpackMarkup when parsing XML, whatever is set in the user config file.
248 Object savedAddSpaceValue = Gate.getUserConfig().get(
249 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME);
250 Gate.getUserConfig().put(
251 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME, "false");
252
253 try {
254 String testXmlString = "<p>This is a <strong>TEST</strong>.</p>";
255 String xmlParsedContent = "This is a TEST.";
256 String htmlParsedContent = "This is a TEST.\n";
257
258 // if we create a Document from this string WITHOUT setting a mime type,
259 // it should be treated as plain text and not parsed.
260 FeatureMap docParams = Factory.newFeatureMap();
261 docParams.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
262 testXmlString);
263 docParams.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME,
264 Boolean.TRUE);
265
266 Document noMimeTypeDoc = (Document)Factory.createResource(
267 DocumentImpl.class.getName(), docParams);
268
269 assertEquals("Document created with no explicit mime type should have "
270 + "unparsed XML as content.", testXmlString,
271 noMimeTypeDoc.getContent().toString());
272
273 assertEquals("Document created with no explicit mime type should not "
274 + "have any Original markups annotations.", 0,
275 noMimeTypeDoc.getAnnotations(
276 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size());
277
278 Factory.deleteResource(noMimeTypeDoc);
279 noMimeTypeDoc = null;
280
281 // if we create the same document with an explicit mime type of text/xml,
282 // it should be parsed properly, and have two original markups
283 // annotations.
284 docParams.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, "text/xml");
285
286 Document xmlDoc = (Document)Factory.createResource(
287 DocumentImpl.class.getName(), docParams);
288
289 assertEquals("Document created with explicit mime type should have been "
290 + "parsed as XML.", xmlParsedContent,
291 xmlDoc.getContent().toString());
292
293 assertEquals("Document created with explicit mime type has wrong number "
294 + "of Original markups annotations.", 2,
295 xmlDoc.getAnnotations(
296 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size());
297
298 Factory.deleteResource(xmlDoc);
299 xmlDoc = null;
300
301 // if we create the same document with an explicit mime type of text/html,
302 // it should be parsed properly and have *4* original markups
303 // annotations, as the HTML parser creates enclosing <html> and <body>
304 // elements and a zero-length <head> annotation.
305 docParams.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, "text/html");
306
307 Document htmlDoc = (Document)Factory.createResource(
308 DocumentImpl.class.getName(), docParams);
309
310 assertEquals("Document created with explicit mime type should have been "
311 + "parsed as HTML.", htmlParsedContent,
312 htmlDoc.getContent().toString());
313
314 assertEquals("Document created with explicit mime type has wrong number "
315 + "of Original markups annotations.", 5,
316 htmlDoc.getAnnotations(
317 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).size());
318
319 Factory.deleteResource(htmlDoc);
320 htmlDoc = null;
321 }
322 finally {
323 // restore the saved value for ADD_SPACE_ON_MARKUP_UNPACK
324 if(savedAddSpaceValue == null) {
325 Gate.getUserConfig().remove(
326 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME);
327 }
328 else {
329 Gate.getUserConfig().put(
330 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME,
331 savedAddSpaceValue);
332 }
333 }
334 }
335
336 /** Test suite routine for the test runner */
337 public static Test suite() {
338 return new TestSuite(TestDocument.class);
339 } // suite
340
341 } // class TestDocument
|