001 /*
002 * Sgml2Xml.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 4/July/2000
013 *
014 * $Id: Sgml2Xml.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.sgml;
018
019 import java.io.File;
020 import java.io.IOException;
021 import java.net.MalformedURLException;
022 import java.util.*;
023
024 import gate.Document;
025 import gate.util.Files;
026
027
028 /**
029 * Not so fast...
030 * This class is not a realy Sgml2Xml convertor.
031 * It takes an SGML document and tries to prepare it for an XML parser
032 * For a true conversion we need an Java SGML parser...
033 * If you know one let me know....
034 *
035 * What does it do:
036 * <ul>
037 * <li>If it finds something like this : <element attribute = value>
038 * it will produce: <element attribute = "value">
039 * <li>If it finds something like this : <element something
040 * attribute2=value>it will produce : <element
041 * defaultAttribute="something" attribute2="value">
042 * <li>If it finds : <element att1='value1 value2' att2="value2
043 * value3"> it will produce: <element att1="value1 value2"
044 * att2="value2 value3">
045 * <li>If it finds : <element1> <elem>text </element1>
046 * will produce: <element1> <elem>text<elem>
047 * </element1>
048 * <li>If it find : <element1> <elem>[white spaces]
049 * </element1>,
050 * it will produce:<element1> <elem/>[white spaces]<
051 * /element1>
052 * </ul>
053 * What doesn't:
054 * <ul>
055 * <li>Doesn't expand the entities. So the entities from the SGML document
056 * must be resolved by the XML parser
057 * <li>Doesn't replace internal entities with their corresponding value
058 * </ul>
059 */
060
061 public class Sgml2Xml{
062
063 /** Debug flag */
064 private static final boolean DEBUG = false;
065
066 /**
067 * The constructor initialises some member fields
068 * @param SgmlDoc the content of the Sgml document that will be modified
069 */
070 public Sgml2Xml(String SgmlDoc){
071 // create a new modifier
072 m_modifier = new StringBuffer(SgmlDoc);
073 // create a new dobiousElements list
074 // se the explanatin at the end of the class
075 dubiousElements = new ArrayList();
076 stack = new Stack();
077 }
078
079 /**
080 * The other constructor
081 * @param doc The Gate document that will be transformed to XML
082 */
083 public Sgml2Xml(Document doc){
084 // set as a member
085 m_doc = doc;
086
087 // create a new modifier
088 m_modifier = new StringBuffer(m_doc.getContent().toString());
089
090 // create a new dobiousElements list
091 // se the explanatin at the end of the class
092 dubiousElements = new ArrayList();
093 stack = new Stack();
094
095 }
096
097 /* I keep this just in case I need some more debuging
098
099 public static void main(String[] args){
100 Sgml2Xml convertor =
101 new Sgml2Xml("<w VVI='res trtetre\" relu = \"stop\">say
102 <w VBZ>is\n<trunc> <w UNC>th </trunc>");
103 try{
104 Out.println(convertor.convert());
105 } catch (Exception e){
106 e.printStackTrace(Err.getPrintWriter());
107 }
108 }
109 */
110
111 /**
112 * It analises the char that was red in state 1
113 * If it finds '<' it then goes to state 2
114 * Otherwise it stays in state 1 and keeps track about the text that is not
115 * white spaces.
116 */
117 private void doState1(char currChar){
118 if ('<' == currChar){
119 // change to state 2
120 m_currState = 2;
121 if (!stack.isEmpty()){
122 // peek the element from the top of the stack
123 CustomObject o = (CustomObject) stack.peek();
124 // set some properties for this element
125 // first test to find out if text folows this element charPos > 0
126 if (charPos > 0){
127 // this is not an empty element because there is text that follows
128 // set the element from the top of the stack to be a non empty one
129 o.setClosePos(charPos);
130 o.setEmpty(false);
131 // reset the charPos
132 charPos = 0;
133 }//if (charPos > 0)
134 }//if (!stack.isEmpty())
135 }//if ('<' == m_currChar)
136 // if currChar is not whiteSpace then save the position of the last
137 // char that was read
138 if (('<' != currChar) && !isWhiteSpace(currChar))
139 charPos = m_cursor;
140 }//doState1
141
142 /**
143 We came from state 1 and just read '<'
144 If currChar == '/' -> state 11
145 If is a char != white spaces -> state 3
146 stay in state 2 while there are only white spaces
147 */
148 private void doState2(char currChar){
149 if ('/' == currChar){
150 // go to state 11
151 m_currState = 11;
152 }
153 // if currChar is a char != white spaces then go to state 3
154 if (('/' != m_currChar) && !isWhiteSpace(m_currChar)){
155 // save the position where starts the element's name
156 // we need that in order to be able to read the current tag name
157 // this name it will be read from m_modifier using the substring() method
158 elemNameStart = m_cursor -1;
159 // go to state 3
160 m_currState = 3;
161 }
162 }// doState2
163
164 /**
165 * Just read the first char from the element's name and now analize the next
166 * char.
167 * If '>' the elem name was a single char -> state 1
168 * IF is WhiteSpaces -> state 4
169 * Otherwise stay in state 3 and read the elemnt's name
170 */
171 private void doState3(char currChar){
172 if ( '>' == currChar ){
173
174 // save the pos where the element's name ends
175 elemNameEnd = m_cursor - 1;
176
177 // this is also the pos where to insert '/' for empty elements.
178 // In this case we have this situation <w> sau < w>
179 closePos = m_cursor - 1;
180
181 // get the name of the element
182 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
183
184 // we put the element into stack
185 // we think in this point that the element is empty...
186 performFinalAction(elemName, closePos);
187
188 // go to state 1
189 m_currState = 1;
190 }
191 if (isWhiteSpace(currChar)){
192 // go to state 4
193 m_currState = 4;
194
195 // save the pos where the element's name ends
196 elemNameEnd = m_cursor - 1;
197
198 // get the name of the element
199 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
200 }
201 }// doState3
202
203 /**
204 * We read the name of the element and we prepare for '>' or attributes
205 * '>' -> state 1
206 * any char !- white space -> state 5
207 */
208 private void doState4(char currChar){
209 if ( '>' == currChar ){
210 // this is also the pos where to insert '/' for empty elements in this case
211 closePos = m_cursor -1 ;
212
213 // we put the element into stack
214 // we think in this point that the element is empty...
215 performFinalAction(elemName, closePos);
216
217 // go to state 1
218 m_currState = 1;
219 }
220 if (( '>' != currChar ) && !isWhiteSpace(currChar)){
221 // we just read the first char from the attrib name or attrib value..
222 // go to state 5
223 m_currState = 5;
224
225 // remember the position where starts the attrib or the value of an attrib
226 attrStart = m_cursor - 1;
227 }
228 } // doState4
229
230 /**
231 * '=' -> state 6
232 * '>' -> state 4 (we didn't read an attribute but a value of the
233 * defaultAtt )
234 * WS (white spaces) we don't know yet if we read an attribute or the value
235 * of the defaultAttr -> state 10
236 * This state modifies the content onf m_modifier ... it adds text
237 */
238 private void doState5(char currChar){
239 if ( '=' == currChar )
240 m_currState = 6;
241 if ( '>' == currChar ){
242 // this mean that the attribute was a value and we have to create
243 // a default attribute
244 // the same as in state 10
245 attrEnd = m_cursor - 1 ;
246 m_modifier.insert(attrEnd,'"');
247 m_modifier.insert(attrStart,"defaultAttr=\"");
248
249 // go to state 4
250 m_currState = 4;
251
252 // parse again the entire sequence from state 4 before reading any char
253 m_cursor = attrStart;
254 }
255 if (isWhiteSpace(currChar)){
256 // go to state 10
257 m_currState = 10;
258
259 // record the position where ends this attribute
260 attrEnd = m_cursor - 1;
261 }
262 } // doState5
263
264 /**
265 * IF we read ' or " then we have to get prepared to read everything until
266 * the next ' or "
267 * If we read a char then -> state 8;
268 * Stay here while we read WS
269 */
270 private void doState6(char currChar){
271 if ( ('\'' == currChar) || ('"' == currChar) ){
272 endPair = currChar;
273 if ('\'' == currChar){
274
275 // we have to replace ' with "
276 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
277 }
278 m_currState = 7;
279 }
280 if ( ('\'' != currChar) && ('"' != currChar) && !isWhiteSpace(currChar)){
281
282 // this means that curChar is any char
283 m_currState = 8;
284
285 // every value must be inside this pair""
286 m_modifier.insert(m_cursor - 1, '"');
287
288 // insert implies the modification of m_cursor
289 // we increment m_cursor in order to say in the same position and to
290 // anulate the efect of insert.
291 m_cursor ++;
292 }
293 }// doState6
294
295 /**
296 * If we find the pair ' or " go to state 9
297 * Otherwhise read everything and stay in state 7
298 * If in state 7 we read '>' then we add automaticaly a " at the end and go
299 * to state 1
300 */
301 private void doState7(char currChar){
302 //if ( ('\'' == currChar) || ('"' == currChar) ){
303 if ( endPair == currChar ){
304 if ('\'' == currChar){
305
306 // we have to replace ' with "
307 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
308 }
309 // reset the endPair
310 endPair = ' ';
311 m_currState = 9;
312 }
313
314 if ('>' == currChar){
315 // go to state 1
316 m_currState = 1;
317
318 // insert the final " ata the end
319 m_modifier.insert(m_cursor - 1, '"');
320
321 // go to te current possition (because of insert)
322 m_cursor ++;
323
324 performFinalAction(elemName, m_cursor - 1);
325 }
326
327 }// doState7
328
329 /**
330 * If '>' go to state 1
331 * If WS go to state 9
332 * Stays in state 8 and read the attribute's value
333 */
334 private void doState8(char currChar){
335
336 if ('>' == currChar){
337 // go to state 1
338 m_currState = 1;
339
340 // complete the end " ( <elem attr="value> )
341 m_modifier.insert(m_cursor - 1, '"');
342
343 // go to te current possition (because of insert)
344 m_cursor ++;
345
346 // we finished to read a beggining tag
347 // see the method definition for more details
348 performFinalAction(elemName, m_cursor - 1);
349 }
350 if (isWhiteSpace(currChar)){
351 // go to state 9
352 m_currState = 9;
353
354 // add the ending " char
355 m_modifier.insert(m_cursor - 1, '"');
356
357 // increment the cursor in order to anulate the effect of insert
358 m_cursor ++;
359 }
360 } // doState8
361 /**
362 * Here we prepare to read another attrib, value pair (any char -> state 5)
363 * If '>' we just read a beggining tag -> state 1
364 * Stay here while read WS
365 */
366 private void doState9(char currChar){
367 if ('>' == currChar){
368 // go to state 1
369 m_currState = 1;
370
371 // add the object to the stack
372 performFinalAction(elemName, m_cursor - 1);
373 }
374 if (('>' != currChar) && !isWhiteSpace(m_currChar)){
375 // this is the same as state 4->5
376 m_currState = 5;
377 attrStart = m_cursor - 1;
378 }
379 }//doState9
380
381 /**
382 * If any C -> state 4
383 * If '=' state 6
384 * Stays here while reads WS
385 */
386 private void doState10(char currChar){
387 if ('=' == currChar)
388 m_currState = 6;
389 if ( ('=' != currChar) && !isWhiteSpace(currChar)){
390 // this mean that the attribute was a value and we have to create
391 // a default attribute
392 m_modifier.insert(attrEnd,'"');
393 m_modifier.insert(attrStart,"defaultAttr=\"");
394
395 // go to state 4
396 m_currState = 4;
397
398 m_cursor = attrStart;
399 }
400 }// doState10
401
402 /**
403 * We are preparing to read the and definition of an element
404 * Stays in this state while reading WS
405 */
406 private void doState11(char currChar){
407 if (!isWhiteSpace(currChar)){
408 m_currState = 12;
409 elemNameStart = m_cursor - 1;
410 }
411 } // doState11
412
413 /**
414 * Here we read the element's name ...this is an end tag
415 * Stays here while reads a char
416 */
417 private void doState12(char currChar) {
418 if ('>' == currChar){
419 elemNameEnd = m_cursor - 1;
420 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
421 performActionWithEndElem(elemName);
422 m_currState = 1;
423 }
424 if (isWhiteSpace(currChar)){
425 m_currState = 13;
426 elemNameEnd = m_cursor - 1;
427 }
428 }//doState12
429
430 /**
431 * If '>' -> state 1
432 * Stays here while reads WS
433 */
434 private void doState13(char currChar) {
435 if ('>' == currChar){
436 elemName = m_modifier.substring(elemNameStart,elemNameEnd);
437 performActionWithEndElem(elemName);
438 m_currState = 1;
439 }
440 } // doState13
441
442 /**
443 This method is responsable with document conversion
444 */
445 public String convert()throws IOException,MalformedURLException {
446 while (thereAreCharsToBeProcessed()) {
447 // read() gets the next char and increment the m_cursor
448 m_currChar = read();
449 switch(m_currState){
450 case 1: doState1(m_currChar);break;
451 case 2: doState2(m_currChar);break;
452 case 3: doState3(m_currChar);break;
453 case 4: doState4(m_currChar);break;
454 case 5: doState5(m_currChar);break;
455 case 6: doState6(m_currChar);break;
456 case 7: doState7(m_currChar);break;
457 case 8: doState8(m_currChar);break;
458 case 9: doState9(m_currChar);break;
459 case 10: doState10(m_currChar);break;
460 case 11: doState11(m_currChar);break;
461 case 12: doState12(m_currChar);break;
462 case 13: doState13(m_currChar);break;
463 }// switch(m_currState)
464 }// while (thereAreCharsToBeProcessed())
465
466 // put all the elements from the stack into the dubiousElements list
467 // we do that in order to colect all the dubious elements
468 while (!stack.isEmpty()) {
469 CustomObject obj = (CustomObject) stack.pop();
470 dubiousElements.add(obj);
471 }
472
473 // sort the dubiousElements list descending on closePos...
474 // This is vital for the alghorithm because we have to make
475 // all the modifications from the bottom to the top...
476 // If we fail to do that, insert will change indices and
477 // CustomObject.getClosePos() will not be acurate anymore.
478 Collections.sort(dubiousElements, new MyComparator());
479
480 //here we resolve all the dubious Elements...
481 // see the description of makeFinalModifications() method
482 ListIterator listIterator = dubiousElements.listIterator();
483 while (listIterator.hasNext()){
484 CustomObject obj = (CustomObject) listIterator.next();
485 makeFinalModifications(obj);
486 }
487
488 //finally add the XML prolog
489 m_modifier.insert(0,"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
490 //Out.println(m_modifier.toString());
491 /*
492 // get a InputStream from m_modifier and write it into a temp file
493 // finally return the URI of the new XML document
494 ByteArrayInputStream is = new ByteArrayInputStream(
495 m_modifier.toString().getBytes()
496 );
497 */
498 // this method is in gate.util package
499 File file = Files.writeTempFile(m_modifier.toString(),"UTF-8");
500
501 //return m_doc.getSourceURL().toString();
502 return file.toURI().toURL().toString();
503 }// convert()
504
505 /**
506 * This method tests to see if there are more char to be read
507 * It will return false when there are no more chars to be read
508 */
509 private boolean thereAreCharsToBeProcessed() {
510 if (m_cursor < m_modifier.length()) return true;
511 else return false;
512 }//thereAreCharsToBeProcessed
513
514 /**
515 * This method reads a char and increments the m_cursor
516 */
517 private char read(){
518 return m_modifier.charAt(m_cursor ++);
519 }//read
520
521 /**
522 * This is the action when we finished to read the entire tag
523 * The action means that we put the tag into stack and consider that is empty
524 * as default
525 */
526 private void performFinalAction(String elemName, int pos) {
527 // create anew CustomObject
528 CustomObject obj = new CustomObject();
529
530 // set its properties
531 obj.setElemName(elemName);
532 obj.setClosePos(pos);
533
534 // default we consider every element to be empty
535 // in state 1 we modify that if the element is followed by text
536 obj.setEmpty(true);
537 stack.push(obj);
538 } // performFinalAction
539
540 /**
541 * This is the action performed when an end tag is read.
542 * The action consists in colecting all the dubiosElements(elements without
543 * an end tag). They are considered dubious because we don't know if they
544 * are empty or may be closed... Only the DTD can provide this information.
545 * We don't have a DTD so we will consider that all dubious elements
546 * followed by text will close at the end of the text...
547 * If a dubious element is followed by another element then is
548 * automaticaly considered an empty element.
549 *
550 * @param elemName is the the name of the end tag that was read
551 */
552 private void performActionWithEndElem(String elemName) {
553 CustomObject obj = null;
554 boolean stop = false;
555
556 // get all the elements that are dubious from the stack
557 // the iteration will stop when an element is equal with elemName
558 while (!stack.isEmpty() && !stop){
559
560 // eliminate the object from the stack
561 obj = (CustomObject) stack.pop();
562
563 //if its elemName is equal with the param elemName we stop the itteration
564 if (obj.getElemName().equalsIgnoreCase(elemName)) stop = true;
565
566 // otherwhise add the element to the doubiousElements list
567 else dubiousElements.add(obj);
568 }
569 }//performActionWithEndElem
570
571 /**
572 * This method is called after we read the entire SGML document
573 * It resolves the dobious Elements this way:
574 * <ul>
575 * <li>
576 * 1. We don't have a DTD so we will consider that all dubious elements
577 * followed by text will close at the end of the text...
578 * <li>
579 * 2. If a dubious element is followed by another element then is
580 automaticaly considered an empty element.
581 *
582 * An element is considered dubious when we don't know if it is empty
583 * or may be closed...
584 *
585 * @param aCustomObject an object from the dubiousElements list
586 */
587 private void makeFinalModifications(CustomObject aCustomObject) {
588 String endElement = null;
589 // if the element is empty then we add / before > like this:
590 // <w> -> <w/>
591 if (aCustomObject.isEmpty())
592 m_modifier.insert(aCustomObject.getClosePos(),"/");
593 // otherwhise we create an end element
594 // <w> -> </w>
595 else{
596 // create the end element
597 endElement = "</" + aCustomObject.getElemName() + ">";
598 // insert it where the closePos indicates
599 m_modifier.insert(aCustomObject.getClosePos(), endElement);
600 }
601 } // makeFinalModifications
602
603 /**
604 * Tests if c is a white space char
605 */
606 private boolean isWhiteSpace(char c) {
607 return Character.isWhitespace(c);
608 }
609
610 // this is a gate Document... It's content will be transferred to
611 // m_modifier
612 private Document m_doc = null;
613
614 // this is the modifier that will transform an SGML document into an
615 // XML document
616 private StringBuffer m_modifier = null;
617
618 // we need the stack to be able to remember the order of the tags
619 private Stack stack = null;
620
621 // this is a list with all the tags that are not colsed...
622 // some of them are empty tags and some of them are not...
623 private List dubiousElements = null;
624
625 // this is tre current position inside the modifier
626 private int m_cursor = 0;
627
628 // the current state of the SGML2XML automata
629 private int m_currState = 1;
630
631 // the char that was read from the m_modifier @ position m_cursor
632 private char m_currChar = ' ';
633
634 // the fields above are used by the convert method and its auxiliary functions
635 // like doState1...13()
636
637 // indicates the last position of a text character (one which is not a white
638 // space)
639 // it is used in doState1() when we have to decide if an element is empty or
640 // not
641 // We decide that based on this field
642 // If the charPos > 0 then it means that the object from the top of stack
643 // is followed by text and we consider that is not empty
644 private int charPos = 0;
645
646 // is the current tag name
647 private String elemName = null;
648
649 // indicates where in the m_modifier begins the current tag elemName
650 private int elemNameStart = 0;
651
652 // indicates where in the m_modifier ends the current tag elemName
653 // we need that in order to be able to read the current tag name
654 // this name it will be read from m_modifier using the substring() method
655 // it will be something like this :
656 // elemName = m_modifier.substring(elemNameStart,elemNameEnd)
657 // Eg: <w attr1=val1> -> <[elemNameStart]w[elemNameEnd] [attr1=val1>
658 private int elemNameEnd = 0;
659
660 // this is the position there a start tag ends like this:
661 // Eg: <w attr1=val1> -> <w attr1=val1 [closePos]>
662 private int closePos = 0;
663
664 //this is the position where an attribute starts...
665 // we need it when we have to add the defaultAttr (see state 5)
666 private int attrStart = 0;
667
668 //this is the position where an attribute ends...
669 // we need it when we have to add the defaultAttr (see state 5) or to add "
670 // Eg: <w attr1=val1> -> <w [attrStart]attr1[attrEnd]=val1>
671 private int attrEnd = 0;
672
673 // endPair field is used in states 6 and 7....
674 // When we read something like this :
675 // attr=' val1 val2 val3' endPair remembers what is the pair for the beginning
676 // string
677 // Note that a combination like: attr = ' val1 val2 " will have an unexpected
678 // behaviour...
679 // We need this field when we have the following situation
680 // attr1 = " val1 val2 ' val3" . We need to know what is the end pair for ".
681 // In this case we can't allow ' to be the endPair
682 private char endPair = ' ';
683
684 } // class Sgml2Xml
685
/**
 * Describes one start tag read by Sgml2Xml. Instances are kept on the
 * converter's stack while the tag is open and moved to its list of dubious
 * elements when no end tag is found for them.
 */
class CustomObject {

  // the name of the element
  private String elemName;

  // position in the text where the element has to be closed
  private int closePos;

  // tells if the element is considered to be an empty one
  private boolean empty;

  /** Creates an object with no name, position 0 and the empty flag off. */
  public CustomObject() {
    this.elemName = null;
    this.closePos = 0;
    this.empty = false;
  }

  /** @return the name of the element */
  public String getElemName() {
    return this.elemName;
  }

  /** @param anElemName the new name of the element */
  void setElemName(String anElemName) {
    this.elemName = anElemName;
  }

  /** @return the position where the element has to be closed */
  public int getClosePos() {
    return this.closePos;
  }

  /** @param aPos the new position where the element has to be closed */
  void setClosePos(int aPos) {
    this.closePos = aPos;
  }

  /** @return true if the element is considered to be empty */
  public boolean isEmpty() {
    return this.empty;
  }

  /** @param anEmptyValue the new value of the empty flag */
  void setEmpty(boolean anEmptyValue) {
    this.empty = anEmptyValue;
  }

} // CustomObject
732
733 class MyComparator implements Comparator {
734
735 public MyComparator() {
736 }
737
738 public int compare(Object o1, Object o2) {
739 if ( !(o1 instanceof CustomObject) ||
740 !(o2 instanceof CustomObject)) return 0;
741
742 CustomObject co1 = (CustomObject) o1;
743 CustomObject co2 = (CustomObject) o2;
744 int result = 0;
745 if (co1.getClosePos() < co2.getClosePos()) result = -1;
746 if (co1.getClosePos() == co2.getClosePos()) result = 0;
747 if (co1.getClosePos() > co2.getClosePos()) result = 1;
748
749 return -result;
750 } // compare
751
752 }// class MyComparator
|