001 /*
002 * HtmlLinksExtractor.java
003 *
004 * Copyright (c) 1995-2010, The University of Sheffield. See the file
005 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
006 *
007 * This file is part of GATE (see http://gate.ac.uk/), and is free
008 * software, licenced under the GNU Library General Public License,
009 * Version 2, June 1991 (in the distribution as file licence.html,
010 * and also available at http://gate.ac.uk/gate/licence.html).
011 *
012 * Cristian URSU, 16/Nov/2001
013 *
014 * $Id: HtmlLinksExtractor.java 12006 2009-12-01 17:24:28Z thomas_heitz $
015 */
016
017 package gate.util;
018
019 import java.io.*;
020 import java.util.*;
021
022 import javax.swing.text.BadLocationException;
023 import javax.swing.text.MutableAttributeSet;
024 import javax.swing.text.html.HTML;
025 import javax.swing.text.html.HTMLEditorKit;
026 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
027 import javax.swing.text.html.parser.ParserDelegator;
028
029 /**
030 * This class extracts links from HTML files.
031 * <B>It has been hacked</B> to build the contents of
032 * <A HREF="http://gate.ac.uk/sitemap.html">http://gate.ac.uk/sitemap.html</A>;
033 * you <B>probably don't want to use it</B> for anything else!
034 * <P>
035 * Implements the behaviour of the HTML reader.
036 * Methods of an object of this class are called by the HTML parser when
037 * events will appear.
038 */
039 public class HtmlLinksExtractor extends ParserCallback {
040
041 /** Debug flag */
042 private static final boolean DEBUG = false;
043
044 /** The tag currently being processed */
045 private HTML.Tag currentTag = null;
046
047 /** whether we've done a title before */
048 static boolean firstTitle = true;
049
050 /** will contain </UL> after first title */
051 static String endUl = "";
052
053 /** Name of the file we're currently processing */
054 static String currFile = "";
055
056 /** Path to the file we're currently processing */
057 static String currPath = "";
058
059 /** This method is called when the HTML parser encounts the beginning
060 * of a tag that means that the tag is paired by an end tag and it's
061 * not an empty one.
062 */
063 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
064
065 currentTag = t;
066 if (HTML.Tag.A == t){
067 Out.pr("<LI><" + t);
068 String href = "";
069 Enumeration e = a.getAttributeNames();
070 while(e.hasMoreElements()) {
071 HTML.Attribute name = (HTML.Attribute) e.nextElement();
072 String value = (String) a.getAttribute(name);
073
074 if(name == HTML.Attribute.HREF) {
075 if(
076 value.startsWith("http:") || value.startsWith("HTTP:") ||
077 value.startsWith("file:") || value.startsWith("FILE:") ||
078 value.startsWith("mailto:") || value.startsWith("MAILTO:") ||
079 value.startsWith("ftp:") || value.startsWith("FTP:")
080 )
081 Out.pr(" HREF=\"" + value + "\"");
082 else { // if it is a relative path....
083 Out.pr(" HREF=\"" + currPath + "/" + value + "\"");
084 }
085 }
086 } // while
087
088 Out.pr(">");
089 }// End if
090
091 if (HTML.Tag.TITLE == t){
092 Out.pr(endUl + "<H3>");
093 if(firstTitle) { firstTitle = false; endUl = "</UL>"; }
094 }// End if
095
096 }//handleStartTag
097
098 private void printAttributes(MutableAttributeSet a){
099 if (a == null) return;
100 // Take all the attributes an put them into the feature map
101 if (0 != a.getAttributeCount()){
102 Enumeration enumeration = a.getAttributeNames();
103 while (enumeration.hasMoreElements()){
104 Object attribute = enumeration.nextElement();
105 Out.pr(" "+ attribute.toString() + "=\"" +
106 a.getAttribute(attribute).toString()+"\"");
107 }// End while
108 }// End if
109 }// printAttributes();
110
111 /** This method is called when the HTML parser encounts the end of a tag
112 * that means that the tag is paired by a beginning tag
113 */
114 public void handleEndTag(HTML.Tag t, int pos){
115 currentTag = null;
116
117 if (HTML.Tag.A == t)
118 Out.pr("</"+t+">\n");
119 if (HTML.Tag.TITLE == t)
120 Out.pr(
121 "</H3></A>\n\n<P>Links in: <A HREF=\"" + currFile +
122 "\">" + currFile + "</A>:\n<UL>\n"
123 );
124
125 }//handleEndTag
126
127 /** This method is called when the HTML parser encounts an empty tag
128 */
129 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
130 if (HTML.Tag.A == t){
131 Out.pr("<"+t);
132 printAttributes(a);
133 Out.pr("/>\n");
134 }// End if
135
136 if (HTML.Tag.TITLE == t){
137 Out.pr("<"+t);
138 printAttributes(a);
139 Out.pr("/>\n");
140 }// End if
141 } // handleSimpleTag
142
143 /** This method is called when the HTML parser encounts text (PCDATA)*/
144 public void handleText(char[] text, int pos){
145
146 if(HTML.Tag.A == currentTag){
147 //text of tag A
148 String tagText = new String(text);
149 Out.pr(tagText);
150 }// End if
151
152 if(HTML.Tag.TITLE == currentTag){
153 //text of tag A
154 String tagText = new String(text);
155 Out.pr(tagText);
156 }// End if
157
158 }// end handleText();
159
160 /**
161 * This method is called when the HTML parser encounts an error
162 * it depends on the programmer if he wants to deal with that error
163 */
164 public void handleError(String errorMsg, int pos) {
165 //Out.println ("ERROR CALLED : " + errorMsg);
166 }
167
168 /** This method is called once, when the HTML parser reaches the end
169 * of its input streamin order to notify the parserCallback that there
170 * is nothing more to parse.
171 */
172 public void flush() throws BadLocationException{
173 }// flush
174
175 /** This method is called when the HTML parser encounts a comment
176 */
177 public void handleComment(char[] text, int pos) {
178 }
179
180 /**
181 * Given a certain folder it lists recursively all the files contained
182 * in that folder. It returns a list of strings representing the file
183 * names
184 */
185 private static List listAllFiles(File aFile, Set foldersToIgnore){
186 java.util.List sgmlFileNames = new ArrayList();
187 java.util.List foldersToExplore = new ArrayList();
188 if (!aFile.isDirectory()){
189 // add the file to the file list
190 sgmlFileNames.add(aFile.getPath());
191 return sgmlFileNames;
192 }// End if
193 listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore);
194 return sgmlFileNames;
195 } // listAllFiles();
196
197 /** Helper method for listAllFiles */
198 private static void listFilesRec(File aFile,
199 java.util.List fileNames,
200 java.util.List foldersToExplore,
201 Set foldersToIgnore){
202
203 String[] fileList = aFile.list();
204 for (int i=0; i< fileList.length; i++){
205 File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]);
206 if (tmpFile.isDirectory()){
207 // If the file is not included
208 if (!foldersToIgnore.contains(tmpFile.getName())) { //fileList[i])) {
209 if(DEBUG) {
210 Err.prln("adding dir: " + tmpFile);
211 Err.prln(" name: " + tmpFile.getName());
212 }
213 foldersToExplore.add(tmpFile);
214 }
215 }else{
216 // only process .html files
217 if(
218 ( fileList[i].toLowerCase().endsWith(".html") ) ||
219 ( fileList[i].toLowerCase().endsWith(".htm") )
220 ) fileNames.add(tmpFile.getPath());
221 }// End if
222 }// End for
223
224 while(!foldersToExplore.isEmpty()){
225 File folder = (File)foldersToExplore.get(0);
226 foldersToExplore.remove(0);
227 listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore);
228 }//End while
229
230 } // listFilesRec();
231
232 /** Extract links from all .html files below a directory */
233 public static void main(String[] args){
234 HTMLEditorKit.Parser parser = new ParserDelegator();
235 // create a new Htmldocument handler
236 HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor();
237
238 if (args.length == 0){
239 Out.prln(
240 "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt"
241 );
242 return;
243 }
244 // Create a folder file File
245 File htmlFolder = new File(args[0]);
246 Set foldersToIgnore = new HashSet();
247 for(int i = 1; i<args.length; i++)
248 foldersToIgnore.add(args[i]);
249
250 List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore);
251 //Collections.sort(htmlFileNames);
252 while (!htmlFileNames.isEmpty()){
253 try{
254 String htmlFileName = (String) htmlFileNames.get(0);
255 currFile = htmlFileName;
256 currPath = new File(currFile).getParent().toString();
257 htmlFileNames.remove(0);
258
259 Out.prln("\n\n<A HREF=\"file://" + htmlFileName + "\">");
260 Reader reader = new FileReader(htmlFileName);
261 // parse the HTML document
262 parser.parse(reader, htmlDocHandler, true);
263 } catch (IOException e){
264 e.printStackTrace(System.out);
265 }// End try
266 }// End while
267 System.err.println("done.");
268 }// main
269
270 }//End class HtmlLinksExtractor
271
272
|