001 /* Parser.java -- HTML parser
002 Copyright (C) 2005 Free Software Foundation, Inc.
003
004 This file is part of GNU Classpath.
005
006 GNU Classpath is free software; you can redistribute it and/or modify
007 it under the terms of the GNU General Public License as published by
008 the Free Software Foundation; either version 2, or (at your option)
009 any later version.
010
011 GNU Classpath is distributed in the hope that it will be useful, but
012 WITHOUT ANY WARRANTY; without even the implied warranty of
013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014 General Public License for more details.
015
016 You should have received a copy of the GNU General Public License
017 along with GNU Classpath; see the file COPYING. If not, write to the
018 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019 02110-1301 USA.
020
021 Linking this library statically or dynamically with other modules is
022 making a combined work based on this library. Thus, the terms and
023 conditions of the GNU General Public License cover the whole
024 combination.
025
026 As a special exception, the copyright holders of this library give you
027 permission to link this library with independent modules to produce an
028 executable, regardless of the license terms of these independent
029 modules, and to copy and distribute the resulting executable under
030 terms of your choice, provided that you also meet, for each linked
031 independent module, the terms and conditions of the license of that
032 module. An independent module is a module which is not derived from
033 or based on this library. If you modify this library, you may extend
034 this exception to your version of the library, but you are not
035 obligated to do so. If you do not wish to do so, delete this
036 exception statement from your version. */
037
038
039 package javax.swing.text.html.parser;
040
041 import java.io.IOException;
042 import java.io.Reader;
043
044 import javax.swing.text.ChangedCharSetException;
045 import javax.swing.text.SimpleAttributeSet;
046
047 /*
048 * FOR DEVELOPERS: To avoid regression, please run the package test
049 * textsuite/javax.swing.text.html.parser/AllParserTests after your
050 * modifications.
051 */
052
/**
 * <p>A simple error-tolerant HTML parser that uses a DTD document
 * to access data on the possible tokens, arguments and syntax.</p>
 * <p>The parser reads HTML content from a Reader and calls various
 * notifying methods (which should be overridden in a subclass)
 * when tags or data are encountered.</p>
 * <p>Some HTML elements need no opening or closing tags. The
 * task of this parser is to invoke the tag handling methods also when
 * the tags are not explicitly specified and must be supposed using
 * information stored in the DTD.
 * For example, parsing the document</p>
 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt;</p>
 * <p>will invoke the handling methods in exactly the same order
 * (and with the same parameters) as if parsing the document:</p>
 * <p><em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;
 * &lt;tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;
 * &lt;/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
 * <p>(supposed tags are given in italics). The parser also supports
 * obsolete elements of HTML syntax.</p>
 *
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
076 public class Parser
077 implements DTDConstants
078 {
079 /**
080 * The document template description that will be used to parse the documents.
081 */
082 protected DTD dtd;
083
084 /**
085 * The value of this field determines whether or not the Parser will be
086 * strict in enforcing SGML compatibility. The default value is false,
087 * stating that the parser should do everything to parse and get at least
088 * some information even from the incorrectly written HTML input.
089 */
090 protected boolean strict;
091
092 /**
093 * The package level reference to the working HTML parser in this
094 * implementation.
095 */
096 final gnu.javax.swing.text.html.parser.support.Parser gnu;
097
098 /**
099 * Creates a new parser that uses the given DTD to access data on the
100 * possible tokens, arguments and syntax. There is no single - step way
101 * to get a default DTD; you must either refer to the implementation -
102 * specific packages, write your own DTD or obtain the working instance
103 * of parser in other way, for example, by calling
104 * {@link javax.swing.text.html.HTMLEditorKit#getParser() }.
105 * @param a_dtd A DTD to use.
106 */
107 public Parser(DTD a_dtd)
108 {
109 dtd = a_dtd;
110
111 final Parser j = this;
112
113 gnu =
114 new gnu.javax.swing.text.html.parser.support.Parser(dtd)
115 {
116 protected final void handleComment(char[] comment)
117 {
118 j.handleComment(comment);
119 }
120
121 protected final void handleEOFInComment()
122 {
123 j.handleEOFInComment();
124 }
125
126 protected final void handleEmptyTag(TagElement tag)
127 throws javax.swing.text.ChangedCharSetException
128 {
129 j.handleEmptyTag(tag);
130 }
131
132 protected final void handleStartTag(TagElement tag)
133 {
134 j.handleStartTag(tag);
135 }
136
137 protected final void handleEndTag(TagElement tag)
138 {
139 j.handleEndTag(tag);
140 }
141
142 protected final void handleError(int line, String message)
143 {
144 j.handleError(line, message);
145 }
146
147 protected final void handleText(char[] text)
148 {
149 j.handleText(text);
150 }
151
152 protected final void handleTitle(char[] title)
153 {
154 j.handleTitle(title);
155 }
156
157 protected final void markFirstTime(Element element)
158 {
159 j.markFirstTime(element);
160 }
161
162 protected final void startTag(TagElement tag)
163 throws ChangedCharSetException
164 {
165 j.startTag(tag);
166 }
167
168 protected final void endTag(boolean omitted)
169 {
170 j.endTag(omitted);
171 }
172
173 protected TagElement makeTag(Element element)
174 {
175 return j.makeTag(element);
176 }
177
178 protected TagElement makeTag(Element element, boolean isSupposed)
179 {
180 return j.makeTag(element, isSupposed);
181 }
182 };
183 }
184
185 /**
186 * Parse the HTML text, calling various methods in response to the
187 * occurence of the corresponding HTML constructions.
188 * @param reader The reader to read the source HTML from.
189 * @throws IOException If the reader throws one.
190 */
191 public synchronized void parse(Reader reader)
192 throws IOException
193 {
194 gnu.parse(reader);
195 }
196
197 /**
198 * Parses DTD markup declaration. Currently returns without action.
199 * @return null.
200 * @throws java.io.IOException
201 */
202 public String parseDTDMarkup()
203 throws IOException
204 {
205 return gnu.parseDTDMarkup();
206 }
207
208 /**
209 * Parse DTD document declarations. Currently only parses the document
210 * type declaration markup.
211 * @param strBuff
212 * @return true if this is a valid DTD markup declaration.
213 * @throws IOException
214 */
215 protected boolean parseMarkupDeclarations(StringBuffer strBuff)
216 throws IOException
217 {
218 return gnu.parseMarkupDeclarations(strBuff);
219 }
220
221 /**
222 * Get the attributes of the current tag.
223 * @return The attribute set, representing the attributes of the current tag.
224 */
225 protected SimpleAttributeSet getAttributes()
226 {
227 return gnu.getAttributes();
228 }
229
230 /**
231 * Get the number of the document line being parsed.
232 * @return The current line.
233 */
234 protected int getCurrentLine()
235 {
236 return gnu.hTag.where.beginLine;
237 }
238
239 /**
240 * Get the current position in the document being parsed.
241 * @return The current position.
242 */
243 protected int getCurrentPos()
244 {
245 return gnu.hTag.where.startPosition;
246 }
247
248 /**
249 * The method is called when the HTML end (closing) tag is found or if
250 * the parser concludes that the one should be present in the
251 * current position. The method is called immediatly
252 * before calling the handleEndTag().
253 * @param omitted True if the tag is no actually present in the document,
254 * but is supposed by the parser (like </html> at the end of the
255 * document).
256 */
257 protected void endTag(boolean omitted)
258 {
259 // This default implementation does nothing.
260 }
261
262 /**
263 * Invokes the error handler. The default method in this implementation
264 * finally delegates the call to handleError, also providing the number of the
265 * current line.
266 */
267 protected void error(String msg)
268 {
269 gnu.error(msg);
270 }
271
272 /**
273 * Invokes the error handler. The default method in this implementation
274 * finally delegates the call to error (msg+": '"+invalid+"'").
275 */
276 protected void error(String msg, String invalid)
277 {
278 gnu.error(msg, invalid);
279 }
280
281 /**
282 * Invokes the error handler. The default method in this implementation
283 * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3).
284 */
285 protected void error(String parm1, String parm2, String parm3)
286 {
287 gnu.error(parm1, parm2, parm3);
288 }
289
290 /**
291 * Invokes the error handler. The default method in this implementation
292 * finally delegates the call to error
293 * (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
294 */
295 protected void error(String parm1, String parm2, String parm3, String parm4)
296 {
297 gnu.error(parm1, parm2, parm3, parm4);
298 }
299
300 /**
301 * In this implementation, this is never called and returns without action.
302 */
303 protected void flushAttributes()
304 {
305 gnu.flushAttributes();
306 }
307
308 /**
309 * Handle HTML comment. The default method returns without action.
310 * @param comment The comment being handled
311 */
312 protected void handleComment(char[] comment)
313 {
314 // This default implementation does nothing.
315 }
316
317 /**
318 * This is additionally called in when the HTML content terminates
319 * without closing the HTML comment. This can only happen if the
320 * HTML document contains errors (for example, the closing --;gt is
321 * missing. The default method calls the error handler.
322 */
323 protected void handleEOFInComment()
324 {
325 gnu.error("Unclosed comment");
326 }
327
328 /**
329 * Handle the tag with no content, like <br>. The method is
330 * called for the elements that, in accordance with the current DTD,
331 * has an empty content.
332 * @param tag The tag being handled.
333 * @throws javax.swing.text.ChangedCharSetException
334 */
335 protected void handleEmptyTag(TagElement tag)
336 throws ChangedCharSetException
337 {
338 // This default implementation does nothing.
339 }
340
341 /**
342 * The method is called when the HTML closing tag ((like </table>)
343 * is found or if the parser concludes that the one should be present
344 * in the current position.
345 * @param tag The tag being handled
346 */
347 protected void handleEndTag(TagElement tag)
348 {
349 // This default implementation does nothing.
350 }
351
352 /* Handle error that has occured in the given line. */
353 protected void handleError(int line, String message)
354 {
355 // This default implementation does nothing.
356 }
357
358 /**
359 * The method is called when the HTML opening tag ((like <table>)
360 * is found or if the parser concludes that the one should be present
361 * in the current position.
362 * @param tag The tag being handled
363 */
364 protected void handleStartTag(TagElement tag)
365 {
366 // This default implementation does nothing.
367 }
368
369 /**
370 * Handle the text section.
371 * <p> For non-preformatted section, the parser replaces
372 * \t, \r and \n by spaces and then multiple spaces
373 * by a single space. Additionaly, all whitespace around
374 * tags is discarded.
375 * </p>
376 * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
377 * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n,
378 * if it is present. Additionally, it replaces each occurence of \r or \r\n
379 * by a single \n.</p>
380 *
381 * @param text A section text.
382 */
383 protected void handleText(char[] text)
384 {
385 // This default implementation does nothing.
386 }
387
388 /**
389 * Handle HTML <title> tag. This method is invoked when
390 * both title starting and closing tags are already behind.
391 * The passed argument contains the concatenation of all
392 * title text sections.
393 * @param title The title text.
394 */
395 protected void handleTitle(char[] title)
396 {
397 // This default implementation does nothing.
398 }
399
400 /**
401 * Constructs the tag from the given element. In this implementation,
402 * this is defined, but never called.
403 * @param element the base element of the tag.
404 * @return the tag
405 */
406 protected TagElement makeTag(Element element)
407 {
408 return makeTag(element, false);
409 }
410
411 /**
412 * Constructs the tag from the given element.
413 * @param element the tag base {@link javax.swing.text.html.parser.Element}
414 * @param isSupposed true if the tag is not actually present in the
415 * html input, but the parser supposes that it should to occur in
416 * the current location.
417 * @return the tag
418 */
419 protected TagElement makeTag(Element element, boolean isSupposed)
420 {
421 return new TagElement(element, isSupposed);
422 }
423
424 /**
425 * This is called when the tag, representing the given element,
426 * occurs first time in the document.
427 * @param element
428 */
429 protected void markFirstTime(Element element)
430 {
431 // This default implementation does nothing.
432 }
433
434 /**
435 * The method is called when the HTML opening tag ((like <table>)
436 * is found or if the parser concludes that the one should be present
437 * in the current position. The method is called immediately before
438 * calling the handleStartTag.
439 * @param tag The tag
440 */
441 protected void startTag(TagElement tag)
442 throws ChangedCharSetException
443 {
444 // This default implementation does nothing.
445 }
446 }