001    /* Parser.java -- HTML parser
002       Copyright (C) 2005 Free Software Foundation, Inc.
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    
039    package javax.swing.text.html.parser;
040    
041    import java.io.IOException;
042    import java.io.Reader;
043    
044    import javax.swing.text.ChangedCharSetException;
045    import javax.swing.text.SimpleAttributeSet;
046    
047    /*
048     * FOR DEVELOPERS: To avoid regression, please run the package test
049     * textsuite/javax.swing.text.html.parser/AllParserTests after your
050     * modifications.
051     */
052    
053    /**
054     * <p>A simple error-tolerant HTML parser that uses a DTD document
055     * to access data on the possible tokens, arguments and syntax.</p>
056     * <p> The parser reads an HTML content from a Reader and calls various
057     * notifying methods (which should be overridden in a subclass)
058     * when tags or data are encountered.</p>
059     * <p>Some HTML elements need no opening or closing tags. The
060     * task of this parser is to invoke the tag handling methods also when
061     * the tags are not explicitly specified and must be supposed using
062     * information, stored in the DTD.
063     * For example, parsing the document</p>
064     * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
065     * will invoke the handling methods in exactly the same order
066     * (and with the same parameters) as if parsing the document: <br>
067     * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
068     * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
069     * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
070     * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
071     * <p>(supposed tags are given in italics). The parser also supports
072     * obsolete elements of HTML syntax.
073     * </p>
074     * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
075     */
076    public class Parser 
077      implements DTDConstants
078    {
079      /**
080       * The document template description that will be used to parse the documents.
081       */
082      protected DTD dtd;
083    
084      /**
085       * The value of this field determines whether or not the Parser will be
086       * strict in enforcing SGML compatibility. The default value is false,
087       * stating that the parser should do everything to parse and get at least
088       * some information even from the incorrectly written HTML input.
089       */
090      protected boolean strict;
091    
092      /**
093       * The package level reference to the working HTML parser in this
094       * implementation.
095       */
096      final gnu.javax.swing.text.html.parser.support.Parser gnu;
097    
098      /**
099       * Creates a new parser that uses the given DTD to access data on the
100       * possible tokens, arguments and syntax. There is no single - step way
101       * to get a default DTD; you must either refer to the implementation -
102       * specific packages, write your own DTD or obtain the working instance
103       * of parser in other way, for example, by calling
104       * {@link javax.swing.text.html.HTMLEditorKit#getParser() }.
105       * @param a_dtd A DTD to use.
106       */
107      public Parser(DTD a_dtd)
108      {
109        dtd = a_dtd;
110    
111        final Parser j = this;
112    
113        gnu =
114          new gnu.javax.swing.text.html.parser.support.Parser(dtd)
115            {
116              protected final void handleComment(char[] comment)
117              {
118                j.handleComment(comment);
119              }
120    
121              protected final void handleEOFInComment()
122              {
123                j.handleEOFInComment();
124              }
125    
126              protected final void handleEmptyTag(TagElement tag)
127                throws javax.swing.text.ChangedCharSetException
128              {
129                j.handleEmptyTag(tag);
130              }
131    
132              protected final void handleStartTag(TagElement tag)
133              {
134                j.handleStartTag(tag);
135              }
136    
137              protected final void handleEndTag(TagElement tag)
138              {
139                j.handleEndTag(tag);
140              }
141    
142              protected final void handleError(int line, String message)
143              {
144                j.handleError(line, message);
145              }
146    
147              protected final void handleText(char[] text)
148              {
149                j.handleText(text);
150              }
151    
152              protected final void handleTitle(char[] title)
153              {
154                j.handleTitle(title);
155              }
156    
157              protected final void markFirstTime(Element element)
158              {
159                j.markFirstTime(element);
160              }
161    
162              protected final void startTag(TagElement tag)
163                throws ChangedCharSetException
164              {
165                j.startTag(tag);
166              }
167    
168              protected final void endTag(boolean omitted)
169              {
170                j.endTag(omitted);
171              }
172    
173              protected TagElement makeTag(Element element)
174              {
175                return j.makeTag(element);
176              }
177    
178              protected TagElement makeTag(Element element, boolean isSupposed)
179              {
180                return j.makeTag(element, isSupposed);
181              }
182            };
183      }
184    
185      /**
186       * Parse the HTML text, calling various methods in response to the
187       * occurence of the corresponding HTML constructions.
188       * @param reader The reader to read the source HTML from.
189       * @throws IOException If the reader throws one.
190       */
191      public synchronized void parse(Reader reader)
192        throws IOException
193      {
194        gnu.parse(reader);
195      }
196    
197      /**
198       * Parses DTD markup declaration. Currently returns without action.
199       * @return null.
200       * @throws java.io.IOException
201       */
202      public String parseDTDMarkup()
203        throws IOException
204      {
205        return gnu.parseDTDMarkup();
206      }
207    
208      /**
209       * Parse DTD document declarations. Currently only parses the document
210       * type declaration markup.
211       * @param strBuff
212       * @return true if this is a valid DTD markup declaration.
213       * @throws IOException
214       */
215      protected boolean parseMarkupDeclarations(StringBuffer strBuff)
216        throws IOException
217      {
218        return gnu.parseMarkupDeclarations(strBuff);
219      }
220    
221      /**
222       * Get the attributes of the current tag.
223       * @return The attribute set, representing the attributes of the current tag.
224       */
225      protected SimpleAttributeSet getAttributes()
226      {
227        return gnu.getAttributes();
228      }
229    
230      /**
231       * Get the number of the document line being parsed.
232       * @return The current line.
233       */
234      protected int getCurrentLine()
235      {
236        return gnu.hTag.where.beginLine;
237      }
238    
239      /**
240       * Get the current position in the document being parsed.
241       * @return The current position.
242       */
243      protected int getCurrentPos()
244      {
245        return gnu.hTag.where.startPosition;
246      }
247    
248      /**
249       * The method is called when the HTML end (closing) tag is found or if
250       * the parser concludes that the one should be present in the
251       * current position. The method is called immediatly
252       * before calling the handleEndTag().
253       * @param omitted True if the tag is no actually present in the document,
254       * but is supposed by the parser (like &lt;/html&gt; at the end of the
255       * document).
256       */
257      protected void endTag(boolean omitted)
258      {
259        // This default implementation does nothing.
260      }
261    
262      /**
263       * Invokes the error handler. The default method in this implementation
264       * finally delegates the call to handleError, also providing the number of the
265       * current line.
266       */
267      protected void error(String msg)
268      {
269        gnu.error(msg);
270      }
271    
272      /**
273       * Invokes the error handler. The default method in this implementation
274       * finally delegates the call to error (msg+": '"+invalid+"'").
275       */
276      protected void error(String msg, String invalid)
277      {
278        gnu.error(msg, invalid);
279      }
280    
281      /**
282       * Invokes the error handler. The default method in this implementation
283       * finally delegates the call to error (parm1+" "+ parm2+" "+ parm3).
284       */
285      protected void error(String parm1, String parm2, String parm3)
286      {
287        gnu.error(parm1, parm2, parm3);
288      }
289    
290      /**
291       * Invokes the error handler. The default method in this implementation
292       * finally delegates the call to error
293       * (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
294       */
295      protected void error(String parm1, String parm2, String parm3, String parm4)
296      {
297        gnu.error(parm1, parm2, parm3, parm4);
298      }
299    
300      /**
301       * In this implementation, this is never called and returns without action.
302       */
303      protected void flushAttributes()
304      {
305        gnu.flushAttributes();
306      }
307    
308      /**
309       * Handle HTML comment. The default method returns without action.
310       * @param comment The comment being handled
311       */
312      protected void handleComment(char[] comment)
313      {
314        // This default implementation does nothing.
315      }
316    
317      /**
318       * This is additionally called in when the HTML content terminates
319       * without closing the HTML comment. This can only happen if the
320       * HTML document contains errors (for example, the closing --;gt is
321       * missing. The default method calls the error handler.
322       */
323      protected void handleEOFInComment()
324      {
325        gnu.error("Unclosed comment");
326      }
327    
328      /**
329       * Handle the tag with no content, like &lt;br&gt;. The method is
330       * called for the elements that, in accordance with the current DTD,
331       * has an empty content.
332       * @param tag The tag being handled.
333       * @throws javax.swing.text.ChangedCharSetException
334       */
335      protected void handleEmptyTag(TagElement tag)
336        throws ChangedCharSetException
337      {
338        // This default implementation does nothing.
339      }
340    
341      /**
342       * The method is called when the HTML closing tag ((like &lt;/table&gt;)
343       * is found or if the parser concludes that the one should be present
344       * in the current position.
345       * @param tag The tag being handled
346       */
347      protected void handleEndTag(TagElement tag)
348      {
349        // This default implementation does nothing.
350      }
351    
352      /* Handle error that has occured in the given line. */
353      protected void handleError(int line, String message)
354      {
355        // This default implementation does nothing.
356      }
357    
358      /**
359       * The method is called when the HTML opening tag ((like &lt;table&gt;)
360       * is found or if the parser concludes that the one should be present
361       * in the current position.
362       * @param tag The tag being handled
363       */
364      protected void handleStartTag(TagElement tag)
365      {
366        // This default implementation does nothing.
367      }
368    
369      /**
370       * Handle the text section.
371       * <p> For non-preformatted section, the parser replaces
372       * \t, \r and \n by spaces and then multiple spaces
373       * by a single space. Additionaly, all whitespace around
374       * tags is discarded.
375       * </p>
376       * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
377       * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
378       * if it is present. Additionally, it replaces each occurence of \r or \r\n
379       * by a single \n.</p>
380       *
381       * @param text A section text.
382       */
383      protected void handleText(char[] text)
384      {
385        // This default implementation does nothing.
386      }
387    
388      /**
389       * Handle HTML &lt;title&gt; tag. This method is invoked when
390       * both title starting and closing tags are already behind.
391       * The passed argument contains the concatenation of all
392       * title text sections.
393       * @param title The title text.
394       */
395      protected void handleTitle(char[] title)
396      {
397        // This default implementation does nothing.
398      }
399    
400      /**
401       * Constructs the tag from the given element. In this implementation,
402       * this is defined, but never called.
403       * @param element the base element of the tag.
404       * @return the tag
405       */
406      protected TagElement makeTag(Element element)
407      {
408        return makeTag(element, false);
409      }
410    
411      /**
412       * Constructs the tag from the given element.
413       * @param element the tag base {@link javax.swing.text.html.parser.Element}
414       * @param isSupposed true if the tag is not actually present in the
415       * html input, but the parser supposes that it should to occur in
416       * the current location.
417       * @return the tag
418       */
419      protected TagElement makeTag(Element element, boolean isSupposed)
420      {
421        return new TagElement(element, isSupposed);
422      }
423    
424      /**
425       * This is called when the tag, representing the given element,
426       * occurs first time in the document.
427       * @param element
428       */
429      protected void markFirstTime(Element element)
430      {
431        // This default implementation does nothing.
432      }
433    
434      /**
435       * The method is called when the HTML opening tag ((like &lt;table&gt;)
436       * is found or if the parser concludes that the one should be present
437       * in the current position. The method is called immediately before
438       * calling the handleStartTag.
439       * @param tag The tag
440       */
441      protected void startTag(TagElement tag)
442        throws ChangedCharSetException
443      {
444        // This default implementation does nothing.
445      }
446    }