/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

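/**
 * Mapping of Windows-1252 codepoints 0x80-0x9F to UCS4.
 * Indexed by (codepoint - 0x80); 0xFFFD marks codepoints with no
 * defined mapping.
 */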
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};

/** UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/** Line feed, as a raw byte and as a hubbub_string */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states, following the HTML5 tokenisation algorithm
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */

	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start
						 * tag emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;			/**< Bytes of tag name matched
						 * so far */
		bool match;			/**< Whether the name matched */
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length when
						 * matching named entities */
		uint8_t base;			/**< Base for numeric
						 * entities */
		int32_t context;		/**< Context for named entity
						 * search */
		size_t prev_len;		/**< Previous byte length of
						 * str */
		bool had_data;			/**< Whether we read anything
						 * after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content model */

	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA
					 * sections */
	bool paused;			/**< Whether parsing is paused */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */
	parserutils_buffer *insert_buf;	/**< Stream insertion buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */
};

static hubbub_error hubbub_tokeniser_handle_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = malloc(sizeof(hubbub_tokeniser));
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(&tok->buffer);
	if (perror != PARSERUTILS_OK) {
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	perror = parserutils_buffer_create(&tok->insert_buf);
	if (perror != PARSERUTILS_OK) {
		parserutils_buffer_destroy(tok->buffer);
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->paused = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
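
/*
 * Illustrative sketch only (not part of Hubbub): one way a caller might
 * drive this tokeniser. example_token_handler and example_drive are
 * hypothetical names; the hubbub/parserutils calls are those used in this
 * file, and feeding NULL/0 to parserutils_inputstream_append to signal
 * end-of-input follows libparserutils' convention.
 */
#if 0
static hubbub_error example_token_handler(const hubbub_token *token, void *pw)
{
	(void) token; (void) pw;
	/* Inspect token->type and token->data here */
	return HUBBUB_OK;
}

static hubbub_error example_drive(parserutils_inputstream *stream,
		const uint8_t *html, size_t len)
{
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;
	hubbub_error err;

	err = hubbub_tokeniser_create(stream, &tok);
	if (err != HUBBUB_OK)
		return err;

	params.token_handler.handler = example_token_handler;
	params.token_handler.pw = NULL;
	err = hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
			&params);

	if (err == HUBBUB_OK) {
		/* Feed a chunk, tokenise it, then signal EOF and drain */
		parserutils_inputstream_append(stream, html, len);
		err = hubbub_tokeniser_run(tok);

		parserutils_inputstream_append(stream, NULL, 0);
		if (err == HUBBUB_OK)
			err = hubbub_tokeniser_run(tok);
	}

	hubbub_tokeniser_destroy(tok);
	return err;
}
#endif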

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		free(tokeniser->context.current_tag.attributes);
	}

	parserutils_buffer_destroy(tokeniser->insert_buf);

	parserutils_buffer_destroy(tokeniser->buffer);

	free(tokeniser);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	hubbub_error err = HUBBUB_OK;

	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	case HUBBUB_TOKENISER_PAUSE:
		if (params->pause_parse == true) {
			tokeniser->paused = true;
		} else {
			if (tokeniser->paused == true) {
				tokeniser->paused = false;
				err = hubbub_tokeniser_run(tokeniser);
			}
		}
	}

	return err;
}

/**
 * Insert a chunk of data into the input stream
 *
 * \param tokeniser  Tokeniser instance
 * \param data       Data to insert (UTF-8 encoded)
 * \param len        Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;

	if (tokeniser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	perror = parserutils_buffer_append(tokeniser->insert_buf, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->paused == true)
		return HUBBUB_PAUSED;

#if 0
#define state(x) \
	case x: \
		printf( #x "\n");
#else
#define state(x) \
	case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}


/**
 * Various macros for manipulating buffers
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
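
/*
 * The three macros above differ only in their preconditions: START_BUF
 * begins a new string (it sets str.len), COLLECT extends a string that
 * must already contain data (hence the assert), and COLLECT_MS extends a
 * string that may still be empty, which is why the attribute-value and
 * doctype-identifier states use it.
 */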


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over the CR */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
					error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}
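
/*
 * Newline and NUL handling in the data state mirrors HTML5 input
 * preprocessing: a NUL byte is emitted as U+FFFD, a bare CR is emitted
 * as LF, and a CR followed by LF is simply skipped so that the LF is
 * collected on the next iteration.
 */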

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			/* Return to data state with '<>' in "chars" */
			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}
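
/*
 * Note on the case handling used throughout: clearing bit 0x20 maps ASCII
 * 'a'-'z' onto 'A'-'Z', so (x & ~0x20) == (y & ~0x20) compares two
 * known-alphabetic bytes case-insensitively, and (c + 0x20) is the
 * corresponding lowercasing applied when collecting tag names.
 */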

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = realloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute));
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = realloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute));
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}
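
/*
 * Note on the character reference handlers above: on first entry
 * match_entity.complete is false, so the handler tail-calls
 * hubbub_tokeniser_consume_character_reference(); once that sets
 * match_entity.complete, the handler runs again, emits or collects the
 * decoded codepoint (or the literal '&' when nothing matched), resets
 * the flag and returns to the previous state.
 */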

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be empty on first entry */
hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '\0') {
		error = parserutils_buffer_append(tokeniser->buffer,
				u_fffd, sizeof(u_fffd));
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			error = parserutils_buffer_append(tokeniser->buffer,
					&lf, sizeof(lf));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}
		tokeniser->context.pending += len;
	} else {
		error = parserutils_buffer_append(tokeniser->buffer,
				(uint8_t *) cptr, len);
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 0);

	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '-') {
		tokeniser->context.pending = len;
		tokeniser->state = STATE_MATCH_COMMENT;
	} else if ((c & ~0x20) == 'D') {
		tokeniser->context.pending = len;
		tokeniser->context.match_doctype.count = len;
		tokeniser->state = STATE_MATCH_DOCTYPE;
	} else if (tokeniser->process_cdata_section == true && c == '[') {
		tokeniser->context.pending = len;
		tokeniser->context.match_cdata.count = len;
		tokeniser->state = STATE_MATCH_CDATA;
	} else {
		tokeniser->state = STATE_BOGUS_COMMENT;
	}

	return HUBBUB_OK;
}


hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.pending =
				tokeniser->context.current_comment.len = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;

	if (*cptr == '-') {
		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
		tokeniser->state = STATE_COMMENT_START;
	} else {
		tokeniser->state = STATE_BOGUS_COMMENT;
	}

	return HUBBUB_OK;
}


hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
			tokeniser->state == STATE_COMMENT_START ||
			tokeniser->state == STATE_COMMENT_END)) {
		tokeniser->context.pending += len;

		/** \todo parse error if state != STATE_COMMENT_END */
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '-') {
		if (tokeniser->state == STATE_COMMENT_START) {
			tokeniser->state = STATE_COMMENT_START_DASH;
		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT) {
			tokeniser->state = STATE_COMMENT_END_DASH;
		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
	} else {
		if (tokeniser->state == STATE_COMMENT_START_DASH ||
				tokeniser->state == STATE_COMMENT_END_DASH) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "--", SLEN("--"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		if (c == '\0') {
			error = parserutils_buffer_append(tokeniser->buffer,
					u_fffd, sizeof(u_fffd));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (c == '\r') {
			size_t next_len;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&next_len);
			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF && *cptr != '\n') {
				error = parserutils_buffer_append(
						tokeniser->buffer,
						&lf, sizeof(lf));
				if (error != PARSERUTILS_OK) {
					return hubbub_error_from_parserutils_error(
							error);
				}
			}
		} else {
			error = parserutils_buffer_append(tokeniser->buffer,
					cptr, len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_COMMENT;
	}

	return HUBBUB_OK;
}
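
/*
 * The five comment states (comment start, comment start dash, comment,
 * comment end dash, comment end) share the handler above: dashes are not
 * copied into the buffer until the next character shows they are not part
 * of the closing "-->", which is why single and double '-' are appended
 * retrospectively in the default branch.
 */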



#define DOCTYPE "DOCTYPE"
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)

hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.match_doctype.count, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);

	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
		/* Skip over the DOCTYPE bit */
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);

		memset(&tokeniser->context.current_doctype, 0,
				sizeof tokeniser->context.current_doctype);
		tokeniser->context.current_doctype.public_missing = true;
		tokeniser->context.current_doctype.system_missing = true;
		tokeniser->context.pending = 0;

		tokeniser->state = STATE_DOCTYPE;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef DOCTYPE
#undef DOCTYPE_LEN

hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	}

	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			/* Emit current doctype, force-quirks on */
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		if (c == '\0') {
			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = c + 0x20;

			START_BUF(cdoc->name, &lc, len);
		} else {
			START_BUF(cdoc->name, cptr, len);
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_DOCTYPE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if (c == '\0') {
		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = c + 0x20;
		COLLECT(cdoc->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(cdoc->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if ((c & ~0x20) == 'P') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_PUBLIC;
	} else if ((c & ~0x20) == 'S') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_SYSTEM;
	} else {
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		tokeniser->context.current_doctype.force_quirks = true;
	}

	return HUBBUB_OK;
}

#define PUBLIC "PUBLIC"
#define PUBLIC_LEN (SLEN(PUBLIC) - 1)

hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);

	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef PUBLIC
#undef PUBLIC_LEN

hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
	} else if (c == '\'') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);

		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}


hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
	} else if (c == '\'') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}
2463 
2464 
2465 
2466 #define SYSTEM "SYSTEM"
2467 #define SYSTEM_LEN (SLEN(SYSTEM) - 1)
2468 
2469 static hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
2470 {
2471  size_t len;
2472  const uint8_t *cptr;
2473  parserutils_error error;
2474  uint8_t c;
2475 
2476  error = parserutils_inputstream_peek(tokeniser->input,
2477  tokeniser->context.pending, &cptr, &len);
2478 
2479  if (error != PARSERUTILS_OK){
2480  if (error == PARSERUTILS_EOF) {
2481  tokeniser->context.current_doctype.force_quirks = true;
2482  tokeniser->state = STATE_BOGUS_DOCTYPE;
2483  return HUBBUB_OK;
2484  } else {
2485  return hubbub_error_from_parserutils_error(error);
2486  }
2487  }
2488 
2489  c = *cptr;
2490 
2491  assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
2492 
2493  if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2494  tokeniser->context.current_doctype.force_quirks = true;
2495  tokeniser->state = STATE_BOGUS_DOCTYPE;
2496  return HUBBUB_OK;
2497  }
2498 
2499  tokeniser->context.pending += len;
2500 
2501  if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
2502  tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
2503  }
2504 
2505  tokeniser->context.match_doctype.count++;
2506 
2507  return HUBBUB_OK;
2508 }
2509 
2510 #undef SYSTEM
2511 #undef SYSTEM_LEN
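
The comparison above uses (c & ~0x20) to fold ASCII case, which is only safe because "SYSTEM" (and "PUBLIC", matched the same way earlier in this file) consist solely of uppercase letters. A standalone sketch of the same one-byte-per-call matching, assuming an uppercase ASCII keyword (the helper name is ours):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Match one input byte against keyword[*count], case-insensitively.
 * Returns false on mismatch; sets *done once the keyword is complete. */
static bool match_keyword_step(const char *keyword, size_t keyword_len,
		size_t *count, uint8_t c, bool *done)
{
	if (keyword[*count] != (char) (c & ~0x20))
		return false;

	*count += 1;
	*done = (*count == keyword_len);

	return true;
}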
2512 
2513 static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
2514  hubbub_tokeniser *tokeniser)
2515 {
2516  hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2517  size_t len;
2518  const uint8_t *cptr;
2519  parserutils_error error;
2520  uint8_t c;
2521 
2522  error = parserutils_inputstream_peek(tokeniser->input,
2523  tokeniser->context.pending, &cptr, &len);
2524 
2525  if (error != PARSERUTILS_OK) {
2526  if (error == PARSERUTILS_EOF) {
2527  tokeniser->state = STATE_DATA;
2528  return emit_current_doctype(tokeniser, true);
2529  } else {
2530  return hubbub_error_from_parserutils_error(error);
2531  }
2532  }
2533 
2534  c = *cptr;
2535  tokeniser->context.pending += len;
2536 
2537  if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2538  /* pass over */
2539  } else if (c == '"') {
2540  cdoc->system_missing = false;
2541  cdoc->system_id.len = 0;
2542 
2543  tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2544  } else if (c == '\'') {
2545  cdoc->system_missing = false;
2546  cdoc->system_id.len = 0;
2547 
2548  tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2549  } else if (c == '>') {
2550  tokeniser->state = STATE_DATA;
2551  return emit_current_doctype(tokeniser, true);
2552  } else {
2553  cdoc->force_quirks = true;
2554  tokeniser->state = STATE_BOGUS_DOCTYPE;
2555  }
2556 
2557  return HUBBUB_OK;
2558 }
2559 
2560 static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
2561  hubbub_tokeniser *tokeniser)
2562 {
2563  hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2564  size_t len;
2565  const uint8_t *cptr;
2566  parserutils_error error;
2567  uint8_t c;
2568 
2569  error = parserutils_inputstream_peek(tokeniser->input,
2570  tokeniser->context.pending, &cptr, &len);
2571 
2572  if (error != PARSERUTILS_OK) {
2573  if (error == PARSERUTILS_EOF) {
2574  tokeniser->state = STATE_DATA;
2575  return emit_current_doctype(tokeniser, true);
2576  } else {
2577  return hubbub_error_from_parserutils_error(error);
2578  }
2579  }
2580 
2581  c = *cptr;
2582 
2583  if (c == '"') {
2584  tokeniser->context.pending += len;
2585  tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2586  } else if (c == '>') {
2587  tokeniser->context.pending += len;
2588  tokeniser->state = STATE_DATA;
2589  return emit_current_doctype(tokeniser, true);
2590  } else if (c == '\0') {
2591  COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2592  tokeniser->context.pending += len;
2593  } else if (c == '\r') {
2594  error = parserutils_inputstream_peek(
2595  tokeniser->input,
2596  tokeniser->context.pending + len,
2597  &cptr,
2598  &len);
2599 
2600  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2601  return hubbub_error_from_parserutils_error(error);
2602  } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2603  COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2604  }
2605 
2606  /* Collect '\r' */
2607  tokeniser->context.pending += 1;
2608  } else {
2609  COLLECT_MS(cdoc->system_id, cptr, len);
2610  tokeniser->context.pending += len;
2611  }
2612 
2613  return HUBBUB_OK;
2614 }
2615 
2616 static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
2617  hubbub_tokeniser *tokeniser)
2618 {
2619  hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2620  size_t len;
2621  const uint8_t *cptr;
2622  parserutils_error error;
2623  uint8_t c;
2624 
2625  error = parserutils_inputstream_peek(tokeniser->input,
2626  tokeniser->context.pending, &cptr, &len);
2627 
2628  if (error != PARSERUTILS_OK) {
2629  if (error == PARSERUTILS_EOF) {
2630  tokeniser->state = STATE_DATA;
2631  return emit_current_doctype(tokeniser, true);
2632  } else {
2633  return hubbub_error_from_parserutils_error(error);
2634  }
2635  }
2636 
2637  c = *cptr;
2638 
2639  if (c == '\'') {
2640  tokeniser->context.pending += len;
2641  tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2642  } else if (c == '>') {
2643  tokeniser->context.pending += len;
2644  tokeniser->state = STATE_DATA;
2645  return emit_current_doctype(tokeniser, true);
2646  } else if (c == '\0') {
2647  COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2648  tokeniser->context.pending += len;
2649  } else if (c == '\r') {
2650  error = parserutils_inputstream_peek(
2651  tokeniser->input,
2652  tokeniser->context.pending + len,
2653  &cptr,
2654  &len);
2655 
2656  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2657  return hubbub_error_from_parserutils_error(error);
2658  } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2659  COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2660  }
2661 
2662  /* Collect '\r' */
2663  tokeniser->context.pending += 1;
2664  } else {
2665  COLLECT_MS(cdoc->system_id, cptr, len);
2666  tokeniser->context.pending += len;
2667  }
2668 
2669  return HUBBUB_OK;
2670 }
2671 
2672 static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
2673  hubbub_tokeniser *tokeniser)
2674 {
2675  size_t len;
2676  const uint8_t *cptr;
2677  parserutils_error error;
2678  uint8_t c;
2679 
2680  error = parserutils_inputstream_peek(tokeniser->input,
2681  tokeniser->context.pending, &cptr, &len);
2682 
2683  if (error != PARSERUTILS_OK) {
2684  if (error == PARSERUTILS_EOF) {
2685  tokeniser->state = STATE_DATA;
2686  return emit_current_doctype(tokeniser, true);
2687  } else {
2688  return hubbub_error_from_parserutils_error(error);
2689  }
2690  }
2691 
2692  c = *cptr;
2693  tokeniser->context.pending += len;
2694 
2695  if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2696  /* pass over in silence */
2697  } else if (c == '>') {
2698  tokeniser->state = STATE_DATA;
2699  return emit_current_doctype(tokeniser, false);
2700  } else {
2701  tokeniser->state = STATE_BOGUS_DOCTYPE;
2702  }
2703 
2704  return HUBBUB_OK;
2705 }
2706 
2707 
2708 static hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
2709 {
2710  size_t len;
2711  const uint8_t *cptr;
2712  parserutils_error error;
2713  uint8_t c;
2714 
2715  error = parserutils_inputstream_peek(tokeniser->input,
2716  tokeniser->context.pending, &cptr, &len);
2717 
2718  if (error != PARSERUTILS_OK) {
2719  if (error == PARSERUTILS_EOF) {
2720  tokeniser->state = STATE_DATA;
2721  return emit_current_doctype(tokeniser, false);
2722  } else {
2723  return hubbub_error_from_parserutils_error(error);
2724  }
2725  }
2726 
2727  c = *cptr;
2728  tokeniser->context.pending += len;
2729 
2730  if (c == '>') {
2731  tokeniser->state = STATE_DATA;
2732  return emit_current_doctype(tokeniser, false);
2733  }
2734 
2735  return HUBBUB_OK;
2736 }
2737 
2738 
2739 
2740 #define CDATA "[CDATA["
2741 #define CDATA_LEN (SLEN(CDATA) - 1)
2742 
2743 static hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
2744 {
2745  size_t len;
2746  const uint8_t *cptr;
2747  parserutils_error error;
2748  uint8_t c;
2749 
2750  error = parserutils_inputstream_peek(tokeniser->input,
2751  tokeniser->context.pending, &cptr, &len);
2752 
2753  if (error != PARSERUTILS_OK) {
2754  if (error == PARSERUTILS_EOF) {
2755  tokeniser->context.current_comment.len =
2756  tokeniser->context.pending = 0;
2757  tokeniser->state = STATE_BOGUS_COMMENT;
2758  return HUBBUB_OK;
2759  } else {
2760  return hubbub_error_from_parserutils_error(error);
2761  }
2762  }
2763 
2764  c = *cptr;
2765 
2766  assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
2767 
2768  if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
2769  tokeniser->context.current_comment.len =
2770  tokeniser->context.pending =
2771  0;
2772  tokeniser->state = STATE_BOGUS_COMMENT;
2773  return HUBBUB_OK;
2774  }
2775 
2776  tokeniser->context.pending += len;
2777 
2778  if (tokeniser->context.match_cdata.count == CDATA_LEN) {
2779  parserutils_inputstream_advance(tokeniser->input,
2780  tokeniser->context.match_cdata.count + len);
2781  tokeniser->context.pending = 0;
2782  tokeniser->context.match_cdata.end = 0;
2783  tokeniser->state = STATE_CDATA_BLOCK;
2784  }
2785 
2786  tokeniser->context.match_cdata.count += len;
2787 
2788  return HUBBUB_OK;
2789 }
2790 
2791 #undef CDATA
2792 #undef CDATA_LEN
2793 
2794 
2795 static hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
2796 {
2797  size_t len;
2798  const uint8_t *cptr;
2799  parserutils_error error;
2800  uint8_t c;
2801 
2802  error = parserutils_inputstream_peek(tokeniser->input,
2803  tokeniser->context.pending, &cptr, &len);
2804 
2805  if (error != PARSERUTILS_OK) {
2806  if (error == PARSERUTILS_EOF) {
2807  tokeniser->state = STATE_DATA;
2808  return emit_current_chars(tokeniser);
2809  } else {
2810  return hubbub_error_from_parserutils_error(error);
2811  }
2812  }
2813 
2814  c = *cptr;
2815 
2816  if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
2817  tokeniser->context.match_cdata.end == 1)) {
2818  tokeniser->context.pending += len;
2819  tokeniser->context.match_cdata.end += len;
2820  } else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
2821  /* Remove the two ']' characters collected previously */
2822  tokeniser->context.pending -= 2;
2823 
2824  /* Emit any pending characters */
2825  emit_current_chars(tokeniser);
2826 
2827  /* Now move past the "]]>" bit */
2828  parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
2829 
2830  tokeniser->state = STATE_DATA;
2831  } else if (c == '\0') {
2832  if (tokeniser->context.pending > 0) {
2833  /* Emit any pending characters */
2834  emit_current_chars(tokeniser);
2835  }
2836 
2837  /* Perform NUL-byte replacement */
2838  emit_character_token(tokeniser, &u_fffd_str);
2839 
2840  parserutils_inputstream_advance(tokeniser->input, len);
2841  tokeniser->context.match_cdata.end = 0;
2842  } else if (c == '\r') {
2843  error = parserutils_inputstream_peek(
2844  tokeniser->input,
2845  tokeniser->context.pending + len,
2846  &cptr,
2847  &len);
2848 
2849  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2850  return hubbub_error_from_parserutils_error(error);
2851  }
2852 
2853  if (tokeniser->context.pending > 0) {
2854  /* Emit any pending characters */
2855  emit_current_chars(tokeniser);
2856  }
2857 
2858  if (error == PARSERUTILS_EOF || *cptr != '\n') {
2859  /* Emit newline */
2860  emit_character_token(tokeniser, &lf_str);
2861  }
2862 
2863  /* Advance over \r */
2864  parserutils_inputstream_advance(tokeniser->input, 1);
2865  tokeniser->context.match_cdata.end = 0;
2866  } else {
2867  tokeniser->context.pending += len;
2868  tokeniser->context.match_cdata.end = 0;
2869  }
2870 
2871  return HUBBUB_OK;
2872 }
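
The match_cdata.end counter remembers how many trailing ']' characters have been seen, so the "]]>" terminator is recognised even when it arrives split across input chunks. A standalone sketch of the same rolling counter over a plain buffer (the helper name is ours):

#include <stddef.h>
#include <stdint.h>

/* Return the offset of the ']' starting the "]]>" terminator, using
 * the same counter logic as above, or len if no terminator is found. */
static size_t find_cdata_end(const uint8_t *data, size_t len)
{
	size_t i, end = 0;

	for (i = 0; i < len; i++) {
		if (data[i] == ']' && end < 2)
			end++;
		else if (data[i] == '>' && end == 2)
			return i - 2;
		else
			end = 0;
	}

	return len;
}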
2873 
2874 
2875 static hubbub_error hubbub_tokeniser_consume_character_reference(
2876  hubbub_tokeniser *tokeniser, size_t pos)
2877 {
2878  uint32_t allowed_char = tokeniser->context.allowed_char;
2879 
2880  size_t len;
2881  const uint8_t *cptr;
2882  parserutils_error error;
2883  uint8_t c;
2884  size_t off;
2885 
2886  error = parserutils_inputstream_peek(tokeniser->input, pos,
2887  &cptr, &len);
2888 
2889  /* We should always start on an ampersand */
2890  assert(error == PARSERUTILS_OK);
2891  assert(len == 1 && *cptr == '&');
2892 
2893  off = pos + len;
2894 
2895  /* Look at the character after the ampersand */
2896  error = parserutils_inputstream_peek(tokeniser->input, off,
2897  &cptr, &len);
2898 
2899  if (error != PARSERUTILS_OK) {
2900  if (error == PARSERUTILS_EOF) {
2901  tokeniser->context.match_entity.complete = true;
2902  tokeniser->context.match_entity.codepoint = 0;
2903  return HUBBUB_OK;
2904  } else {
2905  return hubbub_error_from_parserutils_error(error);
2906  }
2907  }
2908 
2909  c = *cptr;
2910 
2911  /* Set things up */
2912  tokeniser->context.match_entity.offset = off;
2913  tokeniser->context.match_entity.poss_length = 0;
2914  tokeniser->context.match_entity.length = 0;
2915  tokeniser->context.match_entity.base = 0;
2916  tokeniser->context.match_entity.codepoint = 0;
2917  tokeniser->context.match_entity.had_data = false;
2918  tokeniser->context.match_entity.return_state = tokeniser->state;
2919  tokeniser->context.match_entity.complete = false;
2920  tokeniser->context.match_entity.overflow = false;
2921  tokeniser->context.match_entity.context = -1;
2922  tokeniser->context.match_entity.prev_len = len;
2923 
2924  /* Reset allowed character for future calls */
2925  tokeniser->context.allowed_char = '\0';
2926 
2927  if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
2928  c == '<' || c == '&' ||
2929  (allowed_char && c == allowed_char)) {
2930  tokeniser->context.match_entity.complete = true;
2931  tokeniser->context.match_entity.codepoint = 0;
2932  } else if (c == '#') {
2933  tokeniser->context.match_entity.length += len;
2934  tokeniser->state = STATE_NUMBERED_ENTITY;
2935  } else {
2936  tokeniser->state = STATE_NAMED_ENTITY;
2937  }
2938 
2939  return HUBBUB_OK;
2940 }
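
The checks above decide whether the '&' begins a character reference at all; allowed_char carries the active quote character when the reference occurs inside an attribute value, so a quote immediately after '&' is treated as plain text. A sketch of the same predicate (the helper name is ours):

#include <stdbool.h>
#include <stdint.h>

/* True if the character following '&' means "not a reference":
 * whitespace, '<', a second '&', or the extra allowed character. */
static bool aborts_character_reference(uint8_t c, uint8_t allowed_char)
{
	return c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
			c == '<' || c == '&' ||
			(allowed_char != '\0' && c == allowed_char);
}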
2941 
2942 
2943 static hubbub_error hubbub_tokeniser_handle_numbered_entity(
2944  hubbub_tokeniser *tokeniser)
2945 {
2946  hubbub_tokeniser_context *ctx = &tokeniser->context;
2947 
2948  size_t len;
2949  const uint8_t *cptr;
2950  parserutils_error error;
2951 
2952  error = parserutils_inputstream_peek(tokeniser->input,
2953  ctx->match_entity.offset + ctx->match_entity.length,
2954  &cptr, &len);
2955 
2956  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2957  return hubbub_error_from_parserutils_error(error);
2958  }
2959 
2960  if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
2961  uint8_t c = *cptr;
2962  if ((c & ~0x20) == 'X') {
2963  ctx->match_entity.base = 16;
2964  ctx->match_entity.length += len;
2965  } else {
2966  ctx->match_entity.base = 10;
2967  }
2968  }
2969 
2970  while ((error = parserutils_inputstream_peek(tokeniser->input,
2971  ctx->match_entity.offset + ctx->match_entity.length,
2972  &cptr, &len)) == PARSERUTILS_OK) {
2973  uint8_t c = *cptr;
2974 
2975  if (ctx->match_entity.base == 10 &&
2976  ('0' <= c && c <= '9')) {
2977  ctx->match_entity.had_data = true;
2978  ctx->match_entity.codepoint =
2979  ctx->match_entity.codepoint * 10 + (c - '0');
2980 
2981  ctx->match_entity.length += len;
2982  } else if (ctx->match_entity.base == 16 &&
2983  (('0' <= c && c <= '9') ||
2984  ('A' <= (c & ~0x20) &&
2985  (c & ~0x20) <= 'F'))) {
2986  ctx->match_entity.had_data = true;
2987  ctx->match_entity.codepoint *= 16;
2988 
2989  if ('0' <= c && c <= '9') {
2990  ctx->match_entity.codepoint += (c - '0');
2991  } else {
2992  ctx->match_entity.codepoint +=
2993  ((c & ~0x20) - 'A' + 10);
2994  }
2995 
2996  ctx->match_entity.length += len;
2997  } else {
2998  break;
2999  }
3000 
3001  if (ctx->match_entity.codepoint >= 0x10FFFF) {
3002  ctx->match_entity.overflow = true;
3003  }
3004  }
3005 
3006  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3007  return hubbub_error_from_parserutils_error(error);
3008  }
3009 
3010  /* Eat trailing semicolon, if any */
3011  if (error != PARSERUTILS_EOF && *cptr == ';') {
3012  ctx->match_entity.length += len;
3013  }
3014 
3015  /* Had data, so calculate final codepoint */
3016  if (ctx->match_entity.had_data) {
3017  uint32_t cp = ctx->match_entity.codepoint;
3018 
3019  if (0x80 <= cp && cp <= 0x9F) {
3020  cp = cp1252Table[cp - 0x80];
3021  } else if (cp == 0x0D) {
3022  cp = 0x000A;
3023  } else if (ctx->match_entity.overflow ||
3024  cp <= 0x0008 || cp == 0x000B ||
3025  (0x000E <= cp && cp <= 0x001F) ||
3026  (0x007F <= cp && cp <= 0x009F) ||
3027  (0xD800 <= cp && cp <= 0xDFFF) ||
3028  (0xFDD0 <= cp && cp <= 0xFDEF) ||
3029  (cp & 0xFFFE) == 0xFFFE) {
3030  /* the check for cp > 0x10FFFF per spec is performed
3031  * in the loop above to avoid overflow */
3032  cp = 0xFFFD;
3033  }
3034 
3035  ctx->match_entity.codepoint = cp;
3036  }
3037 
3038  /* Flag completion */
3039  ctx->match_entity.complete = true;
3040 
3041  /* And back to the state we were entered in */
3042  tokeniser->state = ctx->match_entity.return_state;
3043 
3044  return HUBBUB_OK;
3045 }
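
A standalone sketch of the digit accumulation and sanitisation performed above; the Windows-1252 remap of 0x80-0x9F done via cp1252Table is elided here, so those values fall through to U+FFFD (the helper name is ours):

#include <stdbool.h>
#include <stdint.h>

/* Decode the digits of a numeric reference ("123" or "x7F"),
 * latching overflow and replacing disallowed codepoints. */
static uint32_t decode_numeric_reference(const char *digits)
{
	uint32_t cp = 0, base = 10;
	bool overflow = false;

	if ((*digits & ~0x20) == 'X') {
		base = 16;
		digits++;
	}

	for (; *digits != '\0'; digits++) {
		uint32_t v;

		if ('0' <= *digits && *digits <= '9')
			v = *digits - '0';
		else if (base == 16 && 'A' <= (*digits & ~0x20) &&
				(*digits & ~0x20) <= 'F')
			v = (*digits & ~0x20) - 'A' + 10;
		else
			break;

		cp = cp * base + v;
		if (cp >= 0x10FFFF)
			overflow = true;	/* latched, as above */
	}

	if (cp == 0x0D)
		cp = 0x0A;
	else if (overflow || cp <= 0x0008 || cp == 0x000B ||
			(0x000E <= cp && cp <= 0x001F) ||
			(0x007F <= cp && cp <= 0x009F) ||
			(0xD800 <= cp && cp <= 0xDFFF) ||
			(0xFDD0 <= cp && cp <= 0xFDEF) ||
			(cp & 0xFFFE) == 0xFFFE)
		cp = 0xFFFD;

	return cp;
}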
3046 
3047 static hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
3048 {
3049  hubbub_tokeniser_context *ctx = &tokeniser->context;
3050 
3051  size_t len;
3052  const uint8_t *cptr;
3053  parserutils_error error;
3054 
3055  while ((error = parserutils_inputstream_peek(tokeniser->input,
3056  ctx->match_entity.offset +
3057  ctx->match_entity.poss_length,
3058  &cptr, &len)) == PARSERUTILS_OK) {
3059  uint32_t cp;
3060 
3061  uint8_t c = *cptr;
3062  hubbub_error error;
3063 
3064  if (c > 0x7F) {
3065  /* Entity names are ASCII only */
3066  break;
3067  }
3068 
3069  error = hubbub_entities_search_step(c, &cp,
3070  &ctx->match_entity.context);
3071  if (error == HUBBUB_OK) {
3072  /* Had a match - store it for later */
3073  ctx->match_entity.codepoint = cp;
3074 
3075  ctx->match_entity.length =
3076  ctx->match_entity.poss_length + len;
3077  ctx->match_entity.poss_length =
3078  ctx->match_entity.length;
3079  } else if (error == HUBBUB_INVALID) {
3080  /* No further matches - use last found */
3081  break;
3082  } else {
3083  /* Need more data */
3084  ctx->match_entity.poss_length += len;
3085  }
3086  }
3087 
3088  if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3089  return hubbub_error_from_parserutils_error(error);
3090  }
3091 
3092  if (ctx->match_entity.length > 0) {
3093  uint8_t c;
3094  error = parserutils_inputstream_peek(tokeniser->input,
3095  ctx->match_entity.offset +
3096  ctx->match_entity.length - 1,
3097  &cptr, &len);
3098  /* We're re-reading a character that we've already read
3099  * past. Therefore, there's no way that an error can occur
3100  * as a result. */
3101  assert(error == PARSERUTILS_OK);
3102 
3103  c = *cptr;
3104 
3105  if ((tokeniser->context.match_entity.return_state ==
3106  STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
3107  c != ';') {
3108  error = parserutils_inputstream_peek(tokeniser->input,
3109  ctx->match_entity.offset +
3110  ctx->match_entity.length,
3111  &cptr, &len);
3112  /* We must have attempted to read one more character
3113  * than was present in the entity name, as that is the
3114  * only way to break out of the loop above. If that
3115  * failed, then any non-EOF case will have been handled
3116  * by the if statement after the loop thus it cannot
3117  * occur here. */
3118  assert(error == PARSERUTILS_OK ||
3119  error == PARSERUTILS_EOF);
3120 
3121  if (error == PARSERUTILS_EOF) {
3122  ctx->match_entity.codepoint = 0;
3123  }
3124 
3125  c = *cptr;
3126  if ((0x0030 <= c && c <= 0x0039) ||
3127  (0x0041 <= c && c <= 0x005A) ||
3128  (0x0061 <= c && c <= 0x007A)) {
3129  ctx->match_entity.codepoint = 0;
3130  }
3131  }
3132  }
3133 
3134  /* Flag completion */
3135  ctx->match_entity.complete = true;
3136 
3137  /* And back to the state from whence we came */
3138  tokeniser->state = ctx->match_entity.return_state;
3139 
3140  return HUBBUB_OK;
3141 }
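
The block above implements the legacy rule for references in attribute values: a named match that did not end in ';' is discarded when the next character is alphanumeric, so that strings such as "&current" inside a URL are left alone. A sketch of the follow-up test (the helper name is ours; the WHATWG specification also lists '=' as a terminating case, which the code above does not check):

#include <stdbool.h>
#include <stdint.h>

/* True if an unterminated named match in an attribute value should
 * be ignored, given the character that follows the matched name. */
static bool ignore_unterminated_match(uint8_t next)
{
	return ('0' <= next && next <= '9') ||
			('A' <= next && next <= 'Z') ||
			('a' <= next && next <= 'z');
}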
3142 
3143 
3144 
3145 /*** Token emitting bits ***/
3146 
3154 static hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
3155  const hubbub_string *chars)
3156 {
3157  hubbub_token token;
3158 
3159  token.type = HUBBUB_TOKEN_CHARACTER;
3160  token.data.character = *chars;
3161 
3162  return hubbub_tokeniser_emit_token(tokeniser, &token);
3163 }
3164 
3171 static hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
3172 {
3173  hubbub_token token;
3174  size_t len;
3175  const uint8_t *cptr = NULL;
3176  parserutils_error error;
3177 
3178  /* Calling this with nothing to output is a probable bug */
3179  assert(tokeniser->context.pending > 0);
3180 
3181  error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
3182  if (error != PARSERUTILS_OK)
3183  return hubbub_error_from_parserutils_error(error);
3184 
3185  token.type = HUBBUB_TOKEN_CHARACTER;
3186  token.data.character.ptr = cptr;
3187  token.data.character.len = tokeniser->context.pending;
3188 
3189  return hubbub_tokeniser_emit_token(tokeniser, &token);
3190 }
3191 
3198 static hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
3199 {
3200  hubbub_error err;
3201  hubbub_token token;
3202  uint32_t n_attributes;
3203  hubbub_attribute *attrs;
3204  uint8_t *ptr;
3205  uint32_t i, j;
3206 
3207  /* Emit current tag */
3208  token.type = tokeniser->context.current_tag_type;
3209  token.data.tag = tokeniser->context.current_tag;
3210  token.data.tag.ns = HUBBUB_NS_HTML;
3211 
3212 
3213  n_attributes = token.data.tag.n_attributes;
3214  attrs = token.data.tag.attributes;
3215 
3216  /* Set pointers correctly... */
3217  ptr = tokeniser->buffer->data;
3218  token.data.tag.name.ptr = tokeniser->buffer->data;
3219  ptr += token.data.tag.name.len;
3220 
3221  for (i = 0; i < n_attributes; i++) {
3222  attrs[i].name.ptr = ptr;
3223  ptr += attrs[i].name.len;
3224  attrs[i].value.ptr = ptr;
3225  ptr += attrs[i].value.len;
3226  }
3227 
3228 
3229  /* Discard duplicate attributes */
3230  for (i = 0; i < n_attributes; i++) {
3231  for (j = 0; j < n_attributes; j++) {
3232  uint32_t move;
3233 
3234  if (j == i ||
3235  attrs[i].name.len !=
3236  attrs[j].name.len ||
3237  strncmp((char *) attrs[i].name.ptr,
3238  (char *) attrs[j].name.ptr,
3239  attrs[i].name.len) != 0) {
3240  /* Attributes don't match */
3241  continue;
3242  }
3243 
3244  assert(i < j);
3245 
3246  /* Calculate amount to move */
3247  move = (n_attributes - 1 - j) *
3248  sizeof(hubbub_attribute);
3249 
3250  if (move > 0) {
3251  memmove(&attrs[j],&attrs[j+1], move);
3252  }
3253 
3254  /* We've deleted an item, so we need to
3255  * reprocess this index */
3256  j--;
3257 
3258  /* And reduce the number of attributes */
3259  n_attributes--;
3260  }
3261  }
3262 
3263  token.data.tag.n_attributes = n_attributes;
3264 
3265  err = hubbub_tokeniser_emit_token(tokeniser, &token);
3266 
3267  if (token.type == HUBBUB_TOKEN_START_TAG) {
3268  /* Save start tag name for R?CDATA */
3269  if (token.data.tag.name.len <
3270  sizeof(tokeniser->context.last_start_tag_name)) {
3271  strncpy((char *) tokeniser->context.last_start_tag_name,
3272  (const char *) token.data.tag.name.ptr,
3273  token.data.tag.name.len);
3274  tokeniser->context.last_start_tag_len =
3275  token.data.tag.name.len;
3276  } else {
3277  tokeniser->context.last_start_tag_name[0] = '\0';
3278  tokeniser->context.last_start_tag_len = 0;
3279  }
3280  } else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
3281  /* Reset content model after R?CDATA elements */
3282  tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
3283  }
3284 
3285  /* Reset the self-closing flag */
3286  tokeniser->context.current_tag.self_closing = false;
3287 
3288  return err;
3289 }
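
Duplicate attributes are resolved in favour of the first occurrence: each later duplicate is overwritten by shifting the array tail down with memmove, and the freshly filled slot is re-examined. A standalone sketch of the same compaction (the struct and names are ours):

#include <stdint.h>
#include <string.h>

struct attr {
	const char *name;
	const char *value;
};

/* Drop later duplicates of each name in place; returns the new count. */
static uint32_t dedupe_attributes(struct attr *attrs, uint32_t n)
{
	uint32_t i, j;

	for (i = 0; i < n; i++) {
		for (j = i + 1; j < n; j++) {
			if (strcmp(attrs[i].name, attrs[j].name) != 0)
				continue;

			/* Shuffle the tail down over the duplicate */
			memmove(&attrs[j], &attrs[j + 1],
					(n - 1 - j) * sizeof(*attrs));
			n--;
			j--;	/* re-examine the slot just filled */
		}
	}

	return n;
}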
3290 
3297 static hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
3298 {
3299  hubbub_token token;
3300 
3301  token.type = HUBBUB_TOKEN_COMMENT;
3302  token.data.comment.ptr = tokeniser->buffer->data;
3303  token.data.comment.len = tokeniser->buffer->length;
3304 
3305  return hubbub_tokeniser_emit_token(tokeniser, &token);
3306 }
3307 
3315 static hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
3316  bool force_quirks)
3317 {
3318  hubbub_token token;
3319 
3320  /* Emit doctype */
3321  token.type = HUBBUB_TOKEN_DOCTYPE;
3322  token.data.doctype = tokeniser->context.current_doctype;
3323  if (force_quirks == true)
3324  token.data.doctype.force_quirks = true;
3325 
3326  /* Set pointers correctly */
3327  token.data.doctype.name.ptr = tokeniser->buffer->data;
3328 
3329  if (token.data.doctype.public_missing == false) {
3330  token.data.doctype.public_id.ptr = tokeniser->buffer->data +
3331  token.data.doctype.name.len;
3332  }
3333 
3334  if (token.data.doctype.system_missing == false) {
3335  token.data.doctype.system_id.ptr = tokeniser->buffer->data +
3336  token.data.doctype.name.len +
3337  token.data.doctype.public_id.len;
3338  }
3339 
3340  return hubbub_tokeniser_emit_token(tokeniser, &token);
3341 }
3342 
3349 static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
3350  hubbub_token *token)
3351 {
3352  hubbub_error err = HUBBUB_OK;
3353 
3354  assert(tokeniser != NULL);
3355  assert(token != NULL);
3356  assert(tokeniser->insert_buf->length == 0);
3357 
3358 #ifndef NDEBUG
3359  /* Sanity checks */
3360  switch (token->type) {
3361  case HUBBUB_TOKEN_DOCTYPE:
3362  assert(memchr(token->data.doctype.name.ptr, 0xff,
3363  token->data.doctype.name.len) == NULL);
3364  if (token->data.doctype.public_missing == false)
3365  assert(memchr(token->data.doctype.public_id.ptr, 0xff,
3366  token->data.doctype.public_id.len) == NULL);
3367  if (token->data.doctype.system_missing == false)
3368  assert(memchr(token->data.doctype.system_id.ptr, 0xff,
3369  token->data.doctype.system_id.len) == NULL);
3370  break;
3371  case HUBBUB_TOKEN_START_TAG:
3372  case HUBBUB_TOKEN_END_TAG:
3373  {
3374  uint32_t i;
3375  assert(memchr(token->data.tag.name.ptr, 0xff,
3376  token->data.tag.name.len) == NULL);
3377  for (i = 0; i < token->data.tag.n_attributes; i++) {
3378  hubbub_attribute *attr = &token->data.tag.attributes[i];
3379 
3380  assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
3381  NULL);
3382  assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
3383  NULL);
3384  }
3385  }
3386  break;
3387  case HUBBUB_TOKEN_COMMENT:
3388  assert(memchr(token->data.comment.ptr, 0xff,
3389  token->data.comment.len) == NULL);
3390  break;
3391  case HUBBUB_TOKEN_CHARACTER:
3392  assert(memchr(token->data.character.ptr, 0xff,
3393  token->data.character.len) == NULL);
3394  break;
3395  case HUBBUB_TOKEN_EOF:
3396  break;
3397  }
3398 #endif
3399 
3400  /* Emit the token */
3401  if (tokeniser->token_handler) {
3402  err = tokeniser->token_handler(token, tokeniser->token_pw);
3403  }
3404 
3405  /* Discard current buffer */
3406  if (tokeniser->buffer->length) {
3407  parserutils_buffer_discard(tokeniser->buffer, 0,
3408  tokeniser->buffer->length);
3409  }
3410 
3411  /* Advance the pointer */
3412  if (tokeniser->context.pending) {
3413  parserutils_inputstream_advance(tokeniser->input,
3414  tokeniser->context.pending);
3415  tokeniser->context.pending = 0;
3416  }
3417 
3418  if (tokeniser->insert_buf->length > 0) {
3419  parserutils_inputstream_insert(tokeniser->input,
3420  tokeniser->insert_buf->data,
3421  tokeniser->insert_buf->length);
3422  parserutils_buffer_discard(tokeniser->insert_buf, 0,
3423  tokeniser->insert_buf->length);
3424  }
3425 
3426  /* Ensure the callback can pause tokenisation */
3427  if (err == HUBBUB_PAUSED) {
3428  tokeniser->paused = true;
3429  }
3430 
3431  return err;
3432 }
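
A HUBBUB_PAUSED return from the client's token handler is turned into tokeniser->paused above, so the pause propagates out of the tokeniser until the client resumes it. A sketch of such a handler (the handler name is ours; types come from the public hubbub headers):

#include <stdio.h>

#include "hubbub/errors.h"
#include "hubbub/types.h"

/* Print each start tag name and pause tokenisation after it. */
static hubbub_error pause_on_start_tag(const hubbub_token *token, void *pw)
{
	(void) pw;

	if (token->type == HUBBUB_TOKEN_START_TAG) {
		printf("<%.*s> -- pausing\n",
				(int) token->data.tag.name.len,
				(const char *) token->data.tag.name.ptr);
		return HUBBUB_PAUSED;
	}

	return HUBBUB_OK;
}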