Libparserutils
inputstream.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
15 
16 #include "input/filter.h"
17 #include "utils/utils.h"
18 
30  uint16_t mibenum;
31  uint32_t encsrc;
37 
41  uint16_t *mibenum, parserutils_buffer *buffer);
42 
60  uint32_t encsrc, parserutils_charset_detect_func csdetect,
61  parserutils_inputstream **stream)
62 {
64  parserutils_error error;
65 
66  if (stream == NULL)
67  return PARSERUTILS_BADPARM;
68 
69  s = malloc(sizeof(parserutils_inputstream_private));
70  if (s == NULL)
71  return PARSERUTILS_NOMEM;
72 
73  error = parserutils_buffer_create(&s->raw);
74  if (error != PARSERUTILS_OK) {
75  free(s);
76  return error;
77  }
78 
80  if (error != PARSERUTILS_OK) {
82  free(s);
83  return error;
84  }
85 
86  s->public.cursor = 0;
87  s->public.had_eof = false;
88  s->done_first_chunk = false;
89 
90  error = parserutils__filter_create("UTF-8", &s->input);
91  if (error != PARSERUTILS_OK) {
94  free(s);
95  return error;
96  }
97 
98  if (enc != NULL) {
100 
101  s->mibenum =
102  parserutils_charset_mibenum_from_name(enc, strlen(enc));
103 
104  if (s->mibenum == 0) {
108  free(s);
110  }
111 
112  params.encoding.name = enc;
113 
116  &params);
117  if (error != PARSERUTILS_OK) {
121  free(s);
122  return error;
123  }
124 
125  s->encsrc = encsrc;
126  } else {
127  s->mibenum = 0;
128  s->encsrc = 0;
129  }
130 
131  s->csdetect = csdetect;
132 
133  *stream = (parserutils_inputstream *) s;
134 
135  return PARSERUTILS_OK;
136 }
137 
145  parserutils_inputstream *stream)
146 {
149 
150  if (stream == NULL)
151  return PARSERUTILS_BADPARM;
152 
156  free(s);
157 
158  return PARSERUTILS_OK;
159 }
160 
170  parserutils_inputstream *stream,
171  const uint8_t *data, size_t len)
172 {
175 
176  if (stream == NULL)
177  return PARSERUTILS_BADPARM;
178 
179  if (data == NULL) {
180  s->public.had_eof = true;
181  return PARSERUTILS_OK;
182  }
183 
184  return parserutils_buffer_append(s->raw, data, len);
185 }
186 
196  parserutils_inputstream *stream,
197  const uint8_t *data, size_t len)
198 {
201 
202  if (stream == NULL || data == NULL)
203  return PARSERUTILS_BADPARM;
204 
206  data, len);
207 }
208 
209 #define IS_ASCII(x) (((x) & 0x80) == 0)
210 
233  parserutils_inputstream *stream,
234  size_t offset, const uint8_t **ptr, size_t *length)
235 {
239  size_t len;
240 
241  if (stream == NULL || ptr == NULL || length == NULL)
242  return PARSERUTILS_BADPARM;
243 
244  /* There's insufficient data in the buffer, so read some more */
245  if (s->raw->length == 0) {
246  /* No more data to be had */
247  return s->public.had_eof ? PARSERUTILS_EOF
249  }
250 
251  /* Refill utf8 buffer from raw buffer */
253  if (error != PARSERUTILS_OK)
254  return error;
255 
256  /* Refill may have succeeded, but not actually produced any new data */
257  if (s->public.cursor + offset == s->public.utf8->length)
258  return PARSERUTILS_NEEDDATA;
259 
260  /* Now try the read */
261  if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
262  len = 1;
263  } else {
265  s->public.utf8->data + s->public.cursor + offset,
266  &len);
267 
268  if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
269  return error;
270 
271  if (error == PARSERUTILS_NEEDDATA) {
272  return s->public.had_eof ? PARSERUTILS_EOF
274  }
275  }
276 
277  (*length) = len;
278  (*ptr) = (s->public.utf8->data + s->public.cursor + offset);
279 
280  return PARSERUTILS_OK;
281 }
282 
283 #undef IS_ASCII
284 
293  parserutils_inputstream *stream, uint32_t *source)
294 {
297 
298  if (stream == NULL || source == NULL)
299  return NULL;
300 
301  *source = s->encsrc;
302 
303  if (s->encsrc == 0)
304  return "UTF-8";
305 
307 }
308 
322  parserutils_inputstream *stream,
323  const char *enc, uint32_t source)
324 {
328  uint16_t temp;
329  parserutils_error error;
330 
331  if (stream == NULL || enc == NULL)
332  return PARSERUTILS_BADPARM;
333 
334  if (s->done_first_chunk)
335  return PARSERUTILS_INVALID;
336 
337  temp = parserutils_charset_mibenum_from_name(enc, strlen(enc));
338  if (temp == 0)
340 
341  /* Ensure filter is using the correct encoding */
342  params.encoding.name = enc;
345  &params);
346  if (error != PARSERUTILS_OK)
347  return error;
348 
349  /* Finally, replace the current settings */
350  s->mibenum = temp;
351  s->encsrc = source;
352 
353  return PARSERUTILS_OK;
354 }
355 
356 /******************************************************************************
357  ******************************************************************************/
358 
367 {
368  const uint8_t *raw;
369  uint8_t *utf8;
370  size_t raw_length, utf8_space;
371  parserutils_error error;
372 
373  /* If this is the first chunk of data, we must detect the charset and
374  * strip the BOM, if one exists */
375  if (stream->done_first_chunk == false) {
377 
378  /* If there is a charset detection routine, give it an
379  * opportunity to override any charset specified when the
380  * inputstream was created */
381  if (stream->csdetect != NULL) {
382  error = stream->csdetect(stream->raw->data,
383  stream->raw->length,
384  &stream->mibenum, &stream->encsrc);
385  if (error != PARSERUTILS_OK) {
386  if (error != PARSERUTILS_NEEDDATA ||
387  stream->public.had_eof == false)
388  return error;
389 
390  /* We don't have enough data to detect the
391  * input encoding, but we're not going to get
392  * any more as we've been notified of EOF.
393  * Therefore, leave the encoding alone
394  * so that any charset specified when the
395  * inputstream was created will be preserved.
396  * If there was no charset specified, then
397  * we'll default to UTF-8, below */
398  }
399  }
400 
401  /* Default to UTF-8 if there is still no encoding information
402  * We'll do this if there was no encoding specified up-front
403  * and:
404  * 1) there was no charset detection routine
405  * or 2) there was insufficient data for the charset
406  * detection routine to detect an encoding
407  */
408  if (stream->mibenum == 0) {
409  stream->mibenum =
411  SLEN("UTF-8"));
412  stream->encsrc = 0;
413  }
414 
415  assert(stream->mibenum != 0);
416 
417  /* Strip any BOM, and update encoding as appropriate */
418  error = parserutils_inputstream_strip_bom(&stream->mibenum,
419  stream->raw);
420  if (error != PARSERUTILS_OK)
421  return error;
422 
423  /* Ensure filter is using the correct encoding */
424  params.encoding.name =
426 
427  error = parserutils__filter_setopt(stream->input,
429  &params);
430  if (error != PARSERUTILS_OK)
431  return error;
432 
433  stream->done_first_chunk = true;
434  }
435 
436  /* Work out how to perform the buffer fill */
437  if (stream->public.cursor == stream->public.utf8->length) {
438  /* Cursor's at the end, so simply reuse the entire buffer */
439  utf8 = stream->public.utf8->data;
440  utf8_space = stream->public.utf8->allocated;
441  } else {
442  /* Cursor's not at the end, so shift data after cursor to the
443  * bottom of the buffer. If the buffer's still over half full,
444  * extend it. */
445  memmove(stream->public.utf8->data,
446  stream->public.utf8->data + stream->public.cursor,
447  stream->public.utf8->length - stream->public.cursor);
448 
449  stream->public.utf8->length -= stream->public.cursor;
450 
451  if (stream->public.utf8->length >
452  stream->public.utf8->allocated / 2) {
453  error = parserutils_buffer_grow(stream->public.utf8);
454  if (error != PARSERUTILS_OK)
455  return error;
456  }
457 
458  utf8 = stream->public.utf8->data + stream->public.utf8->length;
459  utf8_space = stream->public.utf8->allocated -
460  stream->public.utf8->length;
461  }
462 
463  raw = stream->raw->data;
464  raw_length = stream->raw->length;
465 
466  /* Try to fill utf8 buffer from the raw data */
467  error = parserutils__filter_process_chunk(stream->input,
468  &raw, &raw_length, &utf8, &utf8_space);
469  /* _NOMEM implies that there's more input to read than available space
470  * in the utf8 buffer. That's fine, so we'll ignore that error. */
471  if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
472  return error;
473 
474  /* Remove the raw data we've processed from the raw buffer */
475  error = parserutils_buffer_discard(stream->raw, 0,
476  stream->raw->length - raw_length);
477  if (error != PARSERUTILS_OK)
478  return error;
479 
480  /* Fix up the utf8 buffer information */
481  stream->public.utf8->length =
482  stream->public.utf8->allocated - utf8_space;
483 
484  /* Finally, fix up the cursor */
485  stream->public.cursor = 0;
486 
487  return PARSERUTILS_OK;
488 }
489 
497  parserutils_buffer *buffer)
498 {
499  static uint16_t utf8;
500  static uint16_t utf16;
501  static uint16_t utf16be;
502  static uint16_t utf16le;
503  static uint16_t utf32;
504  static uint16_t utf32be;
505  static uint16_t utf32le;
506 
507  if (utf8 == 0) {
509  SLEN("UTF-8"));
510  utf16 = parserutils_charset_mibenum_from_name("UTF-16",
511  SLEN("UTF-16"));
512  utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
513  SLEN("UTF-16BE"));
514  utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
515  SLEN("UTF-16LE"));
516  utf32 = parserutils_charset_mibenum_from_name("UTF-32",
517  SLEN("UTF-32"));
518  utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
519  SLEN("UTF-32BE"));
520  utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
521  SLEN("UTF-32LE"));
522  }
523 
524 #define UTF32_BOM_LEN (4)
525 #define UTF16_BOM_LEN (2)
526 #define UTF8_BOM_LEN (3)
527 
528  if (*mibenum == utf8) {
529  if (buffer->length >= UTF8_BOM_LEN &&
530  buffer->data[0] == 0xEF &&
531  buffer->data[1] == 0xBB &&
532  buffer->data[2] == 0xBF) {
534  buffer, 0, UTF8_BOM_LEN);
535  }
536  } else if (*mibenum == utf16be) {
537  if (buffer->length >= UTF16_BOM_LEN &&
538  buffer->data[0] == 0xFE &&
539  buffer->data[1] == 0xFF) {
541  buffer, 0, UTF16_BOM_LEN);
542  }
543  } else if (*mibenum == utf16le) {
544  if (buffer->length >= UTF16_BOM_LEN &&
545  buffer->data[0] == 0xFF &&
546  buffer->data[1] == 0xFE) {
548  buffer, 0, UTF16_BOM_LEN);
549  }
550  } else if (*mibenum == utf16) {
551  *mibenum = utf16be;
552 
553  if (buffer->length >= UTF16_BOM_LEN) {
554  if (buffer->data[0] == 0xFE &&
555  buffer->data[1] == 0xFF) {
557  buffer, 0, UTF16_BOM_LEN);
558  } else if (buffer->data[0] == 0xFF &&
559  buffer->data[1] == 0xFE) {
560  *mibenum = utf16le;
562  buffer, 0, UTF16_BOM_LEN);
563  }
564  }
565  } else if (*mibenum == utf32be) {
566  if (buffer->length >= UTF32_BOM_LEN &&
567  buffer->data[0] == 0x00 &&
568  buffer->data[1] == 0x00 &&
569  buffer->data[2] == 0xFE &&
570  buffer->data[3] == 0xFF) {
572  buffer, 0, UTF32_BOM_LEN);
573  }
574  } else if (*mibenum == utf32le) {
575  if (buffer->length >= UTF32_BOM_LEN &&
576  buffer->data[0] == 0xFF &&
577  buffer->data[1] == 0xFE &&
578  buffer->data[2] == 0x00 &&
579  buffer->data[3] == 0x00) {
581  buffer, 0, UTF32_BOM_LEN);
582  }
583  } else if (*mibenum == utf32) {
584  *mibenum = utf32be;
585 
586  if (buffer->length >= UTF32_BOM_LEN) {
587  if (buffer->data[0] == 0x00 &&
588  buffer->data[1] == 0x00 &&
589  buffer->data[2] == 0xFE &&
590  buffer->data[3] == 0xFF) {
592  buffer, 0, UTF32_BOM_LEN);
593  } else if (buffer->data[0] == 0xFF &&
594  buffer->data[1] == 0xFE &&
595  buffer->data[2] == 0x00 &&
596  buffer->data[3] == 0x00) {
597  *mibenum = utf32le;
599  buffer, 0, UTF32_BOM_LEN);
600  }
601  }
602  }
603 
604 #undef UTF8_BOM_LEN
605 #undef UTF16_BOM_LEN
606 #undef UTF32_BOM_LEN
607 
608  return PARSERUTILS_OK;
609 }
610 
#define SLEN(s)
Definition: utils.h:21
struct parserutils_filter_optparams::@5 encoding
Parameters for encoding setting.
const char * parserutils_charset_mibenum_to_name(uint16_t mibenum)
Retrieve the canonical name of an encoding from the MIB enum.
Definition: aliases.c:127
Input filter option parameters.
Definition: filter.h:28
#define UTF32_BOM_LEN
parserutils_error parserutils__filter_setopt(parserutils_filter *input, parserutils_filter_opttype type, parserutils_filter_optparams *params)
Configure an input filter.
Definition: filter.c:149
bool done_first_chunk
Whether the first chunk has been processed.
Definition: inputstream.c:27
size_t length
Definition: buffer.h:22
parserutils_error parserutils_inputstream_destroy(parserutils_inputstream *stream)
Destroy an input stream.
Definition: inputstream.c:144
parserutils_buffer * raw
Buffer containing raw data.
Definition: inputstream.c:25
parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
Extend the amount of space allocated for a memory buffer.
Definition: buffer.c:150
const char * name
Encoding name.
Definition: filter.h:32
parserutils_error parserutils_inputstream_change_charset(parserutils_inputstream *stream, const char *enc, uint32_t source)
Change the source charset of the input stream.
Definition: inputstream.c:321
parserutils_error parserutils__filter_process_chunk(parserutils_filter *input, const uint8_t **data, size_t *len, uint8_t **output, size_t *outlen)
Process a chunk of data.
Definition: filter.c:179
#define UTF16_BOM_LEN
UTF-8 manipulation functions (interface).
parserutils_error
Definition: errors.h:18
Private input stream definition.
Definition: inputstream.c:22
parserutils_charset_detect_func csdetect
Charset detection func.
Definition: inputstream.c:35
#define IS_ASCII(x)
Definition: inputstream.c:209
parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, size_t *len)
Calculate the length (in bytes) of a UTF-8 character.
Definition: utf8.c:107
uint8_t * data
Definition: buffer.h:21
size_t len
Definition: codec_8859.c:23
parserutils_error parserutils_buffer_create(parserutils_buffer **buffer)
Create a memory buffer.
Definition: buffer.c:22
Input filter.
Definition: filter.c:24
parserutils_error parserutils_inputstream_peek_slow(parserutils_inputstream *stream, size_t offset, const uint8_t **ptr, size_t *length)
Look at the character in the stream that starts at offset bytes from the cursor (slow version)
Definition: inputstream.c:232
static parserutils_error parserutils_inputstream_refill_buffer(parserutils_inputstream_private *stream)
Refill the UTF-8 buffer from the raw buffer.
Definition: inputstream.c:365
parserutils_error parserutils_inputstream_insert(parserutils_inputstream *stream, const uint8_t *data, size_t len)
Insert data into stream at current location.
Definition: inputstream.c:195
struct parserutils_inputstream_private parserutils_inputstream_private
Private input stream definition.
parserutils_error parserutils_inputstream_append(parserutils_inputstream *stream, const uint8_t *data, size_t len)
Append data to an input stream.
Definition: inputstream.c:169
static parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum, parserutils_buffer *buffer)
Strip a BOM from a buffer in the given encoding.
Definition: inputstream.c:496
parserutils_error parserutils_inputstream_create(const char *enc, uint32_t encsrc, parserutils_charset_detect_func csdetect, parserutils_inputstream **stream)
Create an input stream.
Definition: inputstream.c:59
parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, size_t offset, const uint8_t *data, size_t len)
Insert data into a memory buffer.
Definition: buffer.c:97
bool had_eof
Whether EOF has been reached.
Definition: inputstream.h:45
parserutils_inputstream public
Public part.
Definition: inputstream.c:23
uint32_t cursor
Byte offset of current position.
Definition: inputstream.h:43
Input stream object.
Definition: inputstream.h:39
parserutils_error parserutils__filter_destroy(parserutils_filter *input)
Destroy an input filter.
Definition: filter.c:114
parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, size_t offset, size_t len)
Discard a section of a memory buffer.
Definition: buffer.c:130
size_t allocated
Definition: buffer.h:23
parserutils_filter * input
Charset conversion filter.
Definition: inputstream.c:33
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
parserutils_error(* parserutils_charset_detect_func)(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Type of charset detection function.
Definition: inputstream.h:32
const char * parserutils_inputstream_read_charset(parserutils_inputstream *stream, uint32_t *source)
Read the source charset of the input stream.
Definition: inputstream.c:292
parserutils_buffer * utf8
Buffer containing UTF-8 data.
Definition: inputstream.h:41
#define UTF8_BOM_LEN
uint32_t encsrc
Charset source.
Definition: inputstream.c:31
uint16_t mibenum
MIB enum for charset, or 0.
Definition: inputstream.c:30
parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, const uint8_t *data, size_t len)
Append data to a memory buffer.
Definition: buffer.c:72
parserutils_error parserutils__filter_create(const char *int_enc, parserutils_filter **filter)
Create an input filter.
Definition: filter.c:58
parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer)
Destroy a memory buffer.
Definition: buffer.c:53