Libparserutils
codec_ascii.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
13 
15 #include "utils/endian.h"
16 #include "utils/utils.h"
17 
21 typedef struct charset_ascii_codec {
24 #define READ_BUFSIZE (8)
25  uint32_t read_buf[READ_BUFSIZE];
28  size_t read_len;
30 #define WRITE_BUFSIZE (8)
34  size_t write_len;
37 
38 static bool charset_ascii_codec_handles_charset(const char *charset);
40  const char *charset, parserutils_charset_codec **codec);
45  const uint8_t **source, size_t *sourcelen,
46  uint8_t **dest, size_t *destlen);
49  const uint8_t **source, size_t *sourcelen,
50  uint8_t **dest, size_t *destlen);
55  const uint8_t **source, size_t *sourcelen,
56  uint8_t **dest, size_t *destlen);
59  uint32_t ucs4, uint8_t **dest, size_t *destlen);
61  uint32_t ucs4, uint8_t **s, size_t *len);
63  const uint8_t *s, size_t len, uint32_t *ucs4);
64 
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  NUL-terminated charset name to test. NULL is treated as
 *                 "not handled" rather than being passed to strlen().
 * \return true if the name resolves to the same non-zero MIB enum value as
 *         "US-ASCII", false otherwise.
 */
bool charset_ascii_codec_handles_charset(const char *charset)
{
	/* Cached MIB enum for "US-ASCII". It starts at 0 and the lookup is
	 * repeated on every call for as long as it remains 0. */
	static uint16_t ascii;
	uint16_t match;

	if (charset == NULL)
		return false;

	match = parserutils_charset_mibenum_from_name(charset,
			strlen(charset));

	if (ascii == 0) {
		ascii = parserutils_charset_mibenum_from_name(
				"US-ASCII", SLEN("US-ASCII"));
	}

	/* A zero enum never matches, so an unresolved "US-ASCII" lookup
	 * cannot accidentally equal an unresolved input name. */
	return ascii != 0 && ascii == match;
}
87 
99 {
101 
102  UNUSED(charset);
103 
104  c = malloc(sizeof(charset_ascii_codec));
105  if (c == NULL)
106  return PARSERUTILS_NOMEM;
107 
108  c->read_buf[0] = 0;
109  c->read_len = 0;
110 
111  c->write_buf[0] = 0;
112  c->write_len = 0;
113 
114  /* Finally, populate vtable */
119 
120  *codec = (parserutils_charset_codec *) c;
121 
122  return PARSERUTILS_OK;
123 }
124 
132 {
133  UNUSED(codec);
134 
135  return PARSERUTILS_OK;
136 }
137 
166  const uint8_t **source, size_t *sourcelen,
167  uint8_t **dest, size_t *destlen)
168 {
170  uint32_t ucs4;
171  uint32_t *towrite;
172  size_t towritelen;
173  parserutils_error error;
174 
175  /* Process any outstanding characters from the previous call */
176  if (c->write_len > 0) {
177  uint32_t *pwrite = c->write_buf;
178 
179  while (c->write_len > 0) {
180  error = charset_ascii_from_ucs4(c, pwrite[0],
181  dest, destlen);
182  if (error != PARSERUTILS_OK) {
183  uint32_t len;
184  assert(error == PARSERUTILS_NOMEM);
185 
186  for (len = 0; len < c->write_len; len++) {
187  c->write_buf[len] = pwrite[len];
188  }
189 
190  return error;
191  }
192 
193  pwrite++;
194  c->write_len--;
195  }
196  }
197 
198  /* Now process the characters for this call */
199  while (*sourcelen > 0) {
200  ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
201  towrite = &ucs4;
202  towritelen = 1;
203 
204  /* Output current characters */
205  while (towritelen > 0) {
206  error = charset_ascii_from_ucs4(c, towrite[0], dest,
207  destlen);
208  if (error != PARSERUTILS_OK) {
209  uint32_t len;
210  if (error != PARSERUTILS_NOMEM) {
211  return error;
212  }
213 
214  /* Insufficient output space */
215  assert(towritelen < WRITE_BUFSIZE);
216 
217  c->write_len = towritelen;
218 
219  /* Copy pending chars to save area, for
220  * processing next call. */
221  for (len = 0; len < towritelen; len++)
222  c->write_buf[len] = towrite[len];
223 
224  /* Claim character we've just buffered,
225  * so it's not reprocessed */
226  *source += 4;
227  *sourcelen -= 4;
228 
229  return PARSERUTILS_NOMEM;
230  }
231 
232  towrite++;
233  towritelen--;
234  }
235 
236  *source += 4;
237  *sourcelen -= 4;
238  }
239 
240  return PARSERUTILS_OK;
241 }
242 
285  const uint8_t **source, size_t *sourcelen,
286  uint8_t **dest, size_t *destlen)
287 {
289  parserutils_error error;
290 
291  if (c->read_len > 0) {
292  /* Output left over from last decode */
293  uint32_t *pread = c->read_buf;
294 
295  while (c->read_len > 0 && *destlen >= c->read_len * 4) {
296  *((uint32_t *) (void *) *dest) =
297  endian_host_to_big(pread[0]);
298 
299  *dest += 4;
300  *destlen -= 4;
301 
302  pread++;
303  c->read_len--;
304  }
305 
306  if (*destlen < c->read_len * 4) {
307  /* Ran out of output buffer */
308  size_t i;
309 
310  /* Shuffle remaining output down */
311  for (i = 0; i < c->read_len; i++)
312  c->read_buf[i] = pread[i];
313 
314  return PARSERUTILS_NOMEM;
315  }
316  }
317 
318  /* Finally, the "normal" case; process all outstanding characters */
319  while (*sourcelen > 0) {
321  source, sourcelen, dest, destlen);
322  if (error != PARSERUTILS_OK) {
323  return error;
324  }
325  }
326 
327  return PARSERUTILS_OK;
328 }
329 
337 {
339 
340  c->read_buf[0] = 0;
341  c->read_len = 0;
342 
343  c->write_buf[0] = 0;
344  c->write_len = 0;
345 
346  return PARSERUTILS_OK;
347 }
348 
349 
379  const uint8_t **source, size_t *sourcelen,
380  uint8_t **dest, size_t *destlen)
381 {
382  uint32_t ucs4;
383  parserutils_error error;
384 
385  /* Convert a single character */
386  error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4);
387  if (error == PARSERUTILS_OK) {
388  /* Read a character */
390  ucs4, dest, destlen);
391  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
392  /* output succeeded; update source pointers */
393  *source += 1;
394  *sourcelen -= 1;
395  }
396 
397  return error;
398  } else if (error == PARSERUTILS_NEEDDATA) {
399  /* Can only happen if sourcelen == 0 */
400  return error;
401  } else if (error == PARSERUTILS_INVALID) {
402  /* Illegal input sequence */
403 
404  /* Strict errormode; simply flag invalid character */
405  if (c->base.errormode ==
407  return PARSERUTILS_INVALID;
408  }
409 
410  /* output U+FFFD and continue processing. */
412  0xFFFD, dest, destlen);
413  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
414  /* output succeeded; update source pointers */
415  *source += 1;
416  *sourcelen -= 1;
417  }
418 
419  return error;
420  }
421 
422  return PARSERUTILS_OK;
423 }
424 
437  uint32_t ucs4, uint8_t **dest, size_t *destlen)
438 {
439  if (*destlen < 4) {
440  /* Run out of output buffer */
441  c->read_len = 1;
442  c->read_buf[0] = ucs4;
443 
444  return PARSERUTILS_NOMEM;
445  }
446 
447  *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
448  *dest += 4;
449  *destlen -= 4;
450 
451  return PARSERUTILS_OK;
452 }
453 
471  uint32_t ucs4, uint8_t **s, size_t *len)
472 {
473  uint8_t out = 0;
474 
475  if (*len < 1)
476  return PARSERUTILS_NOMEM;
477 
478  if (ucs4 < 0x80) {
479  /* ASCII */
480  out = ucs4;
481  } else {
483  return PARSERUTILS_INVALID;
484  else
485  out = '?';
486  }
487 
488  *(*s) = out;
489  (*s)++;
490  (*len)--;
491 
492  return PARSERUTILS_OK;
493 }
494 
507  const uint8_t *s, size_t len, uint32_t *ucs4)
508 {
509  uint32_t out;
510 
511  UNUSED(c);
512 
513  if (len < 1)
514  return PARSERUTILS_NEEDDATA;
515 
516  if (*s < 0x80) {
517  out = *s;
518  } else {
519  return PARSERUTILS_INVALID;
520  }
521 
522  *ucs4 = out;
523 
524  return PARSERUTILS_OK;
525 }
526 
530 };
531 
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
size_t len
Definition: codec_8859.c:23
static parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into US-ASCII.
Definition: codec_ascii.c:165
static parserutils_error charset_ascii_codec_output_decoded_char(charset_ascii_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_ascii.c:435
static parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from US-ASCII to UCS-4 (big endian)
Definition: codec_ascii.c:378
static parserutils_error charset_ascii_codec_destroy(parserutils_charset_codec *codec)
Destroy a US-ASCII codec.
Definition: codec_ascii.c:131
static parserutils_error charset_ascii_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a US-ASCII codec.
Definition: codec_ascii.c:97
#define READ_BUFSIZE
Definition: codec_ascii.c:24
static parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of US-ASCII data into UCS-4 (big endian)
Definition: codec_ascii.c:284
struct charset_ascii_codec charset_ascii_codec
US-ASCII charset codec.
static parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
Clear a US-ASCII codec's encoding state.
Definition: codec_ascii.c:336
static parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to US-ASCII.
Definition: codec_ascii.c:470
static bool charset_ascii_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_ascii.c:71
const parserutils_charset_handler charset_ascii_codec_handler
Definition: codec_ascii.c:527
static parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert a US-ASCII character to UCS4 (host endian)
Definition: codec_ascii.c:506
#define WRITE_BUFSIZE
Definition: codec_ascii.c:30
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
parserutils_error
Definition: errors.h:18
@ PARSERUTILS_OK
Definition: errors.h:19
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
@ PARSERUTILS_INVALID
Definition: errors.h:23
@ PARSERUTILS_NOMEM
Definition: errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
US-ASCII charset codec.
Definition: codec_ascii.c:21
size_t read_len
Character length of read_buf.
Definition: codec_ascii.c:28
size_t write_len
Character length of write_buf.
Definition: codec_ascii.c:34
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_ascii.c:25
parserutils_charset_codec base
Base class.
Definition: codec_ascii.c:22
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_ascii.c:31
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
Codec factory component definition.
Definition: codec_impl.h:39
#define UNUSED(x)
Definition: utils.h:25
#define SLEN(s)
Definition: utils.h:21