Libparserutils
Data Structures | Macros | Typedefs | Functions | Variables
codec_8859.c File Reference
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
#include "charset/codecs/8859_tables.h"

Go to the source code of this file.

Data Structures

struct  charset_8859_codec
 ISO-8859-n charset codec. More...
 

Macros

#define READ_BUFSIZE   (8)
 
#define WRITE_BUFSIZE   (8)
 

Typedefs

typedef struct charset_8859_codec charset_8859_codec
 ISO-8859-n charset codec. More...
 

Functions

static bool charset_8859_codec_handles_charset (const char *charset)
 Determine whether this codec handles a specific charset. More...
 
static parserutils_error charset_8859_codec_create (const char *charset, parserutils_charset_codec **codec)
 Create an ISO-8859-n codec. More...
 
static parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
 Destroy an ISO-8859-n codec. More...
 
static parserutils_error charset_8859_codec_encode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
 Encode a chunk of UCS-4 (big endian) data into ISO-8859-n. More...
 
static parserutils_error charset_8859_codec_decode (parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
 Decode a chunk of ISO-8859-n data into UCS-4 (big endian) More...
 
static parserutils_error charset_8859_codec_reset (parserutils_charset_codec *codec)
 Clear an ISO-8859-n codec's encoding state. More...
 
static parserutils_error charset_8859_codec_read_char (charset_8859_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
 Read an ISO-8859-n character and output it as UCS-4 (big endian). More...
 
static parserutils_error charset_8859_codec_output_decoded_char (charset_8859_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
 Output a UCS-4 character (big endian) More...
 
static parserutils_error charset_8859_from_ucs4 (charset_8859_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
 Convert a UCS4 (host endian) character to ISO-8859-n. More...
 
static parserutils_error charset_8859_to_ucs4 (charset_8859_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
 Convert an ISO-8859-n character to UCS4 (host endian) More...
 

Variables

struct {
   uint16_t   mib
 
   const char *   name
 
   size_t   len
 
   uint32_t *   table
 
} known_charsets []
 
const parserutils_charset_handler charset_8859_codec_handler
 

Macro Definition Documentation

◆ READ_BUFSIZE

#define READ_BUFSIZE   (8)

Definition at line 51 of file codec_8859.c.

◆ WRITE_BUFSIZE

#define WRITE_BUFSIZE   (8)

Definition at line 57 of file codec_8859.c.

Typedef Documentation

◆ charset_8859_codec

ISO-8859-n charset codec.

Function Documentation

◆ charset_8859_codec_create()

parserutils_error charset_8859_codec_create ( const char *  charset,
parserutils_charset_codec **  codec 
)
static

◆ charset_8859_codec_decode()

parserutils_error charset_8859_codec_decode ( parserutils_charset_codec *  codec,
const uint8_t **  source,
size_t *  sourcelen,
uint8_t **  dest,
size_t *  destlen 
)
static

Decode a chunk of ISO-8859-n data into UCS-4 (big endian)

Parameters
codec: The codec to use
source: Pointer to pointer to source data
sourcelen: Pointer to length (in bytes) of source data
dest: Pointer to pointer to output buffer
destlen: Pointer to length (in bytes) of output buffer
Returns
PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the output buffer is too small, PARSERUTILS_INVALID if a character cannot be represented and the codec's error handling mode is set to STRICT.

On exit, ::source will point immediately after the last input character read, if the result is _OK or _NOMEM. Any remaining output for the character will be buffered by the codec for writing on the next call.

If the result is _INVALID, ::source will point at the last input character read (rather than after it); nothing will be written or buffered for the failed character. It is up to the client to fix the cause of the failure and retry the decoding process.

Note that, if failure occurs whilst attempting to write any output buffered by the last call, then ::source and ::sourcelen will remain unchanged (as nothing more has been read).

If STRICT error handling is configured and an illegal sequence is split over two calls, then _INVALID will be returned from the second call, but ::source will point mid-way through the invalid sequence (i.e. it will be unmodified over the second call). In addition, the internal incomplete-sequence buffer will be emptied, such that subsequent calls will progress, rather than re-evaluating the same invalid sequence.

::sourcelen will be reduced appropriately on exit.

::dest will point immediately after the last character written.

::destlen will be reduced appropriately on exit.

Call this with a source length of 0 to flush the output buffer.

Definition at line 330 of file codec_8859.c.

References charset_8859_codec_read_char(), endian_host_to_big(), PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_8859_codec::read_buf, and charset_8859_codec::read_len.

Referenced by charset_8859_codec_create().

◆ charset_8859_codec_destroy()

parserutils_error charset_8859_codec_destroy ( parserutils_charset_codec *  codec)
static

Destroy an ISO-8859-n codec.

Parameters
codec: The codec to destroy
Returns
PARSERUTILS_OK on success, appropriate error otherwise

Definition at line 177 of file codec_8859.c.

References PARSERUTILS_OK, and UNUSED.

Referenced by charset_8859_codec_create().

◆ charset_8859_codec_encode()

parserutils_error charset_8859_codec_encode ( parserutils_charset_codec *  codec,
const uint8_t **  source,
size_t *  sourcelen,
uint8_t **  dest,
size_t *  destlen 
)
static

Encode a chunk of UCS-4 (big endian) data into ISO-8859-n.

Parameters
codec: The codec to use
source: Pointer to pointer to source data
sourcelen: Pointer to length (in bytes) of source data
dest: Pointer to pointer to output buffer
destlen: Pointer to length (in bytes) of output buffer
Returns
PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the output buffer is too small, PARSERUTILS_INVALID if a character cannot be represented and the codec's error handling mode is set to STRICT.

On exit, ::source will point immediately after the last input character read. Any remaining output for the character will be buffered by the codec for writing on the next call.

Note that, if failure occurs whilst attempting to write any output buffered by the last call, then ::source and ::sourcelen will remain unchanged (as nothing more has been read).

::sourcelen will be reduced appropriately on exit.

::dest will point immediately after the last character written.

::destlen will be reduced appropriately on exit.

Definition at line 211 of file codec_8859.c.

References charset_8859_from_ucs4(), endian_big_to_host(), len, PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_8859_codec::write_buf, WRITE_BUFSIZE, and charset_8859_codec::write_len.

Referenced by charset_8859_codec_create().

◆ charset_8859_codec_handles_charset()

bool charset_8859_codec_handles_charset ( const char *  charset)
static

Determine whether this codec handles a specific charset.

Parameters
charset: Charset to test
Returns
true if handleable, false otherwise

Definition at line 98 of file codec_8859.c.

References known_charsets, len, mib, N_ELEMENTS, name, and parserutils_charset_mibenum_from_name().

◆ charset_8859_codec_output_decoded_char()

parserutils_error charset_8859_codec_output_decoded_char ( charset_8859_codec *  c,
uint32_t  ucs4,
uint8_t **  dest,
size_t *  destlen 
)
inline static

Output a UCS-4 character (big endian)

Parameters
c: Codec to use
ucs4: UCS-4 character (host endian)
dest: Pointer to pointer to output buffer
destlen: Pointer to output buffer length
Returns
PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the output buffer is too small.

Definition at line 481 of file codec_8859.c.

References endian_host_to_big(), PARSERUTILS_NOMEM, PARSERUTILS_OK, charset_8859_codec::read_buf, and charset_8859_codec::read_len.

Referenced by charset_8859_codec_read_char().

◆ charset_8859_codec_read_char()

parserutils_error charset_8859_codec_read_char ( charset_8859_codec *  c,
const uint8_t **  source,
size_t *  sourcelen,
uint8_t **  dest,
size_t *  destlen 
)
inline static

Read an ISO-8859-n character and output it as UCS-4 (big endian).

Parameters
c: The codec
source: Pointer to pointer to source buffer (updated on exit)
sourcelen: Pointer to length of source buffer (updated on exit)
dest: Pointer to pointer to output buffer (updated on exit)
destlen: Pointer to length of output buffer (updated on exit)
Returns
PARSERUTILS_OK on success, PARSERUTILS_NOMEM if the output buffer is too small, PARSERUTILS_INVALID if a character cannot be represented and the codec's error handling mode is set to STRICT.

On exit, ::source will point immediately after the last input character read, if the result is _OK or _NOMEM. Any remaining output for the character will be buffered by the codec for writing on the next call.

If the result is _INVALID, ::source will point at the last input character read (rather than after it); nothing will be written or buffered for the failed character. It is up to the client to fix the cause of the failure and retry the decoding process.

::sourcelen will be reduced appropriately on exit.

::dest will point immediately after the last character written.

::destlen will be reduced appropriately on exit.

Definition at line 424 of file codec_8859.c.

References charset_8859_codec::base, charset_8859_codec_output_decoded_char(), charset_8859_to_ucs4(), parserutils_charset_codec::errormode, PARSERUTILS_CHARSET_CODEC_ERROR_STRICT, PARSERUTILS_INVALID, PARSERUTILS_NEEDDATA, PARSERUTILS_NOMEM, and PARSERUTILS_OK.

Referenced by charset_8859_codec_decode().

◆ charset_8859_codec_reset()

parserutils_error charset_8859_codec_reset ( parserutils_charset_codec *  codec)
static

Clear an ISO-8859-n codec's encoding state.

Parameters
codec: The codec to reset
Returns
PARSERUTILS_OK on success, appropriate error otherwise

Definition at line 382 of file codec_8859.c.

References PARSERUTILS_OK, charset_8859_codec::read_buf, charset_8859_codec::read_len, charset_8859_codec::write_buf, and charset_8859_codec::write_len.

Referenced by charset_8859_codec_create().

◆ charset_8859_from_ucs4()

parserutils_error charset_8859_from_ucs4 ( charset_8859_codec *  c,
uint32_t  ucs4,
uint8_t **  s,
size_t *  len 
)
inline static

Convert a UCS4 (host endian) character to ISO-8859-n.

Parameters
c: The codec instance
ucs4: The UCS4 character to convert
s: Pointer to pointer to destination buffer
len: Pointer to destination buffer length
Returns
PARSERUTILS_OK on success, PARSERUTILS_NOMEM if there's insufficient space in the output buffer, PARSERUTILS_INVALID if the character cannot be represented

_INVALID will only be returned if the codec's conversion mode is STRICT. Otherwise, '?' will be output.

On successful conversion, *s and *len will be updated.

Definition at line 515 of file codec_8859.c.

References charset_8859_codec::base, parserutils_charset_codec::errormode, len, PARSERUTILS_CHARSET_CODEC_ERROR_STRICT, PARSERUTILS_INVALID, PARSERUTILS_NOMEM, PARSERUTILS_OK, and charset_8859_codec::table.

Referenced by charset_8859_codec_encode().

◆ charset_8859_to_ucs4()

parserutils_error charset_8859_to_ucs4 ( charset_8859_codec *  c,
const uint8_t *  s,
size_t  len,
uint32_t *  ucs4 
)
inline static

Convert an ISO-8859-n character to UCS4 (host endian)

Parameters
c: The codec instance
s: Pointer to source buffer
len: Source buffer length
ucs4: Pointer to destination buffer
Returns
PARSERUTILS_OK on success, PARSERUTILS_NEEDDATA if there's insufficient input data, PARSERUTILS_INVALID if the character cannot be represented.

Definition at line 563 of file codec_8859.c.

References len, PARSERUTILS_INVALID, PARSERUTILS_NEEDDATA, PARSERUTILS_OK, and charset_8859_codec::table.

Referenced by charset_8859_codec_read_char().

Variable Documentation

◆ charset_8859_codec_handler

const parserutils_charset_handler charset_8859_codec_handler
Initial value:
= {
charset_8859_codec_handles_charset,
charset_8859_codec_create
}
static bool charset_8859_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_8859.c:98
static parserutils_error charset_8859_codec_create(const char *charset, parserutils_charset_codec **codec)
Create an ISO-8859-n codec.
Definition: codec_8859.c:130

Definition at line 587 of file codec_8859.c.

◆ known_charsets

struct { ... } known_charsets[]
Initial value:
= {
{ 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
{ 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
{ 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
{ 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
{ 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
{ 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
{ 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
{ 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
{ 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
{ 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
{ 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
{ 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
{ 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
{ 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
{ 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
}
#define SLEN(s)
Definition: utils.h:21
static uint32_t t16[96]
Definition: 8859_tables.h:226
static uint32_t t7[96]
Definition: 8859_tables.h:106
static uint32_t t11[96]
Definition: 8859_tables.h:166
static uint32_t t4[96]
Definition: 8859_tables.h:61
static uint32_t t8[96]
Definition: 8859_tables.h:121
static uint32_t t14[96]
Definition: 8859_tables.h:196
static uint32_t t1[96]
Definition: 8859_tables.h:16
static uint32_t t3[96]
Definition: 8859_tables.h:46
static uint32_t t9[96]
Definition: 8859_tables.h:136
static uint32_t t6[96]
Definition: 8859_tables.h:91
static uint32_t t2[96]
Definition: 8859_tables.h:31
static uint32_t t13[96]
Definition: 8859_tables.h:181
static uint32_t t10[96]
Definition: 8859_tables.h:151
static uint32_t t5[96]
Definition: 8859_tables.h:76
static uint32_t t15[96]
Definition: 8859_tables.h:211

Referenced by charset_8859_codec_create(), and charset_8859_codec_handles_charset().

◆ len

size_t len

◆ mib

uint16_t mib

Definition at line 21 of file codec_8859.c.

Referenced by charset_8859_codec_create(), and charset_8859_codec_handles_charset().

◆ name

const char* name

Definition at line 22 of file codec_8859.c.

Referenced by charset_8859_codec_handles_charset().

◆ table

uint32_t* table

Definition at line 24 of file codec_8859.c.

Referenced by charset_8859_codec_create().