00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include "huff_code.h"
00039 #include "ckd_alloc.h"
00040 #include "hash_table.h"
00041 #include "byteorder.h"
00042 #include "heap.h"
00043 #include "pio.h"
00044 #include "err.h"
00045
00046 #include <string.h>
00047
00048 typedef struct huff_node_s {
00049 int nbits;
00050 struct huff_node_s *l;
00051 union {
00052 int32 ival;
00053 char *sval;
00054 struct huff_node_s *r;
00055 } r;
00056 } huff_node_t;
00057
00058 typedef struct huff_codeword_s {
00059 union {
00060 int32 ival;
00061 char *sval;
00062 } r;
00063 uint32 nbits, codeword;
00064 } huff_codeword_t;
00065
00066 enum {
00067 HUFF_CODE_INT,
00068 HUFF_CODE_STR
00069 };
00070
00071 struct huff_code_s {
00072 int16 refcount;
00073 uint8 maxbits;
00074 uint8 type;
00075 uint32 *firstcode;
00076 uint32 *numl;
00077 huff_codeword_t **syms;
00078 hash_table_t *codewords;
00079 FILE *fh;
00080 bit_encode_t *be;
00081 int boff;
00082 };
00083
00084 static huff_node_t *
00085 huff_node_new_int(int32 val)
00086 {
00087 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00088 hn->r.ival = val;
00089 return hn;
00090 }
00091
00092 static huff_node_t *
00093 huff_node_new_str(char const *val)
00094 {
00095 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00096 hn->r.sval = ckd_salloc(val);
00097 return hn;
00098 }
00099
00100 static huff_node_t *
00101 huff_node_new_parent(huff_node_t *l, huff_node_t *r)
00102 {
00103 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00104 hn->l = l;
00105 hn->r.r = r;
00106
00107 if (r->nbits > l->nbits)
00108 hn->nbits = r->nbits + 1;
00109 else
00110 hn->nbits = l->nbits + 1;
00111 return hn;
00112 }
00113
00114 static void
00115 huff_node_free_int(huff_node_t *root)
00116 {
00117 if (root->l) {
00118 huff_node_free_int(root->l);
00119 huff_node_free_int(root->r.r);
00120 }
00121 ckd_free(root);
00122 }
00123
00124 static void
00125 huff_node_free_str(huff_node_t *root, int freestr)
00126 {
00127 if (root->l) {
00128 huff_node_free_str(root->l, freestr);
00129 huff_node_free_str(root->r.r, freestr);
00130 }
00131 else {
00132 if (freestr)
00133 ckd_free(root->r.sval);
00134 }
00135 ckd_free(root);
00136 }
00137
00138 static huff_node_t *
00139 huff_code_build_tree(heap_t q)
00140 {
00141 huff_node_t *root = NULL;
00142 int32 rf;
00143
00144 while (heap_size(q) > 1) {
00145 huff_node_t *l, *r, *p;
00146 int32 lf, rf;
00147
00148 heap_pop(q, (void *)&l, &lf);
00149 heap_pop(q, (void *)&r, &rf);
00150 p = huff_node_new_parent(l, r);
00151 heap_insert(q, p, lf + rf);
00152 }
00153 heap_pop(q, (void **)&root, &rf);
00154 return root;
00155 }
00156
00157 static void
00158 huff_code_canonicalize(huff_code_t *hc, huff_node_t *root)
00159 {
00160 glist_t agenda;
00161 uint32 *nextcode;
00162 int i, ncw;
00163
00164 hc->firstcode = ckd_calloc(hc->maxbits+1, sizeof(*hc->firstcode));
00165 hc->syms = ckd_calloc(hc->maxbits+1, sizeof(*hc->syms));
00166 hc->numl = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
00167 nextcode = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
00168
00169
00170
00171 root->nbits = 0;
00172 ncw = 0;
00173 agenda = glist_add_ptr(NULL, root);
00174 while (agenda) {
00175 huff_node_t *node = gnode_ptr(agenda);
00176 agenda = gnode_free(agenda, NULL);
00177 if (node->l) {
00178 node->l->nbits = node->nbits + 1;
00179 agenda = glist_add_ptr(agenda, node->l);
00180 node->r.r->nbits = node->nbits + 1;
00181 agenda = glist_add_ptr(agenda, node->r.r);
00182 }
00183 else {
00184 hc->numl[node->nbits]++;
00185 ncw++;
00186 }
00187 }
00188
00189 hc->syms[hc->maxbits] = ckd_calloc(hc->numl[hc->maxbits], sizeof(**hc->syms));
00190 for (i = hc->maxbits - 1; i > 0; --i) {
00191 hc->firstcode[i] = (hc->firstcode[i+1] + hc->numl[i+1]) / 2;
00192 hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
00193 }
00194 memcpy(nextcode, hc->firstcode, (hc->maxbits + 1) * sizeof(*nextcode));
00195
00196 hc->codewords = hash_table_new(ncw, HASH_CASE_YES);
00197 agenda = glist_add_ptr(NULL, root);
00198 while (agenda) {
00199 huff_node_t *node = gnode_ptr(agenda);
00200 agenda = gnode_free(agenda, NULL);
00201 if (node->l) {
00202 agenda = glist_add_ptr(agenda, node->l);
00203 agenda = glist_add_ptr(agenda, node->r.r);
00204 }
00205 else {
00206
00207 huff_codeword_t *cw;
00208 uint32 codeword = nextcode[node->nbits] & ((1 << node->nbits) - 1);
00209 cw = hc->syms[node->nbits] + (codeword - hc->firstcode[node->nbits]);
00210 cw->nbits = node->nbits;
00211 cw->r.sval = node->r.sval;
00212 cw->codeword = codeword;
00213 if (hc->type == HUFF_CODE_INT) {
00214 hash_table_enter_bkey(hc->codewords,
00215 (char const *)&cw->r.ival,
00216 sizeof(cw->r.ival),
00217 (void *)cw);
00218 }
00219 else {
00220 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
00221 }
00222 ++nextcode[node->nbits];
00223 }
00224 }
00225 ckd_free(nextcode);
00226 }
00227
00228 huff_code_t *
00229 huff_code_build_int(int32 const *values, int32 const *frequencies, int nvals)
00230 {
00231 huff_code_t *hc;
00232 huff_node_t *root;
00233 heap_t q;
00234 int i;
00235
00236 hc = ckd_calloc(1, sizeof(*hc));
00237 hc->refcount = 1;
00238 hc->type = HUFF_CODE_INT;
00239
00240
00241 q = heap_new();
00242 for (i = 0; i < nvals; ++i) {
00243 heap_insert(q,
00244 huff_node_new_int(values[i]),
00245 frequencies[i]);
00246 }
00247
00248
00249 root = huff_code_build_tree(q);
00250 heap_destroy(q);
00251 if (root == NULL || root->nbits > 32) {
00252 E_ERROR("Huffman trees currently limited to 32 bits\n");
00253 huff_node_free_int(root);
00254 huff_code_free(hc);
00255 return NULL;
00256 }
00257
00258
00259 hc->maxbits = root->nbits;
00260 huff_code_canonicalize(hc, root);
00261
00262
00263 huff_node_free_int(root);
00264
00265 return hc;
00266 }
00267
00268 huff_code_t *
00269 huff_code_build_str(char * const *values, int32 const *frequencies, int nvals)
00270 {
00271 huff_code_t *hc;
00272 huff_node_t *root;
00273 heap_t q;
00274 int i;
00275
00276 hc = ckd_calloc(1, sizeof(*hc));
00277 hc->refcount = 1;
00278 hc->type = HUFF_CODE_STR;
00279
00280
00281 q = heap_new();
00282 for (i = 0; i < nvals; ++i) {
00283 heap_insert(q,
00284 huff_node_new_str(values[i]),
00285 frequencies[i]);
00286 }
00287
00288
00289 root = huff_code_build_tree(q);
00290 heap_destroy(q);
00291 if (root == NULL || root->nbits > 32) {
00292 E_ERROR("Huffman trees currently limited to 32 bits\n");
00293 huff_node_free_str(root, TRUE);
00294 huff_code_free(hc);
00295 return NULL;
00296 }
00297
00298
00299 hc->maxbits = root->nbits;
00300 huff_code_canonicalize(hc, root);
00301
00302
00303 huff_node_free_str(root, FALSE);
00304
00305 return hc;
00306 }
00307
00308 huff_code_t *
00309 huff_code_read(FILE *infh)
00310 {
00311 huff_code_t *hc;
00312 int i, j;
00313
00314 hc = ckd_calloc(1, sizeof(*hc));
00315 hc->refcount = 1;
00316
00317 hc->maxbits = fgetc(infh);
00318 hc->type = fgetc(infh);
00319
00320
00321 fgetc(infh);
00322 fgetc(infh);
00323
00324
00325 hc->firstcode = ckd_calloc(hc->maxbits + 1, sizeof(*hc->firstcode));
00326 hc->numl = ckd_calloc(hc->maxbits + 1, sizeof(*hc->numl));
00327 hc->syms = ckd_calloc(hc->maxbits + 1, sizeof(*hc->syms));
00328
00329
00330 hc->codewords = hash_table_new(hc->maxbits, HASH_CASE_YES);
00331 for (i = 1; i <= hc->maxbits; ++i) {
00332 if (fread(&hc->firstcode[i], 4, 1, infh) != 1)
00333 goto error_out;
00334 SWAP_BE_32(&hc->firstcode[i]);
00335 if (fread(&hc->numl[i], 4, 1, infh) != 1)
00336 goto error_out;
00337 SWAP_BE_32(&hc->numl[i]);
00338 hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
00339 for (j = 0; j < hc->numl[i]; ++j) {
00340 huff_codeword_t *cw = &hc->syms[i][j];
00341 cw->nbits = i;
00342 cw->codeword = hc->firstcode[i] + j;
00343 if (hc->type == HUFF_CODE_INT) {
00344 if (fread(&cw->r.ival, 4, 1, infh) != 1)
00345 goto error_out;
00346 SWAP_BE_32(&cw->r.ival);
00347 hash_table_enter_bkey(hc->codewords,
00348 (char const *)&cw->r.ival,
00349 sizeof(cw->r.ival),
00350 (void *)cw);
00351 }
00352 else {
00353 size_t len;
00354 cw->r.sval = fread_line(infh, &len);
00355 cw->r.sval[len-1] = '\0';
00356 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
00357 }
00358 }
00359 }
00360
00361 return hc;
00362 error_out:
00363 huff_code_free(hc);
00364 return NULL;
00365 }
00366
00367 int
00368 huff_code_write(huff_code_t *hc, FILE *outfh)
00369 {
00370 int i, j;
00371
00372
00373 fputc(hc->maxbits, outfh);
00374
00375 fputc(hc->type, outfh);
00376
00377 fputc(0, outfh);
00378 fputc(0, outfh);
00379
00380 for (i = 1; i <= hc->maxbits; ++i) {
00381 uint32 val;
00382
00383
00384 val = hc->firstcode[i];
00385
00386 SWAP_BE_32(&val);
00387 fwrite(&val, 4, 1, outfh);
00388 val = hc->numl[i];
00389 SWAP_BE_32(&val);
00390 fwrite(&val, 4, 1, outfh);
00391
00392
00393 for (j = 0; j < hc->numl[i]; ++j) {
00394 if (hc->type == HUFF_CODE_INT) {
00395 int32 val = hc->syms[i][j].r.ival;
00396 SWAP_BE_32(&val);
00397 fwrite(&val, 4, 1, outfh);
00398 }
00399 else {
00400
00401
00402 fprintf(outfh, "%s\n", hc->syms[i][j].r.sval);
00403 }
00404 }
00405 }
00406 return 0;
00407 }
00408
00409 int
00410 huff_code_dump_codebits(FILE *dumpfh, uint32 nbits, uint32 codeword)
00411 {
00412 uint32 i;
00413
00414 for (i = 0; i < nbits; ++i)
00415 fputc((codeword & (1<<(nbits-i-1))) ? '1' : '0', dumpfh);
00416 return 0;
00417 }
00418
00419 int
00420 huff_code_dump(huff_code_t *hc, FILE *dumpfh)
00421 {
00422 int i, j;
00423
00424
00425 fprintf(dumpfh, "Maximum codeword length: %d\n", hc->maxbits);
00426 fprintf(dumpfh, "Symbols are %s\n", (hc->type == HUFF_CODE_STR) ? "strings" : "ints");
00427 fprintf(dumpfh, "Codewords:\n");
00428 for (i = 1; i <= hc->maxbits; ++i) {
00429 for (j = 0; j < hc->numl[i]; ++j) {
00430 if (hc->type == HUFF_CODE_STR)
00431 fprintf(dumpfh, "%-30s", hc->syms[i][j].r.sval);
00432 else
00433 fprintf(dumpfh, "%-30d", hc->syms[i][j].r.ival);
00434 huff_code_dump_codebits(dumpfh, hc->syms[i][j].nbits,
00435 hc->syms[i][j].codeword);
00436 fprintf(dumpfh, "\n");
00437 }
00438 }
00439 return 0;
00440 }
00441
00442 huff_code_t *
00443 huff_code_retain(huff_code_t *hc)
00444 {
00445 ++hc->refcount;
00446 return hc;
00447 }
00448
00449 int
00450 huff_code_free(huff_code_t *hc)
00451 {
00452 int i;
00453
00454 if (hc == NULL)
00455 return 0;
00456 if (--hc->refcount > 0)
00457 return hc->refcount;
00458 for (i = 0; i <= hc->maxbits; ++i) {
00459 int j;
00460 for (j = 0; j < hc->numl[i]; ++j) {
00461 if (hc->type == HUFF_CODE_STR)
00462 ckd_free(hc->syms[i][j].r.sval);
00463 }
00464 ckd_free(hc->syms[i]);
00465 }
00466 ckd_free(hc->firstcode);
00467 ckd_free(hc->numl);
00468 ckd_free(hc->syms);
00469 hash_table_free(hc->codewords);
00470 ckd_free(hc);
00471 return 0;
00472 }
00473
00474 FILE *
00475 huff_code_attach(huff_code_t *hc, FILE *fh, char const *mode)
00476 {
00477 FILE *oldfh = huff_code_detach(hc);
00478
00479 hc->fh = fh;
00480 if (mode[0] == 'w')
00481 hc->be = bit_encode_attach(hc->fh);
00482 return oldfh;
00483 }
00484
00485 FILE *
00486 huff_code_detach(huff_code_t *hc)
00487 {
00488 FILE *oldfh = hc->fh;
00489
00490 if (hc->be) {
00491 bit_encode_flush(hc->be);
00492 bit_encode_free(hc->be);
00493 hc->be = NULL;
00494 }
00495 hc->fh = NULL;
00496 return oldfh;
00497 }
00498
00499 int
00500 huff_code_encode_int(huff_code_t *hc, int32 sym, uint32 *outcw)
00501 {
00502 huff_codeword_t *cw;
00503
00504 if (hash_table_lookup_bkey(hc->codewords,
00505 (char const *)&sym,
00506 sizeof(sym),
00507 (void **)&cw) < 0)
00508 return 0;
00509 if (hc->be)
00510 bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
00511 if (outcw) *outcw = cw->codeword;
00512 return cw->nbits;
00513 }
00514
00515 int
00516 huff_code_encode_str(huff_code_t *hc, char const *sym, uint32 *outcw)
00517 {
00518 huff_codeword_t *cw;
00519
00520 if (hash_table_lookup(hc->codewords,
00521 sym,
00522 (void **)&cw) < 0)
00523 return 0;
00524 if (hc->be)
00525 bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
00526 if (outcw) *outcw = cw->codeword;
00527 return cw->nbits;
00528 }
00529
00530 static huff_codeword_t *
00531 huff_code_decode_data(huff_code_t *hc, char const **inout_data,
00532 size_t *inout_data_len, int *inout_offset)
00533 {
00534 char const *data = *inout_data;
00535 char const *end = data + *inout_data_len;
00536 int offset = *inout_offset;
00537 uint32 cw;
00538 int cwlen;
00539 int byte;
00540
00541 if (data == end)
00542 return NULL;
00543 byte = *data++;
00544 cw = !!(byte & (1 << (7-offset++)));
00545 cwlen = 1;
00546
00547 while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
00548 ++cwlen;
00549 cw <<= 1;
00550 if (offset > 7) {
00551 if (data == end)
00552 return NULL;
00553 byte = *data++;
00554 offset = 0;
00555 }
00556 cw |= !!(byte & (1 << (7-offset++)));
00557
00558 }
00559 if (cwlen > hc->maxbits)
00560 return NULL;
00561
00562
00563 if (offset < 8)
00564 --data;
00565 else
00566 offset = 0;
00567
00568
00569 *inout_data_len = end - data;
00570 *inout_data = data;
00571 *inout_offset = offset;
00572 return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
00573 }
00574
00575 static huff_codeword_t *
00576 huff_code_decode_fh(huff_code_t *hc)
00577 {
00578 uint32 cw;
00579 int cwlen;
00580 int byte;
00581
00582 if ((byte = fgetc(hc->fh)) == EOF)
00583 return NULL;
00584 cw = !!(byte & (1 << (7-hc->boff++)));
00585 cwlen = 1;
00586
00587 while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
00588 ++cwlen;
00589 cw <<= 1;
00590 if (hc->boff > 7) {
00591 if ((byte = fgetc(hc->fh)) == EOF)
00592 return NULL;
00593 hc->boff = 0;
00594 }
00595 cw |= !!(byte & (1 << (7-hc->boff++)));
00596
00597 }
00598 if (cwlen > hc->maxbits)
00599 return NULL;
00600
00601
00602 if (hc->boff < 8)
00603 ungetc(byte, hc->fh);
00604 else
00605 hc->boff = 0;
00606
00607
00608 return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
00609 }
00610
00611 int
00612 huff_code_decode_int(huff_code_t *hc, int *outval,
00613 char const **inout_data,
00614 size_t *inout_data_len, int *inout_offset)
00615 {
00616 huff_codeword_t *cw;
00617
00618 if (inout_data)
00619 cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
00620 else if (hc->fh)
00621 cw = huff_code_decode_fh(hc);
00622 else
00623 return -1;
00624
00625 if (cw == NULL)
00626 return -1;
00627 if (outval)
00628 *outval = cw->r.ival;
00629
00630 return 0;
00631 }
00632
00633 char const *
00634 huff_code_decode_str(huff_code_t *hc,
00635 char const **inout_data,
00636 size_t *inout_data_len, int *inout_offset)
00637 {
00638 huff_codeword_t *cw;
00639
00640 if (inout_data)
00641 cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
00642 else if (hc->fh)
00643 cw = huff_code_decode_fh(hc);
00644 else
00645 return NULL;
00646
00647 if (cw == NULL)
00648 return NULL;
00649
00650 return cw->r.sval;
00651 }