diff options
Diffstat (limited to 'encode.c')
-rw-r--r-- | encode.c | 260 |
1 files changed, 260 insertions, 0 deletions
diff --git a/encode.c b/encode.c new file mode 100644 index 0000000..9139c06 --- /dev/null +++ b/encode.c @@ -0,0 +1,260 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <assert.h> + +typedef struct node { + bool is_char; + char ch; + size_t references; + + // to help when saving tree + struct node *left; + struct node *right; + + // to help encode + int direction; // 0 - left; 1 - right + struct node *parent; +} node_t; + +void node_char_init(node_t *node, size_t ref, char ch); +void node_link_init(node_t *node, node_t* left, node_t* right); + +// void liner_insert(void base[.size * .nmemb + 1], +// size_t nmemb, size_t size, void *el, +// int (*compar)(const void [.size], const void [.size])); +void liner_insert(void *base, size_t nmemb, size_t size, void *el, int (*compar)(const void *, const void *)); +int compare_nodes(const void *p1, const void *p2); + +void free_encoding_table(); +void wb_flush(size_t nmemb); + +void gen_tree(FILE *fp); +size_t write_tree_shape(node_t *root); +size_t write_tree_data(node_t *root); + +size_t write_encoded(node_t *root); +void encode(FILE *fp); + + +size_t POOL_CAP = 512; +node_t *pool; +size_t pool_sz; + +size_t WRITE_BUF_CAP = 512; +char *write_buf; + +node_t *encoding_table[256] = {0}; + +void encode(FILE *fp) +{ + // write tree // + wb_flush(1); // flush amount of bits (written in gen_tree()) + wb_flush(write_tree_shape(&pool[pool_sz-1])/8 + 1); + wb_flush(write_tree_data(&pool[pool_sz-1])); + + // encode // + char ch; + while((ch = fgetc(fp)) != EOF) { + if(encoding_table[ch] == NULL) { + fprintf(stderr, "Character %c not found encoding table\n", ch); + exit(1); + } + write_encoded(encoding_table[ch]); + } + wb_flush(write_encoded(NULL)/8+1); +} + +int main(int argc, char **argv) +{ + if(argc != 2) { + fprintf(stderr, "ERROR: Not enough command line arguments\n"); + exit(1); + } + + write_buf = malloc(sizeof(char) * WRITE_BUF_CAP); + pool = malloc(sizeof(node_t) * POOL_CAP); + + FILE *fp = fopen(argv[1], "rb"); + if(!fp) { + fprintf(stderr, "ERROR: Could not open file %s\n", argv[1]); + exit(1); + } + + gen_tree(fp); + encode(fp); + + free_encoding_table(); + free(pool); + free(write_buf); + + fclose(fp); + return 0; +} + +void gen_tree(FILE *fp) +{ + node_t *nodes[256]; + size_t nodes_sz = 0; + + size_t char_counts[256] = {0}; + char mem[1024] = {0}; size_t sz; + + // load char counts + while((sz = fread(mem, sizeof(char), 1024, fp)) != 0) + { + for(size_t i = 0; i < sz; i++) { + char_counts[(size_t)mem[i]]++; + } + } + rewind(fp); + + // make the tree + for(int i = 0; i < 256; i++) { + if(char_counts[i] <= 0) continue; + + nodes[nodes_sz] = malloc(sizeof(node_t)); + node_char_init(nodes[nodes_sz], char_counts[i], (char)i); + encoding_table[i] = nodes[nodes_sz]; + nodes_sz++; + } + + qsort(nodes, nodes_sz, sizeof(node_t *), compare_nodes); + + // save the amount of bits + write_buf[0] = nodes_sz * 2 - 1; + + // pair last 2 elements and do linear sort + for(pool_sz = 0; nodes_sz != 1; nodes_sz--, pool_sz++) + { + if(pool_sz >= POOL_CAP) { + fprintf(stderr, "ERROR: POOL_CAP reached, please re-run the program with a bigger POOL_CAP\n"); + exit(1); + } + + node_link_init(&pool[pool_sz], nodes[nodes_sz-1], nodes[nodes_sz-2]); + liner_insert(nodes, nodes_sz-2, sizeof(node_t *), &pool[pool_sz], compare_nodes); + } +} + +size_t write_encoded(node_t *root) +{ + static size_t bits = 0; + if(root == NULL || root->parent == NULL) return bits; + + write_buf[bits/8] |= root->direction << (bits % 8); + bits++; + + if(bits/8 == WRITE_BUF_CAP) { + wb_flush(WRITE_BUF_CAP); + bits = 0; + } + + write_encoded(root->parent); + return bits; +} + +size_t write_tree_shape(node_t *root) +{ + static size_t bits = 0; + if(root == NULL) return bits; + + write_buf[bits/8] |= root->is_char << (8 - (bits%8)); + bits++; + + if(bits/8 == WRITE_BUF_CAP) { + wb_flush(WRITE_BUF_CAP); + bits = 0; + } + + write_tree_shape(root->left); + write_tree_shape(root->right); + + return bits; +} + +size_t write_tree_data(node_t *root) +{ + static size_t n = 0; + if(root == NULL) return n; + + if(root->is_char) { + if(n == WRITE_BUF_CAP) { + wb_flush(WRITE_BUF_CAP); + n = 0; + } + write_buf[n++] = root->ch; + } + + write_tree_data(root->left); + write_tree_data(root->right); + + return n; +} + +void wb_flush(size_t nmemb) +{ + fwrite(write_buf, sizeof(char), nmemb, stdout); + memset(write_buf, 0, sizeof(char) * WRITE_BUF_CAP); +} + +void free_encoding_table() +{ + for(int i = 0; i < 256; i++) { + if(encoding_table[i] != NULL) { + free(encoding_table[i]); + } + } +} + +int compare_nodes(const void *p1, const void *p2) +{ + return (*(node_t **)p2)->references - (*(node_t **)p1)->references; +} + +void liner_insert(void *base, size_t nmemb, size_t size, void *el, int (*compar)(const void *, const void *)) +{ + for(size_t i = 0; i < nmemb; i++) + { + void *cur = base + (i * size); + if(compar(cur, &el) < 0) continue; + for(size_t j = nmemb-1; j >= i; j--) + { + if(j == (size_t)-1) break; //integer underflow + + void *src = base + (j * size); + void *dest = src + size; + memcpy(dest, src, size); + } + + memcpy(cur, &el, size); + return; + } + + memcpy(base + (nmemb * size), &el, size); +} + +void node_char_init(node_t *node, size_t ref, char ch) +{ + node->is_char = true; + node->ch = ch; + node->references = ref; + node->parent = NULL; + node->left = NULL; + node->right = NULL; +} + +void node_link_init(node_t *node, node_t* left, node_t* right) +{ + node->is_char = false; + node->references = left->references + right->references; + node->parent = NULL; + node->left = left; + node->right = right; + + left->parent = node; + right->parent = node; + left->direction = 0; + right->direction = 1; +} |