From: Geoffrey Allott Date: Thu, 1 Sep 2022 21:04:45 +0000 (+0100) Subject: add entropy tool and stree-only mode; fix probability counting X-Git-Url: https://git.pointlesshacks.com/?a=commitdiff_plain;h=6c2dcc04290a2af6e7f02fa45788c1a3ae339016;p=tANS.git add entropy tool and stree-only mode; fix probability counting --- diff --git a/Makefile b/Makefile index 2c1469f..415b915 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,11 @@ OBJS = $(patsubst %.c, %.o, $(wildcard src/*.c)) TEST_OBJS = $(patsubst %.c, %.o, $(wildcard test/*.c)) TESTS = $(patsubst %.c, %, $(wildcard test/*.c)) RUN_TESTS = $(addprefix run_, $(TESTS)) +TOOLS = $(patsubst %.c, %, $(wildcard tools/*.c)) -default: $(BIN) $(RUN_TESTS) +default: $(BIN) $(RUN_TESTS) $(TOOLS) + +tools: $(TOOLS) $(RUN_TESTS): run_%: % $^ @@ -30,5 +33,6 @@ clean: rm -f $(TEST_OBJS) rm -f $(TESTS) rm -f $(BIN) + rm -f $(TOOLS) -.PHONY: default clean $(RUN_TESTS) +.PHONY: default clean tools $(RUN_TESTS) diff --git a/src/tANS.c b/src/tANS.c index e4ce4d4..bb3f1d4 100644 --- a/src/tANS.c +++ b/src/tANS.c @@ -20,19 +20,23 @@ ref: https://arxiv.org/abs/1311.2540 #include #define MAX_BUFSZ 1048576 +#define INIT_READSZ 1024 #define N_SYMBOLS 256 #define N_AUX 3 +#define LOG2_TBLSZ 12 static void usage(void) { printf( - "usage: tANS [-hcdz] [-S .suf] [file...]\n" + "usage: tANS [-hcdstz] [-S .suf] [file...]\n" "\n" "Compress the given files using suffix trees and tabled Asymmetric Numeral Systems\n" "\n" " -h - Show this help text\n" " -c - Send output to stdout\n" " -d - Decompress\n" + " -s - Perform suffix tree transformation only\n" + " -t - Perform tANS-rl (tabled Asymmetric Numeral Systems with run-length encoding) compression only\n" " -z - Compress (default)\n" "\n" " -S .suf - Use the given suffix instead of \".ans\"\n" @@ -44,49 +48,9 @@ static size_t tANS_max_compressed_size(size_t len) return len * 2; } -static void tANS_set_default_probabilities(double p[static 3][256]) -{ - int i; - - p[0][0] = 25.0; - p[0][1] = 15.0; - p[0][2] = 5.0; - p[0][3] = 4.0; - p[0][4] = 3.0; - p[0][5] = 2.0; - p[0][6] = 1.0; - - for (i = 7; i < 256; ++i) - p[0][i] = 0.0; - - p[1][0] = 40.0; - p[1][1] = 20.0; - p[1][2] = 15.0; - p[1][3] = 5.0; - p[1][4] = 4.0; - p[1][5] = 3.0; - p[1][6] = 2.0; - p[1][7] = 1.0; - - for (i = 8; i < 256; ++i) - p[1][i] = 0.0; - - p[2][0] = 0.0; - p[2][1] = 20.0; - p[2][2] = 10.0; - p[2][3] = 5.0; - p[2][4] = 4.0; - p[2][5] = 3.0; - p[2][6] = 2.0; - p[2][7] = 1.0; - - for (i = 8; i < 256; ++i) - p[0][i] = 0.0; -} - static int tANS_compress_file(FILE* input, FILE *output) { - uint32_t i, len, bits; + uint32_t i, len, bits, count; uint8_t *read_buf; uint8_t *enc_buf; uint8_t *aux_buf; @@ -95,7 +59,7 @@ static int tANS_compress_file(FILE* input, FILE *output) struct tANS_freq_tbl *freq_tbls; struct tANS_symbol_tbl *symbol_tbls; struct tANS_rl_encode_st *st; - const uint16_t log2_tblsz = 10; + const uint16_t log2_tblsz = LOG2_TBLSZ; uint32_t read_sz = 1024; uint32_t magic = TANS_MAGIC; @@ -109,8 +73,6 @@ static int tANS_compress_file(FILE* input, FILE *output) if (!freq_tbls || !symbol_tbls || !st || !read_buf || !enc_buf || !aux_buf || !write_buf) goto fail; - tANS_set_default_probabilities(p); - if (fwrite(&magic, sizeof magic, 1, output) != 1) goto fail; while (!feof(input)) { @@ -122,17 +84,19 @@ static int tANS_compress_file(FILE* input, FILE *output) len = (uint32_t) fread(read_buf, 1, read_sz, input); if (stree_encode(len, read_buf, enc_buf, aux_buf) != 0) goto fail; + for (i = 0; i < len; ++i) { ++p[0][enc_buf[i]]; if (enc_buf[i] == 0) { - size_t count = 0; - while (enc_buf[++i] == 0 && count < 255) ++count; - ++p[1][count]; - --i; - } else { + while (enc_buf[i] == 0) { + count = 0; + while (enc_buf[++i] == 0 && count < 255) ++count; + ++p[1][count]; + } ++p[2][enc_buf[i]]; } } + if (fwrite(&len, sizeof len, 1, output) != 1) goto fail; bits = tANS_rl_encode(st, enc_buf, len, write_buf); if (fwrite(&bits, sizeof bits, 1, output) != 1) goto fail; @@ -167,7 +131,7 @@ fail: static int tANS_decompress_file(FILE* input, FILE *output) { - uint32_t i, len, bits; + uint32_t i, len, bits, count; uint8_t *read_buf; uint8_t *enc_buf; uint8_t *aux_buf; @@ -176,12 +140,12 @@ static int tANS_decompress_file(FILE* input, FILE *output) struct tANS_freq_tbl *freq_tbls; struct tANS_symbol_tbl *symbol_tbls; struct tANS_rl_decode_st *st; - const uint16_t log2_tblsz = 10; + const uint16_t log2_tblsz = LOG2_TBLSZ; uint32_t magic; freq_tbls = malloc(sizeof(struct tANS_freq_tbl) * N_AUX); symbol_tbls = malloc(sizeof(struct tANS_symbol_tbl) * N_AUX); - st = malloc(sizeof(struct tANS_rl_encode_st)); + st = malloc(sizeof(struct tANS_rl_decode_st)); read_buf = malloc(tANS_max_compressed_size(MAX_BUFSZ)); enc_buf = malloc(MAX_BUFSZ); aux_buf = malloc(MAX_BUFSZ); @@ -195,8 +159,6 @@ static int tANS_decompress_file(FILE* input, FILE *output) goto fail; } - tANS_set_default_probabilities(p); - while (!feof(input)) { for (i = 0; i < N_AUX; ++i) { if (tANS_freq_tbl_init(freq_tbls + i, N_SYMBOLS, p[i], log2_tblsz) != 0) goto fail; @@ -219,11 +181,11 @@ static int tANS_decompress_file(FILE* input, FILE *output) for (i = 0; i < len; ++i) { ++p[0][enc_buf[i]]; if (enc_buf[i] == 0) { - size_t count = 0; - while (enc_buf[++i] == 0 && count < 255) ++count; - ++p[1][count]; - --i; - } else { + while (enc_buf[i] == 0) { + count = 0; + while (enc_buf[++i] == 0 && count < 255) ++count; + ++p[1][count]; + } ++p[2][enc_buf[i]]; } } @@ -249,14 +211,83 @@ fail: return -1; } +static int stree_compress_file(FILE *input, FILE *output, uint32_t read_sz) +{ + uint32_t len; + uint8_t *read_buf; + uint8_t *write_buf; + uint8_t *aux_buf; + + read_buf = malloc(read_sz); + write_buf = malloc(read_sz); + aux_buf = malloc(read_sz); + + if (!read_buf || !write_buf || !aux_buf) goto fail; + + while (!feof(input)) { + len = (uint32_t) fread(read_buf, 1, read_sz, input); + if (stree_encode(len, read_buf, write_buf, aux_buf) != 0) goto fail; + if (fwrite(write_buf, len, 1, output) != 1) goto fail; + } + + free(read_buf); + free(write_buf); + free(aux_buf); + return 0; + +fail: + free(read_buf); + free(write_buf); + free(aux_buf); + return -1; +} + +static int stree_decompress_file(FILE *input, FILE *output, uint32_t read_sz) +{ + uint32_t len; + uint8_t *read_buf; + uint8_t *write_buf; + uint8_t *aux_buf; + + read_buf = malloc(read_sz); + write_buf = malloc(read_sz); + aux_buf = malloc(read_sz); + + if (!read_buf || !write_buf || !aux_buf) goto fail; + + while (!feof(input)) { + len = (uint32_t) fread(read_buf, 1, read_sz, input); + if (stree_decode(len, read_buf, write_buf, aux_buf) != 0) goto fail; + if (fwrite(write_buf, len, 1, output) != 1) goto fail; + } + + free(read_buf); + free(write_buf); + free(aux_buf); + return 0; + +fail: + free(read_buf); + free(write_buf); + free(aux_buf); + return -1; +} + +enum mode { + mode_both, + mode_stree_only, + mode_tans_only, +}; + int main(int argc, char *argv[]) { int ret, opt, to_stdout = 0, compress = 1; + enum mode mode = mode_both; FILE *input = stdin, *output = stdout; const char *suffix = ".ans"; char outpath[1024]; - while ((opt = getopt(argc, argv, "hcdzS:")) != -1) { + while ((opt = getopt(argc, argv, "hcdstzS:")) != -1) { switch (opt) { case 'h': usage(); @@ -267,6 +298,12 @@ int main(int argc, char *argv[]) case 'd': compress = 0; break; + case 's': + mode = mode_stree_only; + break; + case 't': + mode = mode_tans_only; + break; case 'z': compress = 1; break; @@ -281,9 +318,17 @@ int main(int argc, char *argv[]) if (argc == 0) { if (compress) { - return tANS_compress_file(input, output) != 0; + if (mode == mode_stree_only) { + return stree_compress_file(input, output, MAX_BUFSZ) != 0; + } else { + return tANS_compress_file(input, output) != 0; + } } else { - return tANS_decompress_file(input, output) != 0; + if (mode == mode_stree_only) { + return stree_decompress_file(input, output, MAX_BUFSZ) != 0; + } else { + return tANS_decompress_file(input, output) != 0; + } } } else { for (; argc >= 1; --argc, ++argv) { @@ -304,9 +349,17 @@ int main(int argc, char *argv[]) } if (compress) { - ret = tANS_compress_file(input, output); + if (mode == mode_stree_only) { + ret = stree_compress_file(input, output, MAX_BUFSZ) != 0; + } else { + ret = tANS_compress_file(input, output) != 0; + } } else { - ret = tANS_decompress_file(input, output); + if (mode == mode_stree_only) { + ret = stree_decompress_file(input, output, MAX_BUFSZ) != 0; + } else { + ret = tANS_decompress_file(input, output); + } } fclose(input); diff --git a/tools/entropy.c b/tools/entropy.c new file mode 100644 index 0000000..8813e44 --- /dev/null +++ b/tools/entropy.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include +#include + +static double bit_entropy(FILE *input, int bits, int chain_len, int low_est_threshold) +{ + size_t *counts; + uint8_t *chain; + size_t i, j, k, len, index, subtotal, total; + double p, entropy; + uint8_t buf[65536]; + + counts = calloc((size_t) 1 << (bits * chain_len), sizeof(size_t)); + chain = calloc((size_t) chain_len, sizeof(uint8_t)); + + if (!counts || !chain) goto fail; + + while (!feof(input)) { + if ((len = fread(buf, 1, sizeof buf, input)) == 0 && !feof(input)) goto fail; + for (i = 0; i < len; ++i) { + for (j = 0; j < 8; j += (size_t) bits) { + chain[chain_len-1] = (uint8_t) ((buf[i] >> j) & ((1 << bits) - 1)); + for (k = 0, index = 0; k < (size_t) chain_len; ++k) { + index <<= bits; + index |= chain[k]; + } + + ++counts[index]; + + for (k = 0; k + 1 < (size_t) chain_len; ++k) + chain[k] = chain[k+1]; + } + } + } + + entropy = 0; + total = 0; + for (i = 0; i < ((size_t) 1 << (bits * (chain_len - 1))); ++i) { + for (j = 0, subtotal = 0; j < ((size_t) 1 << bits); ++j) + subtotal += counts[(i << bits) + j]; + total += subtotal; + if (subtotal < (size_t) low_est_threshold) { + entropy += (double) ((size_t) bits * subtotal); + } else { + for (j = 0; j < ((size_t) 1 << bits); ++j) { + if (counts[(i << bits) + j] > 0) { + p = (double) counts[(i << bits) + j] / (double) subtotal; + entropy -= log2(p) * (double) counts[(i << bits) + j]; + } + } + } + } + + entropy /= (double) (total * (size_t) bits); + + free(counts); + free(chain); + return entropy; + +fail: + free(counts); + free(chain); + return 0.0/0.0; +} + +static void usage(void) +{ + printf( + "usage: entropy [-h] [-b bits] [-c len] [-t threshold] [file...]\n" + "\n" + "Compress the given files using suffix trees and tabled Asymmetric Numeral Systems\n" + "\n" + " -h - Show this help text\n" + " -b - Decode the specified number of bits at a time (1, 2, 4, or (default) 8)\n" + " -c - Keep a markov chain `len' symbols long (default 1)\n" + " -t - Estimate maximal entropy if subtotal < `threshold' (default 0)\n" + ); +} + +int main(int argc, char *argv[]) +{ + int opt, bits = 8, chain_len = 1, low_est_threshold = 0; + FILE *input; + + while ((opt = getopt(argc, argv, "hb:c:t:")) != -1) { + switch (opt) { + case 'h': + usage(); + return 0; + case 'b': + bits = atoi(optarg); + break; + case 'c': + chain_len = atoi(optarg); + break; + case 't': + low_est_threshold = atoi(optarg); + break; + } + } + + if ((bits != 1 && bits != 2 && bits != 4 && bits != 8) || chain_len < 1 || low_est_threshold < 0) { + usage(); + return 1; + } + + argv += optind; + argc -= optind; + + if (argc == 0) { + printf("%f\n", bit_entropy(stdin, bits, chain_len, low_est_threshold)); + } else { + for (; argc > 0; --argc, ++argv) { + input = fopen(argv[0], "rb"); + if (!input) { + fprintf(stderr, "entropy: fopen: %s: %s\n", argv[0], strerror(errno)); + return 1; + } + printf("%s:\t%f\n", argv[0], bit_entropy(input, bits, chain_len, low_est_threshold)); + fclose(input); + } + } +}