#include <math.h>
#define MAX_BUFSZ 1048576
+#define INIT_READSZ 1024
#define N_SYMBOLS 256
#define N_AUX 3
+#define LOG2_TBLSZ 12
static void usage(void)
{
printf(
- "usage: tANS [-hcdz] [-S .suf] [file...]\n"
+ "usage: tANS [-hcdstz] [-S .suf] [file...]\n"
"\n"
"Compress the given files using suffix trees and tabled Asymmetric Numeral Systems\n"
"\n"
" -h - Show this help text\n"
" -c - Send output to stdout\n"
" -d - Decompress\n"
+ " -s - Perform suffix tree transformation only\n"
+ " -t - Perform tANS-rl (tabled Asymmetric Numeral Systems with run-length encoding) compression only\n"
" -z - Compress (default)\n"
"\n"
" -S .suf - Use the given suffix instead of \".ans\"\n"
return len * 2;
}
-static void tANS_set_default_probabilities(double p[static 3][256])
-{
- int i;
-
- p[0][0] = 25.0;
- p[0][1] = 15.0;
- p[0][2] = 5.0;
- p[0][3] = 4.0;
- p[0][4] = 3.0;
- p[0][5] = 2.0;
- p[0][6] = 1.0;
-
- for (i = 7; i < 256; ++i)
- p[0][i] = 0.0;
-
- p[1][0] = 40.0;
- p[1][1] = 20.0;
- p[1][2] = 15.0;
- p[1][3] = 5.0;
- p[1][4] = 4.0;
- p[1][5] = 3.0;
- p[1][6] = 2.0;
- p[1][7] = 1.0;
-
- for (i = 8; i < 256; ++i)
- p[1][i] = 0.0;
-
- p[2][0] = 0.0;
- p[2][1] = 20.0;
- p[2][2] = 10.0;
- p[2][3] = 5.0;
- p[2][4] = 4.0;
- p[2][5] = 3.0;
- p[2][6] = 2.0;
- p[2][7] = 1.0;
-
- for (i = 8; i < 256; ++i)
- p[0][i] = 0.0;
-}
-
static int tANS_compress_file(FILE* input, FILE *output)
{
- uint32_t i, len, bits;
+ uint32_t i, len, bits, count;
uint8_t *read_buf;
uint8_t *enc_buf;
uint8_t *aux_buf;
struct tANS_freq_tbl *freq_tbls;
struct tANS_symbol_tbl *symbol_tbls;
struct tANS_rl_encode_st *st;
- const uint16_t log2_tblsz = 10;
+ const uint16_t log2_tblsz = LOG2_TBLSZ;
uint32_t read_sz = 1024;
uint32_t magic = TANS_MAGIC;
if (!freq_tbls || !symbol_tbls || !st || !read_buf || !enc_buf || !aux_buf || !write_buf) goto fail;
- tANS_set_default_probabilities(p);
-
if (fwrite(&magic, sizeof magic, 1, output) != 1) goto fail;
while (!feof(input)) {
len = (uint32_t) fread(read_buf, 1, read_sz, input);
if (stree_encode(len, read_buf, enc_buf, aux_buf) != 0) goto fail;
+
for (i = 0; i < len; ++i) {
++p[0][enc_buf[i]];
if (enc_buf[i] == 0) {
- size_t count = 0;
- while (enc_buf[++i] == 0 && count < 255) ++count;
- ++p[1][count];
- --i;
- } else {
+ while (enc_buf[i] == 0) {
+ count = 0;
+ while (enc_buf[++i] == 0 && count < 255) ++count;
+ ++p[1][count];
+ }
++p[2][enc_buf[i]];
}
}
+
if (fwrite(&len, sizeof len, 1, output) != 1) goto fail;
bits = tANS_rl_encode(st, enc_buf, len, write_buf);
if (fwrite(&bits, sizeof bits, 1, output) != 1) goto fail;
static int tANS_decompress_file(FILE* input, FILE *output)
{
- uint32_t i, len, bits;
+ uint32_t i, len, bits, count;
uint8_t *read_buf;
uint8_t *enc_buf;
uint8_t *aux_buf;
struct tANS_freq_tbl *freq_tbls;
struct tANS_symbol_tbl *symbol_tbls;
struct tANS_rl_decode_st *st;
- const uint16_t log2_tblsz = 10;
+ const uint16_t log2_tblsz = LOG2_TBLSZ;
uint32_t magic;
freq_tbls = malloc(sizeof(struct tANS_freq_tbl) * N_AUX);
symbol_tbls = malloc(sizeof(struct tANS_symbol_tbl) * N_AUX);
- st = malloc(sizeof(struct tANS_rl_encode_st));
+ st = malloc(sizeof(struct tANS_rl_decode_st));
read_buf = malloc(tANS_max_compressed_size(MAX_BUFSZ));
enc_buf = malloc(MAX_BUFSZ);
aux_buf = malloc(MAX_BUFSZ);
goto fail;
}
- tANS_set_default_probabilities(p);
-
while (!feof(input)) {
for (i = 0; i < N_AUX; ++i) {
if (tANS_freq_tbl_init(freq_tbls + i, N_SYMBOLS, p[i], log2_tblsz) != 0) goto fail;
for (i = 0; i < len; ++i) {
++p[0][enc_buf[i]];
if (enc_buf[i] == 0) {
- size_t count = 0;
- while (enc_buf[++i] == 0 && count < 255) ++count;
- ++p[1][count];
- --i;
- } else {
+ while (enc_buf[i] == 0) {
+ count = 0;
+ while (enc_buf[++i] == 0 && count < 255) ++count;
+ ++p[1][count];
+ }
++p[2][enc_buf[i]];
}
}
return -1;
}
+/*
+ * Run only the suffix tree transformation over `input`, writing the
+ * transformed bytes to `output`.
+ *
+ * The input is consumed in blocks of at most `read_sz` bytes.  Each block is
+ * handed to stree_encode() and the `len` bytes it leaves in the write buffer
+ * are written out unchanged in size.
+ *
+ * Returns 0 on success (including empty input) and -1 on allocation failure,
+ * read error, stree_encode() failure or short write.
+ */
+static int stree_compress_file(FILE *input, FILE *output, uint32_t read_sz)
+{
+	int ret = -1;
+	uint32_t len;
+	uint8_t *read_buf;
+	uint8_t *write_buf;
+	uint8_t *aux_buf;
+
+	read_buf = malloc(read_sz);
+	write_buf = malloc(read_sz);
+	aux_buf = malloc(read_sz);
+
+	if (!read_buf || !write_buf || !aux_buf) goto done;
+
+	for (;;) {
+		len = (uint32_t) fread(read_buf, 1, read_sz, input);
+		if (len == 0) {
+			/*
+			 * A zero-length read is either end of input or an error.
+			 * It must not reach fwrite(): writing zero bytes returns 0
+			 * and would be mistaken for a failed write.
+			 */
+			if (ferror(input)) goto done;
+			break;
+		}
+		if (stree_encode(len, read_buf, write_buf, aux_buf) != 0) goto done;
+		if (fwrite(write_buf, 1, len, output) != len) goto done;
+	}
+
+	ret = 0;
+
+done:
+	/* free(NULL) is a no-op, so partial allocation needs no special case. */
+	free(read_buf);
+	free(write_buf);
+	free(aux_buf);
+	return ret;
+}
+
+/*
+ * Undo the suffix tree transformation: read `input` in blocks of at most
+ * `read_sz` bytes, pass each block to stree_decode() and write the `len`
+ * bytes it leaves in the write buffer to `output`.
+ *
+ * Returns 0 on success (including empty input) and -1 on allocation failure,
+ * read error, stree_decode() failure or short write.
+ */
+static int stree_decompress_file(FILE *input, FILE *output, uint32_t read_sz)
+{
+	int ret = -1;
+	uint32_t len;
+	uint8_t *read_buf;
+	uint8_t *write_buf;
+	uint8_t *aux_buf;
+
+	read_buf = malloc(read_sz);
+	write_buf = malloc(read_sz);
+	aux_buf = malloc(read_sz);
+
+	if (!read_buf || !write_buf || !aux_buf) goto done;
+
+	for (;;) {
+		len = (uint32_t) fread(read_buf, 1, read_sz, input);
+		if (len == 0) {
+			/*
+			 * A zero-length read is either end of input or an error.
+			 * It must not reach fwrite(): writing zero bytes returns 0
+			 * and would be mistaken for a failed write.
+			 */
+			if (ferror(input)) goto done;
+			break;
+		}
+		if (stree_decode(len, read_buf, write_buf, aux_buf) != 0) goto done;
+		if (fwrite(write_buf, 1, len, output) != len) goto done;
+	}
+
+	ret = 0;
+
+done:
+	/* free(NULL) is a no-op, so partial allocation needs no special case. */
+	free(read_buf);
+	free(write_buf);
+	free(aux_buf);
+	return ret;
+}
+
+/*
+ * Processing stages selected on the command line.  main() starts out with
+ * mode_both; the -s option switches to mode_stree_only and the -t option
+ * to mode_tans_only.
+ */
+enum mode {
+	mode_both = 0,
+	mode_stree_only = 1,
+	mode_tans_only = 2,
+};
+
int main(int argc, char *argv[])
{
int ret, opt, to_stdout = 0, compress = 1;
+ enum mode mode = mode_both;
FILE *input = stdin, *output = stdout;
const char *suffix = ".ans";
char outpath[1024];
- while ((opt = getopt(argc, argv, "hcdzS:")) != -1) {
+ while ((opt = getopt(argc, argv, "hcdstzS:")) != -1) {
switch (opt) {
case 'h':
usage();
case 'd':
compress = 0;
break;
+ case 's':
+ mode = mode_stree_only;
+ break;
+ case 't':
+ mode = mode_tans_only;
+ break;
case 'z':
compress = 1;
break;
if (argc == 0) {
if (compress) {
- return tANS_compress_file(input, output) != 0;
+ if (mode == mode_stree_only) {
+ return stree_compress_file(input, output, MAX_BUFSZ) != 0;
+ } else {
+ return tANS_compress_file(input, output) != 0;
+ }
} else {
- return tANS_decompress_file(input, output) != 0;
+ if (mode == mode_stree_only) {
+ return stree_decompress_file(input, output, MAX_BUFSZ) != 0;
+ } else {
+ return tANS_decompress_file(input, output) != 0;
+ }
}
} else {
for (; argc >= 1; --argc, ++argv) {
}
if (compress) {
- ret = tANS_compress_file(input, output);
+ if (mode == mode_stree_only) {
+ ret = stree_compress_file(input, output, MAX_BUFSZ) != 0;
+ } else {
+ ret = tANS_compress_file(input, output) != 0;
+ }
} else {
- ret = tANS_decompress_file(input, output);
+ if (mode == mode_stree_only) {
+ ret = stree_decompress_file(input, output, MAX_BUFSZ) != 0;
+ } else {
+ ret = tANS_decompress_file(input, output);
+ }
}
fclose(input);
--- /dev/null
+#include <errno.h>
+#include <getopt.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Estimate the entropy of `input` under a Markov model over `bits`-bit symbols.
+ *
+ * Each byte is split into symbols of `bits` bits, starting from the least
+ * significant bits.  Occurrences are counted per window of `chain_len`
+ * consecutive symbols, so the first `chain_len` - 1 symbols of a window act
+ * as the context for the last one.  A context seen fewer than
+ * `low_est_threshold` times is charged the maximum of `bits` bits per symbol
+ * instead of its measured entropy.
+ *
+ * Returns the estimate normalised to bits of entropy per input bit.  Returns
+ * NaN if the parameters are unusable, an allocation or read fails, or the
+ * input is empty (the normalisation is then 0/0).
+ */
+static double bit_entropy(FILE *input, int bits, int chain_len, int low_est_threshold)
+{
+	size_t *counts = NULL;
+	uint8_t *chain = NULL;
+	size_t i, j, k, len, index, subtotal, total;
+	size_t n_symbols, n_contexts;
+	double p, entropy;
+	uint8_t buf[65536];
+
+	/* bits <= 0 would never advance the per-byte loop below. */
+	if (bits < 1 || bits > 8 || chain_len < 1) goto fail;
+
+	/*
+	 * The table has 2^(bits * chain_len) entries.  Shifting by the width of
+	 * size_t or more is undefined, so reject such parameters up front.  The
+	 * test is phrased as a division so the product cannot overflow.
+	 * (uint8_t only exists where bytes have 8 bits, hence sizeof * 8.)
+	 */
+	if ((size_t) chain_len > (sizeof(size_t) * 8 - 1) / (size_t) bits) goto fail;
+
+	n_symbols = (size_t) 1 << bits;
+	n_contexts = (size_t) 1 << (bits * (chain_len - 1));
+
+	counts = calloc(n_contexts * n_symbols, sizeof(size_t));
+	chain = calloc((size_t) chain_len, sizeof(uint8_t));
+
+	if (!counts || !chain) goto fail;
+
+	while (!feof(input)) {
+		if ((len = fread(buf, 1, sizeof buf, input)) == 0 && !feof(input)) goto fail;
+		for (i = 0; i < len; ++i) {
+			for (j = 0; j < 8; j += (size_t) bits) {
+				/* The newest symbol goes in the last slot of the window. */
+				chain[chain_len-1] = (uint8_t) ((buf[i] >> j) & ((1 << bits) - 1));
+
+				/* Pack the window into a table index, oldest symbol highest. */
+				for (k = 0, index = 0; k < (size_t) chain_len; ++k) {
+					index <<= bits;
+					index |= chain[k];
+				}
+
+				++counts[index];
+
+				/* Slide the window one symbol to the left. */
+				for (k = 0; k + 1 < (size_t) chain_len; ++k)
+					chain[k] = chain[k+1];
+			}
+		}
+	}
+
+	entropy = 0;
+	total = 0;
+	for (i = 0; i < n_contexts; ++i) {
+		/* All counts sharing context i are adjacent in the table. */
+		for (j = 0, subtotal = 0; j < n_symbols; ++j)
+			subtotal += counts[(i << bits) + j];
+		total += subtotal;
+		if (subtotal < (size_t) low_est_threshold) {
+			/* Too few samples to trust: assume incompressible. */
+			entropy += (double) ((size_t) bits * subtotal);
+		} else {
+			for (j = 0; j < n_symbols; ++j) {
+				if (counts[(i << bits) + j] > 0) {
+					p = (double) counts[(i << bits) + j] / (double) subtotal;
+					entropy -= log2(p) * (double) counts[(i << bits) + j];
+				}
+			}
+		}
+	}
+
+	/* Normalise by the number of input bits counted. */
+	entropy /= (double) (total * (size_t) bits);
+
+	free(counts);
+	free(chain);
+	return entropy;
+
+fail:
+	free(counts);
+	free(chain);
+	return NAN;
+}
+
+/* Print the command-line help for the entropy tool to standard output. */
+static void usage(void)
+{
+	printf(
+		"usage: entropy [-h] [-b bits] [-c len] [-t threshold] [file...]\n"
+		"\n"
+		"Estimate the entropy of the given files (or standard input), in bits per input bit\n"
+		"\n"
+		" -h - Show this help text\n"
+		" -b - Decode the specified number of bits at a time (1, 2, 4, or (default) 8)\n"
+		" -c - Keep a markov chain `len' symbols long (default 1)\n"
+		" -t - Estimate maximal entropy if subtotal < `threshold' (default 0)\n"
+	);
+}
+
+/*
+ * Entry point of the entropy tool.
+ *
+ * Parses -b / -c / -t, validates them, then prints the bit_entropy() estimate
+ * for standard input (no file arguments) or for each named file in turn.
+ *
+ * Exits with 0 on success and 1 on a usage error or if a file cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+	int opt, bits = 8, chain_len = 1, low_est_threshold = 0;
+	FILE *input;
+
+	while ((opt = getopt(argc, argv, "hb:c:t:")) != -1) {
+		switch (opt) {
+		case 'h':
+			usage();
+			return 0;
+		case 'b':
+			bits = atoi(optarg);
+			break;
+		case 'c':
+			chain_len = atoi(optarg);
+			break;
+		case 't':
+			low_est_threshold = atoi(optarg);
+			break;
+		default:
+			/*
+			 * getopt() returns '?' for an unknown option or a missing
+			 * option argument; stop rather than run with defaults.
+			 */
+			usage();
+			return 1;
+		}
+	}
+
+	/* atoi() yields 0 for non-numeric text, which these range checks reject
+	 * for -b and -c. */
+	if ((bits != 1 && bits != 2 && bits != 4 && bits != 8) || chain_len < 1 || low_est_threshold < 0) {
+		usage();
+		return 1;
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	if (argc == 0) {
+		printf("%f\n", bit_entropy(stdin, bits, chain_len, low_est_threshold));
+	} else {
+		for (; argc > 0; --argc, ++argv) {
+			input = fopen(argv[0], "rb");
+			if (!input) {
+				fprintf(stderr, "entropy: fopen: %s: %s\n", argv[0], strerror(errno));
+				return 1;
+			}
+			printf("%s:\t%f\n", argv[0], bit_entropy(input, bits, chain_len, low_est_threshold));
+			fclose(input);
+		}
+	}
+
+	return 0;
+}