improve compression ratios by: resetting probability each block; reducing the size...
author Geoffrey Allott <geoffrey@allott.email>
Sat, 3 Sep 2022 16:55:30 +0000 (17:55 +0100)
committer Geoffrey Allott <geoffrey@allott.email>
Sat, 3 Sep 2022 16:55:30 +0000 (17:55 +0100)
src/tANS.c

index c81fa2a4751df1d9df4e4f0d82b5d1e8f13c456e..0a24c8a8e019f56c4d62b402adc49f680eea7bf4 100644 (file)
@@ -15,7 +15,7 @@ ref: https://arxiv.org/abs/1311.2540
 #include <string.h>
 #include <unistd.h>
 
-#define MAX_BUFSZ    1048576
+#define MAX_BUFSZ    16384
 #define STREE_READSZ 16777216
 #define INIT_READSZ  1024
 #define N_SYMBOLS    256
@@ -86,12 +86,13 @@ static int stree_tANS_compress_file(FILE* input, FILE *output)
         for (i = 0; i < len; ++i) {
             ++p[0][enc_buf[i]];
             if (enc_buf[i] == 0) {
-                while (enc_buf[i] == 0) {
+                do {
                     count = 0;
-                    while (enc_buf[++i] == 0 && count < 255) ++count;
+                    while (i < len - 1 && enc_buf[++i] == 0 && count < 255) ++count;
                     ++p[1][count];
-                }
-                ++p[2][enc_buf[i]];
+                } while (i < len - 1 && enc_buf[i] == 0);
+                if (i < len)
+                    ++p[2][enc_buf[i]];
             }
         }
 
@@ -181,12 +182,13 @@ static int stree_tANS_decompress_file(FILE* input, FILE *output)
         for (i = 0; i < len; ++i) {
             ++p[0][enc_buf[i]];
             if (enc_buf[i] == 0) {
-                while (enc_buf[i] == 0) {
+                do {
                     count = 0;
-                    while (enc_buf[++i] == 0 && count < 255) ++count;
+                    while (i < len - 1 && enc_buf[++i] == 0 && count < 255) ++count;
                     ++p[1][count];
-                }
-                ++p[2][enc_buf[i]];
+                } while (i < len - 1 && enc_buf[i] == 0);
+                if (i < len)
+                    ++p[2][enc_buf[i]];
             }
         }
     }
@@ -213,7 +215,7 @@ fail:
 
 static int tANS_compress_file(FILE* input, FILE *output)
 {
-    uint32_t i, len, bits, count;
+    uint32_t i, len, bits, count, len_and_bits;
     uint8_t *read_buf;
     uint8_t *write_buf;
     double p[N_AUX][N_SYMBOLS] = {0};
@@ -221,7 +223,7 @@ static int tANS_compress_file(FILE* input, FILE *output)
     struct tANS_symbol_tbl *symbol_tbls;
     struct tANS_rl_encode_st *st;
     const uint16_t log2_tblsz = LOG2_TBLSZ;
-    uint32_t read_sz = 1024;
+    uint32_t read_sz = INIT_READSZ;
     uint32_t magic = TANS_ONLY_MAGIC;
 
     freq_tbls = malloc(sizeof(struct tANS_freq_tbl) * N_AUX);
@@ -244,21 +246,25 @@ static int tANS_compress_file(FILE* input, FILE *output)
         len = (uint32_t) fread(read_buf, 1, read_sz, input);
         if (len == 0) break;
 
+        memset(p, 0, sizeof p);
+
         for (i = 0; i < len; ++i) {
             ++p[0][read_buf[i]];
             if (read_buf[i] == 0) {
-                while (read_buf[i] == 0) {
+                do {
                     count = 0;
-                    while (read_buf[++i] == 0 && count < 255) ++count;
+                    while (i < len - 1 && read_buf[++i] == 0 && count < 255) ++count;
                     ++p[1][count];
-                }
-                ++p[2][read_buf[i]];
+                } while (i < len - 1 && read_buf[i] == 0);
+                if (i < len)
+                    ++p[2][read_buf[i]];
             }
         }
 
-        if (fwrite(&len, sizeof len, 1, output) != 1) goto fail;
-        bits = tANS_rl_encode(st, read_buf, len, write_buf);
-        if (fwrite(&bits, sizeof bits, 1, output) != 1) goto fail;
+        st->x += read_buf[len-1];
+        bits = tANS_rl_encode(st, read_buf, len - 1, write_buf);
+        len_and_bits = (len - 1) | (bits << 14);
+        if (fwrite(&len_and_bits, sizeof len_and_bits, 1, output) != 1) goto fail;
         if (fwrite(&st->x, sizeof st->x, 1, output) != 1) goto fail;
         if (fwrite(write_buf, (bits + 7) / 8, 1, output) != 1) goto fail;
 
@@ -288,7 +294,7 @@ fail:
 
 static int tANS_decompress_file(FILE* input, FILE *output)
 {
-    uint32_t i, len, bits, count;
+    uint32_t i, len, bits, count, len_and_bits;
     uint8_t *read_buf;
     uint8_t *write_buf;
     double p[N_AUX][N_SYMBOLS] = {0};
@@ -319,26 +325,31 @@ static int tANS_decompress_file(FILE* input, FILE *output)
         }
         tANS_rl_decode_st_init(st, symbol_tbls);
 
-        if (fread(&len, sizeof len, 1, input) != 1) break;
-        if (fread(&bits, sizeof bits, 1, input) != 1) goto fail;
+        if (fread(&len_and_bits, sizeof len_and_bits, 1, input) != 1) break;
+        len = (len_and_bits & ((1 << 14) - 1)) + 1;
+        bits = len_and_bits >> 14;
         if (fread(&st->x, sizeof st->x, 1, input) != 1) goto fail;
         if (fread(read_buf + 4, (bits + 7) / 8, 1, input) != 1) goto fail;
         st->x &= symbol_tbls[0].tblsz - 1;
-        bits = tANS_rl_decode(st, write_buf, len, read_buf + 4, bits);
+        bits = tANS_rl_decode(st, write_buf, len - 1, read_buf + 4, bits);
         if (bits != 0) {
             fprintf(stderr, "tANS: corrupted file\n");
             goto fail;
         }
+        write_buf[len-1] = (uint8_t) st->x;
         if (fwrite(write_buf, len, 1, output) != 1) goto fail;
+        memset(p, 0, sizeof p);
+
         for (i = 0; i < len; ++i) {
             ++p[0][write_buf[i]];
             if (write_buf[i] == 0) {
-                while (write_buf[i] == 0) {
+                do {
                     count = 0;
-                    while (write_buf[++i] == 0 && count < 255) ++count;
+                    while (i < len - 1 && write_buf[++i] == 0 && count < 255) ++count;
                     ++p[1][count];
-                }
-                ++p[2][write_buf[i]];
+                } while (i < len - 1 && write_buf[i] == 0);
+                if (i < len)
+                    ++p[2][write_buf[i]];
             }
         }
     }