From: Geoffrey Allott Date: Tue, 23 Aug 2022 05:48:31 +0000 (+0100) Subject: first impl of decoding X-Git-Url: https://git.pointlesshacks.com/?a=commitdiff_plain;h=a5c5d037bc218cf5bc9a9c9a69f1cc4c2d1dfc59;p=tANS.git first impl of decoding --- diff --git a/src/stree.c b/src/stree.c index 66f9544..7e5a0f8 100644 --- a/src/stree.c +++ b/src/stree.c @@ -3,6 +3,7 @@ #include #include #include +#include struct node; @@ -47,56 +48,69 @@ static void node_add_son(struct node *self, struct node *son) son->brother = brother; } -static void node_split_son(struct node *self, uint8_t edge, size_t len, struct node *split, struct node *son, const uint8_t *str) +static void node_split_son(struct node *self, struct node *edge, struct node *brother, struct node *split, struct node *son, size_t len) { - struct node *node, *prev; - - for (prev = (struct node *) 0, node = self->son; node; prev = node, node = node->brother) { - if (str[node->from] == edge) { - split->from = node->from; - split->to = node->from + len; - split->brother = node->brother; - split->son = node; - split->link = (struct node *) 0; - - node->from = node->from + len; - node->brother = son; - - if (prev) - prev->brother = split; - else - self->son = split; - - return; - } - } -} + split->from = edge->from; + split->to = edge->from + len; + split->brother = edge->brother; + split->son = edge; + split->link = (struct node *) 0; -static bool node_edge_present(const struct node *self, const uint8_t *str, size_t i) -{ - const struct node *node; + edge->from = edge->from + len; + edge->brother = son; - for (node = self->son; node; node = node->brother) - if (str[node->from] == str[i]) - return true; - return false; + if (brother) + brother->brother = split; + else + self->son = split; } -static bool node_edge_next_present(const struct node *self, size_t len, const uint8_t *str, size_t i) +static struct node *node_edge_inv(struct node *self, uint8_t edge, const uint8_t *str, struct node **brother, bool seen[], uint8_t *code, bool record_seen) { - return str[self->from+len] == str[i]; -} + struct node *node; -static bool node_edge_end(const struct node *self, size_t len) -{ - return self && self->from + len >= self->to; + printf("calling node_edge_inv\n"); + + if (record_seen) { + for (node = self->son; node; node = node->brother) { + if (!seen[str[node->from]]) { + seen[str[node->from]] = true; + if (*code == 0) { + edge = str[node->from]; + break; + } + --*code; + printf("decrementing code to %d\n", *code); + } + } + if (!node) { + return (struct node *) 0; + } + } + + for (*brother = (struct node *) 0, node = self->son; node; *brother = node, node = node->brother) + if (str[node->from] == edge) + return node; + + return (struct node *) 0; } -static struct node *node_edge(struct node *self, uint8_t edge, const uint8_t *str) +static struct node *node_edge(struct node *self, uint8_t edge, const uint8_t *str, struct node **brother, bool seen[], uint8_t *code, bool record_seen) { struct node *node; - for (node = self->son; node; node = node->brother) + if (record_seen) { + for (node = self->son; node; node = node->brother) { + if (str[node->from] == edge) + break; + if (!seen[str[node->from]]) { + seen[str[node->from]] = true; + ++*code; + } + } + } + + for (*brother = (struct node *) 0, node = self->son; node; *brother = node, node = node->brother) if (str[node->from] == edge) return node; @@ -105,13 +119,12 @@ static struct node *node_edge(struct node *self, uint8_t edge, const uint8_t *st static bool node_validate_suffix(struct node *self, size_t len, const uint8_t *str, size_t i) { - struct node *node; + struct node *node, *brother; size_t j; if (len == 0) return true; - if (!node_edge_present(self, str, i)) return false; - - node = node_edge(self, str[i], str); + node = node_edge(self, str[i], str, &brother, (bool *) 0, (uint8_t *) 0, false); + if (!node) return false; for (j = node->from; j < node->to && len > 0; --len, ++j, ++i) { if (str[j] != str[i]) return false; } @@ -121,8 +134,9 @@ static bool node_validate_suffix(struct node *self, size_t len, const uint8_t *s static bool node_validate_suffixes(struct node *self, size_t len, const uint8_t *str, size_t i) { if (len == 0) return true; - if (!node_validate_suffixes(self, len - 1, str, i + 1)) return false; - return node_validate_suffix(self, len, str, i); + for (; len; ++i, --len) + if (!node_validate_suffix(self, len, str, i)) return false; + return true; } static size_t stree_max_size(size_t len) @@ -137,16 +151,20 @@ int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux) struct node *prev; struct node *active_node; struct node *active_edge = (struct node *) 0; + struct node *brother = (struct node *) 0; + struct node *new_leaf; size_t active_len = 0; size_t rem = 0; size_t i; size_t n; - bool cont; + bool present; + bool seen[256]; nodes = (struct node *) malloc(sizeof(struct node) * stree_max_size(len)); if (!nodes) return -1; root = nodes + 0; node_init(root, (size_t) -1); + root->link = root; n = 1; active_node = root; @@ -154,43 +172,45 @@ int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux) aux[i] = (size_t) (active_node - root); prev = (struct node *) 0; ++rem; - cont = false; - while (rem > 0 && !cont) { - if (active_len == 0) { - if (node_edge_present(active_node, in, i)) - cont = true; - else { + memset(seen, 0, sizeof seen); + out[i] = 0; + active_edge = node_edge(active_node, in[i-active_len], in, &brother, seen, out + i, active_len == 0); + do { + present = active_edge && in[active_edge->from+active_len] == in[i]; + if (present) { + ++active_len; + } else { + if (!active_edge) { node_init(nodes + n, i); node_add_son(active_node, nodes + n); - ++n; - --rem; - } - } else if (active_len > 0) { - if (node_edge_next_present(active_edge, active_len, in, i)) - cont = true; - else { + new_leaf = nodes + n; + ++n, --rem; + } else { + printf("seen? %d\n", seen[in[active_edge->from+active_len]]); + if (!seen[in[active_edge->from+active_len]]) { + seen[in[active_edge->from+active_len]] = true; + ++out[i]; + } node_init(nodes + n + 1, i); - node_split_son(active_node, in[active_edge->from], active_len, nodes + n, nodes + n + 1, in); + node_split_son(active_node, active_edge, brother, nodes + n, nodes + n + 1, active_len); if (prev) prev->link = nodes + n; - prev = nodes + n; - n += 2; - --rem; - if (active_node == root) { - --active_len; - } + prev = nodes + n, n += 2, --rem; + if (active_node == root) --active_len; + new_leaf = (struct node *) 0; } - } - if (!cont && active_node != root) { active_node = active_node->link ? active_node->link : (active_len = rem - 1, root); + active_edge = node_edge(active_node, in[i-active_len], in, &brother, seen, out + i, active_len == 0 && active_node->son != new_leaf); } - active_edge = node_edge(active_node, in[i-active_len], in); - if (cont) ++active_len; - while (node_edge_end(active_edge, active_len)) { + while (active_edge && active_edge->from + active_len >= active_edge->to) { active_node = active_edge; active_len -= active_node->to - active_node->from; - active_edge = node_edge(active_node, in[i-active_len], in); + active_edge = node_edge(active_node, in[i-active_len], in, &brother, seen, out + i, active_len == 0 && !present); } - } + } while (rem > 0 && !present); + if (!present) + for (int a = 0; a < in[i]; ++a) + if (!seen[a]) + ++out[i]; } #ifndef NDEBUG @@ -198,9 +218,100 @@ int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux) free(nodes); return -1; } + + //node_dbg(root, root, 0, in, len); +#endif + + free(nodes); + return 0; +} + +int stree_decode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux) +{ + struct node *nodes; + struct node *root; + struct node *prev; + struct node *active_node; + struct node *active_edge = (struct node *) 0; + struct node *brother = (struct node *) 0; + struct node *new_leaf; + size_t active_len = 0; + size_t rem = 0; + size_t i, n; + bool present; + bool seen[256]; + uint8_t code, a; + + nodes = (struct node *) malloc(sizeof(struct node) * stree_max_size(len)); + if (!nodes) return -1; + root = nodes + 0; + node_init(root, (size_t) -1); + root->link = root; + n = 1; + active_node = root; + + for (i = 0; i < len; ++i) { + aux[i] = (size_t) (active_node - root); + prev = (struct node *) 0; + ++rem; + memset(seen, 0, sizeof seen); + code = in[i]; + printf("i = %d - decoding %d\n", (int) i, code); + + active_edge = node_edge_inv(active_node, out[i-active_len], out, &brother, seen, &code, active_len == 0); + do { + printf("loop iteration: active_node = %d, active_edge = %d, active_len = %d\n", (int) (active_node - root), (int) (active_edge - root), (int) active_len); + present = active_edge && code == 0; + if (present) { + out[i] = out[active_edge->from+active_len]; + printf("setting out[%d] = %c\n", (int) i, (char) out[active_edge->from+active_len]); + ++active_len; + } else { + if (!active_edge) { + node_init(nodes + n, i); + node_add_son(active_node, nodes + n); + new_leaf = nodes + n; + ++n, --rem; + } else { + printf("seen? %d\n", seen[out[active_edge->from+active_len]]); + if (!seen[out[active_edge->from+active_len]]) { + seen[out[active_edge->from+active_len]] = true; + --code; + } + node_init(nodes + n + 1, i); + node_split_son(active_node, active_edge, brother, nodes + n, nodes + n + 1, active_len); + if (prev) prev->link = nodes + n; + prev = nodes + n, n += 2, --rem; + if (active_node == root) --active_len; + new_leaf = (struct node *) 0; + } + active_node = active_node->link ? active_node->link : (active_len = rem - 1, root); + active_edge = node_edge_inv(active_node, out[i-active_len], out, &brother, seen, &code, active_len == 0 && active_node->son != new_leaf); + printf("!setting active_edge = %d, active_len = %d\n", (int) (active_edge - root), (int) active_len); + } + while (active_edge && active_edge->from + active_len >= active_edge->to) { + active_node = active_edge; + active_len -= active_node->to - active_node->from; + active_edge = node_edge_inv(active_node, out[i-active_len], out, &brother, seen, &code, active_len == 0 && !present); + } + } while (rem > 0 && !present); + if (!present) + for (a = 0;; ++a) + if (!seen[a] && code-- == 0) { + out[i] = a; + break; + } + } + +#ifndef NDEBUG + if (!node_validate_suffixes(root, len, out, 0)) { + free(nodes); + return -1; + } + + node_dbg(root, root, 0, out, len); #endif - (void) out; // TODO free(nodes); return 0; } diff --git a/src/stree.h b/src/stree.h index fc4a686..ddff5e7 100644 --- a/src/stree.h +++ b/src/stree.h @@ -4,3 +4,4 @@ #include int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux); +int stree_decode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux); diff --git a/test/test_stree.c b/test/test_stree.c index 6b03f95..893b3c0 100644 --- a/test/test_stree.c +++ b/test/test_stree.c @@ -2,7 +2,7 @@ #include "stree.h" -enum test_result test_stree_encode_empty(void) +static enum test_result test_stree_encode_empty(void) { const uint8_t *in = (const uint8_t *) ""; uint8_t out[1]; @@ -11,18 +11,16 @@ enum test_result test_stree_encode_empty(void) return TEST_SUCCESS; } -enum test_result test_stree_encode_simple(void) +static enum test_result test_stree_encode_simple(void) { const uint8_t *in = (const uint8_t *) "abc"; uint8_t out[3]; size_t aux[3]; ASSERT_EQ(0, stree_encode(3, in, out, aux)); - /* - ASSERT_EQ(0, out[0]); - ASSERT_EQ(1, out[1]); - ASSERT_EQ(2, out[2]); - */ + ASSERT_EQ('a', out[0]); + ASSERT_EQ('b', out[1]); + ASSERT_EQ('c', out[2]); ASSERT_EQ(0, aux[0]); ASSERT_EQ(0, aux[1]); @@ -31,7 +29,7 @@ enum test_result test_stree_encode_simple(void) return TEST_SUCCESS; } -enum test_result test_stree_encode_nontrivial(void) +static enum test_result test_stree_encode_nontrivial(void) { const uint8_t *in = (const uint8_t *) "abaaa"; uint8_t out[5]; @@ -55,7 +53,7 @@ enum test_result test_stree_encode_nontrivial(void) return TEST_SUCCESS; } -enum test_result test_stree_encode_so_example(void) +static enum test_result test_stree_encode_so_example(void) { const uint8_t *in = (const uint8_t *) "abcabxabcd"; uint8_t out[10]; @@ -89,7 +87,7 @@ enum test_result test_stree_encode_so_example(void) return TEST_SUCCESS; } -enum test_result test_stree_tricky_suffix_link(void) +static enum test_result test_stree_tricky_suffix_link(void) { const uint8_t *in = (const uint8_t *) "cdddcdc"; uint8_t out[7]; @@ -117,7 +115,7 @@ enum test_result test_stree_tricky_suffix_link(void) return TEST_SUCCESS; } -enum test_result test_stree_minimal(void) +static enum test_result test_stree_minimal(void) { const uint8_t *in = (const uint8_t *) "abcdeabacacabb"; uint8_t out[14]; @@ -127,7 +125,7 @@ enum test_result test_stree_minimal(void) return TEST_SUCCESS; } -enum test_result test_stree_minimal_2(void) +static enum test_result test_stree_minimal_2(void) { const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccacc"; uint8_t out[24]; @@ -137,7 +135,7 @@ enum test_result test_stree_minimal_2(void) return TEST_SUCCESS; } -enum test_result test_stree_minimal_3(void) +static enum test_result test_stree_minimal_3(void) { const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccaccccaa"; uint8_t out[28]; @@ -147,7 +145,7 @@ enum test_result test_stree_minimal_3(void) return TEST_SUCCESS; } -enum test_result test_stree_minimal_4(void) +static enum test_result test_stree_minimal_4(void) { const uint8_t *in = (const uint8_t *) "dbbcaccbdbdbde"; uint8_t out[14]; @@ -157,7 +155,54 @@ enum test_result test_stree_minimal_4(void) return TEST_SUCCESS; } -enum test_result test_stree_long(void) +static enum test_result test_stree_minimal_5(void) +{ + const uint8_t *in = (const uint8_t *) "acaaeabdecd"; + uint8_t enc[11]; + uint8_t dec[11]; + size_t aux1[11]; + size_t aux2[11]; + ASSERT_EQ(0, stree_encode(11, in, enc, aux1)); + ASSERT_EQ(0, stree_decode(11, enc, dec, aux2)); + + ASSERT_EQ(in[0], dec[0]); + ASSERT_EQ(in[1], dec[1]); + ASSERT_EQ(in[2], dec[2]); + ASSERT_EQ(in[3], dec[3]); + ASSERT_EQ(in[4], dec[4]); + ASSERT_EQ(in[5], dec[5]); + ASSERT_EQ(in[6], dec[6]); + ASSERT_EQ(in[7], dec[7]); + ASSERT_EQ(in[8], dec[8]); + ASSERT_EQ(in[9], dec[9]); + ASSERT_EQ(in[10], dec[10]); + + ASSERT_EQ(aux1[0], aux2[0]); + ASSERT_EQ(aux1[1], aux2[1]); + ASSERT_EQ(aux1[2], aux2[2]); + ASSERT_EQ(aux1[3], aux2[3]); + ASSERT_EQ(aux1[4], aux2[4]); + ASSERT_EQ(aux1[5], aux2[5]); + ASSERT_EQ(aux1[6], aux2[6]); + ASSERT_EQ(aux1[7], aux2[7]); + ASSERT_EQ(aux1[8], aux2[8]); + ASSERT_EQ(aux1[9], aux2[9]); + ASSERT_EQ(aux1[10], aux2[10]); + + return TEST_SUCCESS; +} + +static enum test_result test_stree_repeating(void) +{ + const uint8_t *in = (const uint8_t *) "abcabcabcabcabcdabcabcabcabcabcdababababab"; + uint8_t out[42]; + size_t aux[42]; + ASSERT_EQ(0, stree_encode(42, in, out, aux)); + + return TEST_SUCCESS; +} + +static enum test_result test_stree_long(void) { const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccaccccaabbbaababdadbaccabbdadbadbabaccacbbbcbadddbdababddddddabdabddddddabbbbccc"; uint8_t out[100]; @@ -167,7 +212,7 @@ enum test_result test_stree_long(void) return TEST_SUCCESS; } -enum test_result test_stree_very_long(void) +static enum test_result test_stree_very_long(void) { const uint8_t *in = (const uint8_t *) "acaaeabdecdcebcadbebddeaebeceacbbbaeeedbddeedcbdaaeddbadabaedeae" @@ -1195,15 +1240,112 @@ enum test_result test_stree_very_long(void) "ceddaeabcbbbadcbdbeaaedcceaeddbbbebeabaedebccceeebeeabacdabcddcd" "dcddcbddcdadaadbbadeedabadbaaeeeeacbaeacdecabcbdccecdededdadddec" ; - uint8_t out[65536]; - size_t aux[65536]; - size_t i; + uint8_t enc[65536]; + uint8_t dec[65536]; + size_t aux1[65536]; + size_t aux2[65536]; + size_t i, j; for (i = 0; i < 1024; ++i) { - ASSERT_EQ(0, stree_encode(64, in + i * 64, out, aux)); + ASSERT_EQ(0, stree_encode(64, in + i * 64, enc, aux1)); + ASSERT_EQ(0, stree_decode(64, enc, dec, aux2)); + for (j = 0; j < 64; ++j) { + ASSERT_EQ(in[i*64+j], dec[j]); + ASSERT_EQ(aux1[j], aux2[j]); + } } - ASSERT_EQ(0, stree_encode(65536, in, out, aux)); + ASSERT_EQ(0, stree_encode(65536, in, enc, aux1)); + ASSERT_EQ(0, stree_decode(65536, enc, dec, aux2)); + + return TEST_SUCCESS; +} + +static enum test_result test_stree_decode_empty(void) +{ + const uint8_t *in = (const uint8_t *) ""; + uint8_t out[1]; + size_t aux[1]; + ASSERT_EQ(0, stree_decode(0, in, out, aux)); + return TEST_SUCCESS; +} + +static enum test_result test_stree_decode_simple(void) +{ + const uint8_t *in = (const uint8_t *) "abc"; + uint8_t out[3]; + size_t aux[3]; + ASSERT_EQ(0, stree_decode(3, in, out, aux)); + + ASSERT_EQ('a', out[0]); + ASSERT_EQ('b', out[1]); + ASSERT_EQ('c', out[2]); + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + + return TEST_SUCCESS; +} + +static enum test_result test_stree_decode_nontrivial(void) +{ + const uint8_t *in = (const uint8_t *) "ab\1\1\1"; + uint8_t out[5]; + size_t aux[5]; + ASSERT_EQ(0, stree_decode(5, in, out, aux)); + + ASSERT_EQ('a', out[0]); + ASSERT_EQ('b', out[1]); + ASSERT_EQ('a', out[2]); + ASSERT_EQ('a', out[3]); + ASSERT_EQ('a', out[4]); + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + ASSERT_EQ(0, aux[3]); + ASSERT_EQ(3, aux[4]); + + return TEST_SUCCESS; +} + +static enum test_result test_stree_roundtrip_so_example(void) +{ + int i; + const uint8_t *in = (const uint8_t *) "abcabxabcd"; + uint8_t enc[10]; + uint8_t dec[10]; + size_t aux1[10]; + size_t aux2[10]; + ASSERT_EQ(0, stree_encode(10, in, enc, aux1)); + ASSERT_EQ(0, stree_decode(10, enc, dec, aux2)); + + for (i = 0; i < 10; ++i) + printf("%d ", enc[i]); + printf("\n"); + + ASSERT_EQ(in[0], dec[0]); + ASSERT_EQ(in[1], dec[1]); + ASSERT_EQ(in[2], dec[2]); + ASSERT_EQ(in[3], dec[3]); + ASSERT_EQ(in[4], dec[4]); + ASSERT_EQ(in[5], dec[5]); + ASSERT_EQ(in[6], dec[6]); + ASSERT_EQ(in[7], dec[7]); + ASSERT_EQ(in[8], dec[8]); + ASSERT_EQ(in[9], dec[9]); + + ASSERT_EQ(aux1[0], aux2[0]); + ASSERT_EQ(aux1[1], aux2[1]); + ASSERT_EQ(aux1[2], aux2[2]); + ASSERT_EQ(aux1[3], aux2[3]); + ASSERT_EQ(aux1[4], aux2[4]); + ASSERT_EQ(aux1[5], aux2[5]); + ASSERT_EQ(aux1[6], aux2[6]); + ASSERT_EQ(aux1[7], aux2[7]); + ASSERT_EQ(aux1[8], aux2[8]); + ASSERT_EQ(aux1[9], aux2[9]); return TEST_SUCCESS; } @@ -1219,6 +1361,12 @@ int main(void) RUN_TEST(test_stree_minimal_2); RUN_TEST(test_stree_minimal_3); RUN_TEST(test_stree_minimal_4); + RUN_TEST(test_stree_minimal_5); + RUN_TEST(test_stree_repeating); RUN_TEST(test_stree_long); RUN_TEST(test_stree_very_long); + RUN_TEST(test_stree_decode_empty); + RUN_TEST(test_stree_decode_simple); + RUN_TEST(test_stree_decode_nontrivial); + RUN_TEST(test_stree_roundtrip_so_example); }