From 94063b79e31da540421ea6975c7f7f83340bcdd1 Mon Sep 17 00:00:00 2001 From: Geoffrey Allott Date: Sat, 20 Aug 2022 17:14:37 +0100 Subject: [PATCH] initial stree implementation --- src/stree.c | 281 ++++++++++++++++++++++++++++++++++++++++++++++ src/stree.h | 6 + test/test_stree.c | 182 ++++++++++++++++++++++++++++++ 3 files changed, 469 insertions(+) create mode 100644 src/stree.c create mode 100644 src/stree.h create mode 100644 test/test_stree.c diff --git a/src/stree.c b/src/stree.c new file mode 100644 index 0000000..ff94e1d --- /dev/null +++ b/src/stree.c @@ -0,0 +1,281 @@ +#include "stree.h" + +#include +#include +#include + +struct node; + +struct node { + size_t from; + size_t to; + struct node *brother; + struct node *son; + struct node *father; + struct node *link; +}; + +static void node_init(struct node *self, size_t from) +{ + self->from = from; + self->to = (size_t) -1; + self->brother = (struct node *) 0; + self->son = (struct node *) 0; + self->father = (struct node *) 0; + self->link = (struct node *) 0; +} + +static void node_dbg(const struct node *self, size_t indent, const uint8_t *str, size_t len) +{ + const struct node *node; + size_t i; + + for (i = 0; i < indent; ++i) + printf(" "); + printf("%d->%d \"", (int) self->from, (int) self->to); + for (i = self->from; i < self->to && i < len; ++i) + printf("%c", (char) str[i]); + printf("\"\n"); + for (node = self->son; node; node = node->brother) + node_dbg(node, indent + 2, str, len); +} + +static void node_add_son(struct node *self, struct node *son) +{ + struct node *brother; + + brother = self->son; + self->son = son; + son->brother = brother; + son->father = self; +} + +static void node_split_son(struct node *self, uint8_t edge, size_t len, struct node *split, struct node *son, const uint8_t *str) +{ + struct node *node, *prev; + + for (prev = (struct node *) 0, node = self->son; node; prev = node, node = node->brother) { + if (str[node->from] == edge) { + split->from = node->from; + split->to = node->from + len; + split->brother = node->brother; + split->son = node; + split->father = self; + split->link = (struct node *) 0; + + node->from = node->from + len; + node->father = split; + node->brother = son; + + son->father = split; + + if (prev) + prev->brother = split; + else + self->son = split; + + return; + } + } +} + +static bool node_edge_present(const struct node *self, const uint8_t *str, size_t i) +{ + const struct node *node; + + for (node = self->son; node; node = node->brother) + if (str[node->from] == str[i]) + return true; + return false; +} + +static bool node_edge_present_2(const struct node *self, uint8_t edge, size_t len, const uint8_t *str, size_t i) +{ + const struct node *node; + + for (node = self->son; node; node = node->brother) + if (str[node->from] == edge) + return str[node->from+len] == str[i]; + return false; +} + +static bool node_edge_end(const struct node *self, uint8_t edge, size_t len, const uint8_t *str) +{ + const struct node *node; + + for (node = self->son; node; node = node->brother) + if (str[node->from] == edge) + return node->from + len >= node->to; + + return false; +} + +static struct node *node_edge(struct node *self, uint8_t edge, const uint8_t *str) +{ + struct node *node; + + for (node = self->son; node; node = node->brother) + if (str[node->from] == edge) + return node; + + return (struct node *) 0; +} + +static bool node_validate_suffix(struct node *self, size_t len, const uint8_t *str, size_t i) +{ + struct node *node; + size_t j; + + if (len == 0) return true; + if (!node_edge_present(self, str, i)) return false; + + node = node_edge(self, str[i], str); + for (j = node->from; j < node->to && len > 0; --len, ++j, ++i) { + if (str[j] != str[i]) return false; + } + return node_validate_suffix(node, len, str, i); +} + +static bool node_validate_suffixes(struct node *self, size_t len, const uint8_t *str, size_t i) +{ + if (len == 0) return true; + if (!node_validate_suffixes(self, len - 1, str, i + 1)) return false; + return node_validate_suffix(self, len, str, i); +} + +static size_t stree_max_size(size_t len) +{ + return len * 2 + 1; +} + +int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux) +{ + struct node *nodes; + struct node *root; + struct node *prev; + struct node *active_node; + uint8_t active_edge = '^'; + size_t active_len = 0; + size_t rem = 0; + size_t i; + size_t n; + + nodes = (struct node *) malloc(sizeof(struct node) * stree_max_size(len)); + if (!nodes) return -1; + root = nodes + 0; + node_init(root, (size_t) -1); + n = 1; + active_node = root; + + for (i = 0; i < len; ++i) { + if (!node_validate_suffixes(root, i, in, 0)) { + free(nodes); + return -1; + } + aux[i] = (size_t) (active_node - root); + prev = (struct node *) 0; + ++rem; + while (rem > 0) { + printf("i = %d, active_node = %d, active_edge = %c, active_len = %d, rem = %d\n", (int) i, (int) (active_node - root), (char) active_edge, (int) active_len, (int) (rem - 1)); + node_dbg(root, 0, in, i); + if (active_len == 0) { + if (!node_edge_present(active_node, in, i)) { + printf("creating node %d as child of %d\n", (int) n, (int) (active_node - root)); + printf("remainder is currently %d\n", (int) rem); + printf("i is currently %d\n", (int) i); + node_init(nodes + n, i); + node_add_son(active_node, nodes + n); + ++n; + --rem; + if (active_node != root) { + if (active_node->link) { + printf("following link from node %d to %d\n", (int) (active_node - root), (int) (active_node->link - root)); + } + active_node = active_node->link ? active_node->link : root; + if (active_node == root) { + active_len = rem - 1; + active_edge = in[i-active_len]; + while (node_edge_end(active_node, active_edge, active_len, in)) { + active_node = node_edge(active_node, active_edge, in); + active_edge = in[i-active_len+1]; + printf("setting active_edge in first loop to %c\n", (char) active_edge); + active_len -= active_node->to - active_node->from; + if (active_len == 0) active_edge = '^'; + } + } + } + } else { + printf("active_edge is currently %c\n", (char) active_edge); + active_edge = in[i]; + printf("setting active_edge in first branch to %c\n", (char) active_edge); + printf("i is currently %d\n", (int) i); + ++active_len; + if (node_edge_end(active_node, active_edge, active_len, in)) { + active_node = node_edge(active_node, active_edge, in); + active_edge = '^'; + active_len = 0; + } + break; + } + } else if (active_len > 0) { + if (!node_edge_present_2(active_node, active_edge, active_len, in, i)) { + printf("creating nodes %d and %d\n", (int) n, (int) n + 1); + node_init(nodes + n + 1, i); + node_split_son(active_node, active_edge, active_len, nodes + n, nodes + n + 1, in); + if (prev) { + prev->link = nodes + n; + printf("creating link from node %d to node %d\n", (int) (prev - root), (int) n); + } + prev = nodes + n; + n += 2; + --rem; + printf("active_edge is currently %c\n", (char) active_edge); + if (active_node == root) { + --active_len; + printf("setting active_edge in second branch\n"); + active_edge = in[i-active_len]; + if (active_len == 0) active_edge = '^'; + } else { + if (active_node->link) { + printf("following link from node %d to %d\n", (int) (active_node - root), (int) (active_node->link - root)); + } + active_node = active_node->link ? active_node->link : root; + if (active_node == root) { + active_len = rem - 1; + active_edge = in[i-active_len]; + } + } + printf("active_edge is currently %c\n", (char) active_edge); + printf("remainder is currently %d\n", (int) rem); + printf("active_node is currently %d\n", (int) (active_node - root)); + while (node_edge_end(active_node, active_edge, active_len, in)) { + active_node = node_edge(active_node, active_edge, in); + active_edge = in[i-active_len+1]; + printf("setting active_edge in second loop to %c\n", (char) active_edge); + active_len -= active_node->to - active_node->from; + if (active_len == 0) active_edge = '^'; + } + } else { + ++active_len; + if (node_edge_end(active_node, active_edge, active_len, in)) { + active_node = node_edge(active_node, active_edge, in); + active_edge = '^'; + active_len = 0; + } + break; + } + } + } + } + + node_dbg(root, 0, in, len); + + if (!node_validate_suffixes(root, len, in, 0)) { + free(nodes); + return -1; + } + + (void) out; // TODO + free(nodes); + return 0; +} diff --git a/src/stree.h b/src/stree.h new file mode 100644 index 0000000..fc4a686 --- /dev/null +++ b/src/stree.h @@ -0,0 +1,6 @@ +#pragma once + +#include +#include + +int stree_encode(size_t len, const uint8_t *in, uint8_t *out, size_t *aux); diff --git a/test/test_stree.c b/test/test_stree.c new file mode 100644 index 0000000..7c5892a --- /dev/null +++ b/test/test_stree.c @@ -0,0 +1,182 @@ +#include "test.h" + +#include "stree.h" + +enum test_result test_stree_encode_empty(void) +{ + const uint8_t *in = (const uint8_t *) ""; + uint8_t out[1]; + size_t aux[1]; + ASSERT_EQ(0, stree_encode(0, in, out, aux)); + return TEST_SUCCESS; +} + +enum test_result test_stree_encode_simple(void) +{ + const uint8_t *in = (const uint8_t *) "abc"; + uint8_t out[3]; + size_t aux[3]; + ASSERT_EQ(0, stree_encode(3, in, out, aux)); + + /* + ASSERT_EQ(0, out[0]); + ASSERT_EQ(1, out[1]); + ASSERT_EQ(2, out[2]); + */ + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + + return TEST_SUCCESS; +} + +enum test_result test_stree_encode_nontrivial(void) +{ + const uint8_t *in = (const uint8_t *) "abaaa"; + uint8_t out[5]; + size_t aux[5]; + ASSERT_EQ(0, stree_encode(5, in, out, aux)); + + /* + ASSERT_EQ(0, out[0]); + ASSERT_EQ(1, out[1]); + ASSERT_EQ(0, out[2]); + ASSERT_EQ(1, out[3]); + ASSERT_EQ(0, out[4]); + */ + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + ASSERT_EQ(0, aux[3]); + ASSERT_EQ(3, aux[4]); + + return TEST_SUCCESS; +} + +enum test_result test_stree_encode_so_example(void) +{ + const uint8_t *in = (const uint8_t *) "abcabxabcd"; + uint8_t out[10]; + size_t aux[10]; + ASSERT_EQ(0, stree_encode(10, in, out, aux)); + + /* + ASSERT_EQ(0, out[0]); + ASSERT_EQ(1, out[1]); + ASSERT_EQ(2, out[2]); + ASSERT_EQ(0, out[3]); + ASSERT_EQ(0, out[4]); + ASSERT_EQ(3, out[5]); + ASSERT_EQ(0, out[6]); + ASSERT_EQ(0, out[7]); + ASSERT_EQ(1, out[8]); + ASSERT_EQ(3, out[9]); + */ + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + ASSERT_EQ(0, aux[3]); + ASSERT_EQ(0, aux[4]); + ASSERT_EQ(0, aux[5]); + ASSERT_EQ(0, aux[6]); + ASSERT_EQ(0, aux[7]); + ASSERT_EQ(4, aux[8]); + ASSERT_EQ(4, aux[9]); + + return TEST_SUCCESS; +} + +enum test_result test_stree_tricky_suffix_link(void) +{ + const uint8_t *in = (const uint8_t *) "cdddcdc"; + uint8_t out[7]; + size_t aux[7]; + ASSERT_EQ(0, stree_encode(7, in, out, aux)); + + /* + ASSERT_EQ(2, out[0]); + ASSERT_EQ(3, out[1]); + ASSERT_EQ(1, out[2]); + ASSERT_EQ(0, out[3]); + ASSERT_EQ(1, out[4]); + ASSERT_EQ(0, out[5]); + ASSERT_EQ(1, out[6]); + */ + + ASSERT_EQ(0, aux[0]); + ASSERT_EQ(0, aux[1]); + ASSERT_EQ(0, aux[2]); + ASSERT_EQ(0, aux[3]); + ASSERT_EQ(0, aux[4]); + ASSERT_EQ(0, aux[5]); + ASSERT_EQ(0, aux[6]); + + return TEST_SUCCESS; +} + +enum test_result test_stree_minimal(void) +{ + const uint8_t *in = (const uint8_t *) "abcdeabacacabb"; + uint8_t out[14]; + size_t aux[14]; + ASSERT_EQ(0, stree_encode(14, in, out, aux)); + + return TEST_SUCCESS; +} + +enum test_result test_stree_minimal_2(void) +{ + const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccacc"; + uint8_t out[24]; + size_t aux[24]; + ASSERT_EQ(0, stree_encode(24, in, out, aux)); + + return TEST_SUCCESS; +} + +enum test_result test_stree_minimal_3(void) +{ + const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccaccccaa"; + uint8_t out[28]; + size_t aux[28]; + ASSERT_EQ(0, stree_encode(28, in, out, aux)); + + return TEST_SUCCESS; +} + +enum test_result test_stree_long(void) +{ + const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccaccccaabbbaababdadbaccabbdadbadbabaccacbbbcbadddbdababddddddabdabddddddabbbbccc"; + uint8_t out[100]; + size_t aux[100]; + ASSERT_EQ(0, stree_encode(100, in, out, aux)); + + return TEST_SUCCESS; +} + +enum test_result test_stree_very_long(void) +{ + const uint8_t *in = (const uint8_t *) "abcdeabacacabbabcdcccaccccaabbbaababdadbaccabbdadbadbabaccacbbbcbadddbdababddddddabdabddddddabbbbccc"; + uint8_t out[100]; + size_t aux[100]; + ASSERT_EQ(0, stree_encode(100, in, out, aux)); + + return TEST_SUCCESS; +} + +int main(void) +{ + RUN_TEST(test_stree_encode_empty); + RUN_TEST(test_stree_encode_simple); + RUN_TEST(test_stree_encode_nontrivial); + RUN_TEST(test_stree_encode_so_example); + RUN_TEST(test_stree_tricky_suffix_link); + RUN_TEST(test_stree_minimal); + RUN_TEST(test_stree_minimal_2); + RUN_TEST(test_stree_minimal_3); + RUN_TEST(test_stree_long); + RUN_TEST(test_stree_very_long); +} -- 2.34.1