diff options
| -rw-r--r-- | Makefile | 11 | ||||
| -rw-r--r-- | include/hex_conversion.h | 2 | ||||
| -rw-r--r-- | src/tree_based_check.c | 684 |
3 files changed, 694 insertions, 3 deletions
@@ -4,13 +4,13 @@ CFLAGS += -Wall -Wextra -Wformat -Wformat=2 -Wconversion -Wimplicit-fallthrough -Wbidi-chars=any -Werror=implicit -Werror=incompatible-pointer-types -Werror=int-conversion \ -fno-delete-null-pointer-checks -fstack-clash-protection -fstack-protector-strong -fstrict-flex-arrays=3 -DEBUG_CFLAGS = -DDEBUGBUILD -g -Werror -fsanitize=address +DEBUG_CFLAGS = -DDEBUGBUILD -g -Werror -fsanitize=address -O0 PROD_CFLAGS = -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -FPIE -pie -Wl,-z,noexecstack -Wl,--as-needed -Wl,--no-copy-dt-needed-entries CFLAGS += -I include -all: out/split_for_sort +all: out/split_for_sort out/tree_based_check @@ -32,4 +32,11 @@ out/split_for_sort: out src/split_for_sort.c include/output.h out/debug/split_for_sort_debug: out/debug src/split_for_sort.c include/output.h ${CC} -o $@ ${CFLAGS} ${DEBUG_CFLAGS} src/split_for_sort.c +out/tree_based_check: out src/tree_based_check.c \ + include/output.h include/hex_conversion.h + ${CC} -o $@ ${CFLAGS} ${PROD_CFLAGS} src/tree_based_check.c + +out/debug/tree_based_check: out/debug src/tree_based_check.c \ + include/output.h include/hex_conversion.h + ${CC} -o $@ ${CFLAGS} ${DEBUG_CFLAGS} src/tree_based_check.c diff --git a/include/hex_conversion.h b/include/hex_conversion.h index b31216d..4b97d6d 100644 --- a/include/hex_conversion.h +++ b/include/hex_conversion.h @@ -26,7 +26,7 @@ inline int convert_line(char *s) { return -1; l=strlen(s); for (i=0; i<l; i++) { - s[i] = toupper(s[i]); + s[i] = (char)toupper(s[i]); if ((s[i] == '\r' && (l-i<3)) || (s[i] == '\n' && i==l-1)) { s[i] = '\0'; break; diff --git a/src/tree_based_check.c b/src/tree_based_check.c new file mode 100644 index 0000000..abf1232 --- /dev/null +++ b/src/tree_based_check.c @@ -0,0 +1,684 @@ +/* + * Compare two files with a single SHA1 hash per line + * useful when comparing filesystem content: + * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt + * awk ... + * + * Hashes are converted into binary form and stored in lists in a tree. + * vim:ts=4:sw=4:expandtab + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdbool.h> +#include <stdint.h> +#include <errno.h> +#include <ctype.h> +#include <inttypes.h> + +#include "output.h" +#include "hex_conversion.h" + + +#ifndef LINELENGTH +#define LINELENGTH 1024 +#endif + +/* 8 bytes pointer to a smaller char array? As there are strict conditions in + * regards to the data which will be processed work with fix size arrays and + * do math hard coded at compile time. */ +#define NON_CONVERTED_SIZE 40 +#define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) +/* The tree can easily cause an immense memory overhead */ +#define TREE_MAX_DEPTH 4 +#define TREE_BRANCHES 32 + +struct list_node { + struct list_node *next; + char data[FULL_DATABLOCK_SIZE]; +}; + +struct list_head { + struct list_node *first_node; + struct list_node *last_node; +#ifdef DEBUGBUILD + size_t head_nr; + size_t length; +#endif +}; + +struct tree_branch { + struct tree_branch *branches[TREE_BRANCHES]; + struct list_head *list; +}; + +struct tree_root { + struct tree_branch *branches[256]; +}; + +#ifdef DEBUGBUILD +size_t head_numbers = 0; +size_t freed = 0; +#endif + +struct list_head *create_list_head(); +void destroy_list(struct list_head *head); +int insert_data_into_list(struct list_head *head, char *data); +int remove_data_from_list(struct list_head *head, char *data); +int check_list_for_data(struct list_head *head, char *data); +struct tree_root *create_tree_root(); +void destroy_tree_branches(struct tree_branch *branch); +void destroy_tree(struct tree_root *root); +int insert_into_tree(struct tree_root *root, char *data); +int check_tree_for_data(struct tree_root *root, char *data); +void print_list_plain(struct list_head *head, FILE *fd); +void print_tree_branch(struct tree_branch *branch, FILE *fd); +void print_tree(struct tree_root *root, FILE *fd); + + + +inline struct list_head *create_list_head() { + struct list_head *head = calloc(1, sizeof(struct list_head)); + if (head == NULL) { + /* If the following masked fprintf calls fail there was a chance */ + LOGERR("ERROR: Failed to allocate memory for list_head: "); + fputs(strerror(errno), stderr); + fputc('\n', stderr); + return NULL; + } + head->first_node = NULL; + head->last_node = NULL; +#ifdef DEBUGBUILD + head->head_nr = head_numbers++; + LOGERR("DEBUG: Create head_nr %lu\n", head->head_nr); + head->length = 0; +#endif + return head; +} + +inline void destroy_list(struct list_head *head) { + struct list_node *ptr=NULL; + if (head == NULL) return; +#ifdef DEBUGBUILD + char *out = calloc(42,sizeof(char)); + fprintf(stderr, "\n\n\n"); + LOGERR("DEBUG: freed so far %lu\n", freed++); +//if (head->head_nr == 15) { + LOGERR("DEBUG: head 0x%p\n", head); + LOGERR("DEBUG: first_node %p\n", head->first_node); + LOGERR("DEBUG: last_node %p\n", head->last_node); + LOGERR("DEBUG: head_nr %lu\n", head->head_nr); + LOGERR("DEBUG: length %lu\n", head->length); +//} +#endif + while (head->first_node != NULL) { + ptr = head->first_node; +#ifdef DEBUGBUILD +//if (head->head_nr == 15) { + LOGERR("DEBUG: first_node %p\n", head->first_node); + LOGERR("ptr %p\n", ptr); + LOGERR("ptr->next %p\n", ptr->next); + out = convert_from_binary(ptr->data, 20, out); + LOGERR("data: %p (converted plaintext value: '%s'\n", ptr->data, out); +//} +#endif + head->first_node = head->first_node->next; + free(ptr); + } + free(head); + head = NULL; + +#ifdef DEBUGBUILD + free(out); +#endif +} + +/* No time consuming duplicate check, data to be processed is expected to be gone sorted through uniq */ +inline int insert_data_into_list(struct list_head *head, char *data) { + struct list_node *node; + if (head == NULL || data == NULL) { + LOGERR("ERROR: Improper arguments.\n"); + return -1; + } + if ((node = calloc(1, sizeof(struct list_node))) == NULL) { + LOGERR("ERROR: Failed to allocate %lu bytes memory for list node: %s\n", sizeof(struct list_node), strerror(errno)); + return -2; + } + memcpy(node->data, data, FULL_DATABLOCK_SIZE); + node->next = NULL; +#ifdef DEBUGBUILD + LOGERR("DEBUG: node %p / node->next %p (nil expected)\n", node, node->next); +#endif + do { + if (head->first_node == NULL) { + head->first_node = node; + head->last_node = node; +#ifdef DEBUGBUILD + LOGERR("DEBUG: head %p (nr %lu) / head->first_node %p / head->last_node %p\n", head, head->head_nr, head->first_node, head->last_node); +#endif + break; + } + head->last_node->next = node; + head->last_node = node; +#ifdef DEBUGBUILD + LOGERR("DEBUG: head %p (nr %lu) / head->last_node %p / node %p\n", head, head->head_nr, head->last_node, node); +#endif + } while (0); +#ifdef DEBUGBUILD + head->length++; +#endif + return 0; +} + + +inline int remove_data_from_list(struct list_head *head, char *data) { + struct list_node *node, *prev; + if (head == NULL || data == NULL) { + LOGERR("ERROR: Improper arguments.\n"); + return -1; + } + node = head->first_node; + if (node == NULL) { + return 1; + } + /* Check first node in list */ + if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { + head->first_node = node->next; + free(node); +#ifdef DEBUGBUILD + head->length--; +#endif + return 0; + } + + prev = head->first_node; + node = head->first_node->next; + + /**/ + while (node != NULL) { + if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { + prev->next = node->next; + free(node); +#ifdef DEBUGBUILD + head->length--; +#endif + return 0; + } + prev = node; + node = node->next; + } + + return 1; +} + + +/* Thread safety mandatory in case mechanisms to remove found entries shall be + * removed to safe time with future checks. However, in case the input file is + * sorted, Thread safety has negative performance impact. + */ +inline int check_list_for_data(struct list_head *head, char *data) { + struct list_node *node; + int rc = 1; + if (head == NULL || data == NULL) { + LOGERR("ERROR: Improper arguments.\n"); + return -1; + } + + node = head->first_node; + while(node != NULL) { + if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { + rc = 0; + break; + } + node = node->next; + } + + return rc; +} + + +inline struct tree_root *create_tree_root() { + struct tree_root *root = calloc(1, sizeof(struct tree_root)); + if (root == NULL) { + LOGERR("ERROR: Failed to allocate memory for tree_root"); + fputs(strerror(errno),stderr); + fputc('\n', stderr); + return NULL; + } + return root; +} + +inline void destroy_tree_branches(struct tree_branch *branch) { + uint16_t i = 0; + if (branch == NULL) + return; + if (branch->list != NULL) { + destroy_list(branch->list); + } + branch->list = NULL; + for (i=0; i<TREE_BRANCHES; i++) { + destroy_tree_branches(branch->branches[i]); + branch->branches[i] = NULL; + } +} + +inline void destroy_tree(struct tree_root *root) { + uint16_t i; + + for (i=0; i<256; i++) { + if (root->branches[i] != NULL) { + destroy_tree_branches(root->branches[i]); + root->branches[i] = NULL; + } + } + free(root); +} + +inline int insert_into_tree(struct tree_root *root, char *data) { + int depth = 0, i =0; + struct tree_branch *ptr = NULL; + unsigned char uc = 0; + + if (root == NULL || data == NULL) { + LOGERR("ERROR: Empty argument\n"); + return -1; + } + + /* tree_root always covers 256 bytes/branches, in order to improve search + * performance in case the data is sorted use the last byte in this step + * for parallel searches, less wait-time due to locks. */ + uc = (unsigned char)data[FULL_DATABLOCK_SIZE-1]; + ptr = root->branches[uc]; + if (ptr == NULL) { + if ((root->branches[uc] = calloc(1, sizeof(struct tree_branch))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for new tree branch %02X for the root\n", uc); + return -2; + } + ptr = root->branches[uc]; + } + for (depth = 0; depth < TREE_MAX_DEPTH-1; depth++) { + uc = ((unsigned char)data[depth]) % TREE_BRANCHES; + if (ptr->branches[uc] == NULL) { + if ((ptr->branches[uc]=calloc(1,sizeof(struct tree_branch))) == NULL) { + LOGERR("ERROR: Failed to allocate memory for new tree branch %02X at depth %d\n", uc, depth); + return -3; + } + for (i=0;i<TREE_BRANCHES;i++) { ptr->branches[uc]->branches[i] = NULL; } + ptr->branches[uc]->list = NULL; + } + ptr = ptr->branches[uc]; + } + + if (ptr->list == NULL) { + if ((ptr->list = create_list_head()) == NULL) { + return -4; + } + } + + return insert_data_into_list(ptr->list, data); +} + + + +/* Return values: + * < 0 ERROR | == 0 FOUND | > 0 NOT FOUND */ +inline int check_tree_for_data(struct tree_root *root, char *data) { + int depth = 0; + struct tree_branch *ptr; + unsigned char uc; + if (root == NULL || data == NULL) { + LOGERR("ERROR: Empty argument.\n"); + return -1; + } + uc = (unsigned char)data[FULL_DATABLOCK_SIZE-1]; + ptr = root->branches[uc]; + if (ptr == NULL) return 1; + + for (depth=0; depth < TREE_MAX_DEPTH-1; depth++) { + uc = ((unsigned char)data[depth]) % TREE_BRANCHES; + if (ptr->branches[uc] == NULL) + return 1; + ptr = ptr->branches[uc]; + } + return check_list_for_data(ptr->list, data); +} + + +void print_list_plain(struct list_head *head, FILE *fd) { + struct list_node *ptr = NULL; + char *out = NULL; + + if (head == NULL) return; + + if (fd == NULL) { + fd = stdout; + } + + ptr = head->first_node; + while (ptr != NULL) { + out = convert_from_binary(ptr->data, FULL_DATABLOCK_SIZE, out); + fputs(out, fd); + fputc('\n', fd); + ptr = ptr->next; + } +} + + +void print_tree_branch(struct tree_branch *branch, FILE *fd) { + uint16_t i; + if (branch == NULL) return; + if (fd == NULL) fd = stdout; + if (branch->list != NULL) + print_list_plain(branch->list, fd); + for (i=0; i<TREE_BRANCHES; i++) { + print_tree_branch(branch->branches[i], fd); + } +} + +void print_tree(struct tree_root *root, FILE *fd) { + uint16_t i; + if (root == NULL) { + return; + } + for (i=0; i<256;i++) { + print_tree_branch(root->branches[i], fd); + } +} + + +struct tree_root *read_file_into_tree(char *filename) { + size_t line_nr=0; + struct tree_root *root = NULL; + FILE *fd = NULL; + char *line = NULL, *data = NULL; + + if (filename == NULL || strlen(filename) == 0) { + return NULL; + } + + if ((root=create_tree_root()) == NULL) { + return NULL; + } + + if ((line=calloc(LINELENGTH,sizeof(char))) == NULL) { + free(root); + LOGERR("ERROR: Failed to allocate %d bytes for line buffer\n", LINELENGTH); + return NULL; + } + + if ((data=calloc(LINELENGTH/2,sizeof(char))) == NULL) { + free(root); + free(line); + LOGERR("ERROR: Failed to allocate %d bytes for data buffer\n", LINELENGTH/2); + return NULL; + } + + if ((fd=fopen(filename, "r")) == NULL) { + free(root); + free(line); + free(data); + LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); + return NULL; + } + + while (fgets(line, LINELENGTH, fd) != NULL) { + line_nr++; + if (convert_line(line)<0) { + LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); + free(data); + free(line); + destroy_tree(root); + fclose(fd); + return NULL; + } + + if (convert_to_binary(line, data) == NULL) { + free(data); + free(line); + destroy_tree(root); + fclose(fd); + LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); + return NULL; + } + + if (insert_into_tree(root, data) != 0) { + free(data); + free(line); + destroy_tree(root); + fclose(fd); + LOGERR("ERROR: Failed to insert line %lu in binary form into tree.\n", line_nr); + return NULL; + } + } + free(data); + free(line); + fclose(fd); + return root; +} + + +int filter_file_with_tree(char *filename, FILE *output, struct tree_root *root) { + size_t line_nr=0; + FILE *fd = NULL; + char *line = NULL, *data = NULL; + int check_result = 0; + int rc = 0; + + if (filename == NULL || root == NULL|| strlen(filename) == 0) { + return -1; + } + + if ((line=calloc(LINELENGTH,sizeof(char))) == NULL) { + LOGERR("ERROR: Failed to allocate %d bytes for line buffer\n", LINELENGTH); + return -2; + } + + if ((data=calloc(LINELENGTH/2,sizeof(char))) == NULL) { + free(line); + LOGERR("ERROR: Failed to allocate %d bytes for data buffer\n", LINELENGTH/2); + return -3; + } + + if ((fd=fopen(filename, "r")) == NULL) { + free(line); + free(data); + LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); + return -4; + } + + while (fgets(line, LINELENGTH, fd) != NULL) { + line_nr++; + if (convert_line(line)<0) { + LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); + rc++; + continue; + } + + if (convert_to_binary(line, data) == NULL) { + LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); + rc++; + continue; + } + + check_result = check_tree_for_data(root, data); + if (check_result > 0) { /* Not found => print */ + fputs(line, output); + fputc('\n', output); + } else if (check_result < 0) { + LOGERR("ERROR: check function called with invalid arguments at line %lu\n", line_nr); + rc++; + continue; + } + } + free(data); + free(line); + fclose(fd); + + return rc; +} + + +#ifdef DEBUGBUILD +int test_list() { + char *content[20] = { + "2bc4b40c334a098593dc80d87213862d64ca3651", + "cc20639230abd1fb3fab4a6416b9587ab8cc62b4", + "dbb2d4efda208d7004dc9da3dc73e8b428caa4a8", + "7ce204f222956171dfcd271d6742f337b3be8711", + "098323d451251c7794829790d40104fccb7cc40f", + "f0d00ea6c1f155a95bd857faad3502ef1634bc4a", + "8ced992112e8bd26ead5c5891cc6641bf058a689", + "2049ffda6ff1cc5daf80bf11e22c2a4731ab7638", + "f40d8c300351ef8623b365652b0c3e49208e2a08", + "af5d07d66a9f502c6541b01387daa0a07376c217", + "2bcdcc73d8f7c8d905b6c9c12ac76ebaed7b6b1c", + "d6162568f67beddf9322f9f7ac8da2a7913a1cc6", + "e432a240228a839fa63d600174733c63572f73ab", + "0b2a5c0f8ef6ba73599e2d5feefd72741cdf8c98", + "f60619a4ab9a43149b314fc5cbd74edbc7159cce", + "08416eabffe9fb7a24629ad88b6a519180cd81c0", + "00031125caea9639a51997e9aa59f5bde49e0417", + "704163e6519e928d2fc261a515576f556973391c", + "9485eaf154c185617c2b04ab8d359011ec0c528a", + "f02ac509cbe3e7634b51030c9058164a36f80815" + }; + struct list_head *lh = NULL; + int i = 0, rc = 0; + char *data = NULL; + + LOGERR("=== BEGIN LIST TEST ===\n\n"); + + data=calloc(LINELENGTH/2, sizeof(char)); + + lh = create_list_head(); + if (lh == NULL) { + LOGERR("FAILURE: Failed to intialize list head\n"); + rc++; + return rc; + } + for (i=0; i<20; i++) { + convert_to_binary(content[i],data); + if (insert_data_into_list(lh, data, 0) != 0) { + LOGERR("FAILURE: Failed to insert '%s' in binary form into list.\n", content[i]); + rc++; + } + } + + LOGERR("=> Print full list\n"); + print_list_plain(lh, NULL, 0); + + destroy_list(lh); + LOGERR("\n=== FINISHED LIST TEST ===\n\n"); + return rc; +} + + +int test_tree() { + char *content[20] = { + "2bc4b40c334a098593dc80d87213862d64ca3651", + "cc20639230abd1fb3fab4a6416b9587ab8cc62b4", + "dbb2d4efda208d7004dc9da3dc73e8b428caa4a8", + "7ce204f222956171dfcd271d6742f337b3be8711", + "098323d451251c7794829790d40104fccb7cc40f", + "f0d00ea6c1f155a95bd857faad3502ef1634bc4a", + "8ced992112e8bd26ead5c5891cc6641bf058a689", + "2049ffda6ff1cc5daf80bf11e22c2a4731ab7638", + "f40d8c300351ef8623b365652b0c3e49208e2a08", + "af5d07d66a9f502c6541b01387daa0a07376c217", + "2bcdcc73d8f7c8d905b6c9c12ac76ebaed7b6b1c", + "d6162568f67beddf9322f9f7ac8da2a7913a1cc6", + "e432a240228a839fa63d600174733c63572f73ab", + "0b2a5c0f8ef6ba73599e2d5feefd72741cdf8c98", + "f60619a4ab9a43149b314fc5cbd74edbc7159cce", + "08416eabffe9fb7a24629ad88b6a519180cd81c0", + "00031125caea9639a51997e9aa59f5bde49e0417", + "704163e6519e928d2fc261a515576f556973391c", + "9485eaf154c185617c2b04ab8d359011ec0c528a", + "f02ac509cbe3e7634b51030c9058164a36f80815" + }; + struct tree_root *root = NULL; + int i = 0, rc = 0; + char *data = NULL; + + LOGERR("=== BEGIN TREE TEST ===\n\n"); + + data=calloc(LINELENGTH/2, sizeof(char)); + + root = create_tree_root(); + if (root == NULL) { + LOGERR("FAILURE: Failed to intialize tree root\n"); + rc++; + return rc; + } + for (i=0; i<20; i++) { + convert_to_binary(content[i],data); + if (insert_into_tree(root, data, 0) != 0) { + LOGERR("FAILURE: Failed to insert '%s' in binary form into list.\n", content[i]); + rc++; + } + } + + print_tree(root, NULL, 0); + + destroy_tree(root); + LOGERR("\n=== FINISHED TREE TEST ===\n\n"); + return rc; +} +#endif + + +int main(int argc, char **argv) { + FILE *output = NULL; + struct tree_root *root = NULL; + int filter_rc = 0; + +#ifdef DEBUGBUILD + if (test_list() != 0) { + LOGERR("ABORT OTHER TESTS.\n"); + return EXIT_FAILURE; + } + if (test_tree() != 0) { + LOGERR("ABORT OTHER TESTS.\n"); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +#endif + + if (argc != 4) { + fprintf(stderr, "Usage: %s raw_data filter_file output_file\n", argv[0]); + fprintf(stderr, "\toutput_file - if set to 'stdout' the stream is used\n"); + fprintf(stderr, "\t - appends to file, not overwrite\n"); + return EXIT_FAILURE; + } + + fprintf(stderr, "raw data %s\n", argv[1]); + fprintf(stderr, "filter %s\n", argv[2]); + fprintf(stderr, "output %s\n", argv[3]); + + + if (strncmp(argv[3], "stdout", 7) == 0) { + output = stdout; + } else { + if ((output = fopen(argv[3], "a")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s\n", argv[3], strerror(errno)); + return EXIT_FAILURE; + } + } + + fprintf(stderr, "Read file '%s' into memory\n", argv[2]); + root = read_file_into_tree(argv[2]); + fprintf(stderr, "Begin filtering file '%s'\n", argv[1]); + filter_rc = filter_file_with_tree(argv[1], output, root); + fflush(output); + fclose(output); + fprintf(stderr, "Filtering finished, cleaning up.\n"); + destroy_tree(root); + + if (filter_rc != 0) { + LOGERR("problems during application of filter, check stderr for error messages\n"); + } + + return filter_rc; +} + |
