/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * Compare two files with a single SHA1 hash per line * useful when comparing filesystem content: * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt * awk ... * * Hashes are converted into binary form and stored in lists in a tree. * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include #include "trace_macros.h" #include "hex_conversion.h" #ifndef LINELENGTH #define LINELENGTH 1024 #endif /* 8 bytes pointer to a smaller char array? As there are strict conditions in * regards to the data which will be processed work with fix size arrays and * do math hard coded at compile time. */ #define NON_CONVERTED_SIZE 40 #define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) /* The tree can easily cause an immense memory overhead */ #define TREE_MAX_DEPTH 4 #define TREE_BRANCHES 32 struct list_node { struct list_node *next; unsigned char data[FULL_DATABLOCK_SIZE]; }; struct list_head { struct list_node *first_node; struct list_node *last_node; #ifdef DEBUGBUILD size_t head_nr; size_t length; #endif }; struct tree_branch { struct tree_branch *branches[TREE_BRANCHES]; struct list_head *list; }; struct tree_root { struct tree_branch *branches[256]; }; #ifdef DEBUGBUILD size_t head_numbers = 0; size_t freed = 0; #endif struct list_head *create_list_head(); void destroy_list(struct list_head *head); int insert_data_into_list(struct list_head *head, unsigned char *data); int remove_data_from_list(struct list_head *head, char *data); int check_list_for_data(struct list_head *head, unsigned char *data); struct tree_root *create_tree_root(); void destroy_tree_branches(struct tree_branch *branch); void destroy_tree(struct tree_root *root); int insert_into_tree(struct tree_root *root, unsigned char *data); int check_tree_for_data(struct tree_root *root, unsigned char *data); void print_list_plain(struct list_head *head, FILE *fd); void print_tree_branch(struct tree_branch *branch, FILE *fd); void print_tree(struct tree_root *root, FILE *fd); inline struct list_head *create_list_head() { struct list_head *head = calloc(1, sizeof(struct list_head)); if (head == NULL) { /* If the following masked fprintf calls fail there was a chance */ LOGERR("ERROR: Failed to allocate memory for list_head: "); fputs(strerror(errno), stderr); fputc('\n', stderr); return NULL; } head->first_node = NULL; head->last_node = NULL; #ifdef DEBUGBUILD head->head_nr = head_numbers++; LOGERR("DEBUG: Create head_nr %lu\n", head->head_nr); head->length = 0; #endif return head; } inline void destroy_list(struct list_head *head) { struct list_node *ptr=NULL; if (head == NULL) return; #ifdef DEBUGBUILD char *out = calloc(42,sizeof(char)); fprintf(stderr, "\n\n\n"); LOGERR("DEBUG: freed so far %lu\n", freed++); //if (head->head_nr == 15) { LOGERR("DEBUG: head 0x%p\n", head); LOGERR("DEBUG: first_node %p\n", head->first_node); LOGERR("DEBUG: last_node %p\n", head->last_node); LOGERR("DEBUG: head_nr %lu\n", head->head_nr); LOGERR("DEBUG: length %lu\n", head->length); //} #endif while (head->first_node != NULL) { ptr = head->first_node; #ifdef DEBUGBUILD //if (head->head_nr == 15) { LOGERR("DEBUG: first_node %p\n", head->first_node); LOGERR("ptr %p\n", ptr); LOGERR("ptr->next %p\n", ptr->next); out = convert_from_binary(ptr->data, 20, out); LOGERR("data: %p (converted plaintext value: '%s'\n", ptr->data, out); //} #endif head->first_node = head->first_node->next; free(ptr); } free(head); head = NULL; #ifdef DEBUGBUILD free(out); #endif } /* No time consuming duplicate check, data to be processed is expected to be gone sorted through uniq */ inline int insert_data_into_list(struct list_head *head, unsigned char *data) { struct list_node *node; if (head == NULL || data == NULL) { LOGERR("ERROR: Improper arguments.\n"); return -1; } if ((node = calloc(1, sizeof(struct list_node))) == NULL) { LOGERR("ERROR: Failed to allocate %lu bytes memory for list node: %s\n", sizeof(struct list_node), strerror(errno)); return -2; } memcpy(node->data, data, FULL_DATABLOCK_SIZE); node->next = NULL; #ifdef DEBUGBUILD LOGERR("DEBUG: node %p / node->next %p (nil expected)\n", node, node->next); #endif do { if (head->first_node == NULL) { head->first_node = node; head->last_node = node; #ifdef DEBUGBUILD LOGERR("DEBUG: head %p (nr %lu) / head->first_node %p / head->last_node %p\n", head, head->head_nr, head->first_node, head->last_node); #endif break; } head->last_node->next = node; head->last_node = node; #ifdef DEBUGBUILD LOGERR("DEBUG: head %p (nr %lu) / head->last_node %p / node %p\n", head, head->head_nr, head->last_node, node); #endif } while (0); #ifdef DEBUGBUILD head->length++; #endif return 0; } inline int remove_data_from_list(struct list_head *head, char *data) { struct list_node *node, *prev; if (head == NULL || data == NULL) { LOGERR("ERROR: Improper arguments.\n"); return -1; } node = head->first_node; if (node == NULL) { return 1; } /* Check first node in list */ if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { head->first_node = node->next; free(node); #ifdef DEBUGBUILD head->length--; #endif return 0; } prev = head->first_node; node = head->first_node->next; /**/ while (node != NULL) { if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { prev->next = node->next; free(node); #ifdef DEBUGBUILD head->length--; #endif return 0; } prev = node; node = node->next; } return 1; } /* Thread safety mandatory in case mechanisms to remove found entries shall be * removed to safe time with future checks. However, in case the input file is * sorted, Thread safety has negative performance impact. */ inline int check_list_for_data(struct list_head *head, unsigned char *data) { struct list_node *node; int rc = 1; if (head == NULL || data == NULL) { LOGERR("ERROR: Improper arguments.\n"); return -1; } node = head->first_node; while(node != NULL) { if (memcmp(node->data, data, FULL_DATABLOCK_SIZE) == 0) { rc = 0; break; } node = node->next; } return rc; } inline struct tree_root *create_tree_root() { struct tree_root *root = calloc(1, sizeof(struct tree_root)); if (root == NULL) { LOGERR("ERROR: Failed to allocate memory for tree_root"); fputs(strerror(errno),stderr); fputc('\n', stderr); return NULL; } return root; } inline void destroy_tree_branches(struct tree_branch *branch) { uint16_t i = 0; if (branch == NULL) return; if (branch->list != NULL) { destroy_list(branch->list); } branch->list = NULL; for (i=0; ibranches[i]); branch->branches[i] = NULL; } } inline void destroy_tree(struct tree_root *root) { uint16_t i; for (i=0; i<256; i++) { if (root->branches[i] != NULL) { destroy_tree_branches(root->branches[i]); root->branches[i] = NULL; } } free(root); } inline int insert_into_tree(struct tree_root *root, unsigned char *data) { int depth = 0, i =0; struct tree_branch *ptr = NULL; unsigned char uc = 0; if (root == NULL || data == NULL) { LOGERR("ERROR: Empty argument\n"); return -1; } /* tree_root always covers 256 bytes/branches, in order to improve search * performance in case the data is sorted use the last byte in this step * for parallel searches, less wait-time due to locks. */ uc = (unsigned char)data[FULL_DATABLOCK_SIZE-1]; ptr = root->branches[uc]; if (ptr == NULL) { if ((root->branches[uc] = calloc(1, sizeof(struct tree_branch))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new tree branch %02X for the root\n", uc); return -2; } ptr = root->branches[uc]; } for (depth = 0; depth < TREE_MAX_DEPTH-1; depth++) { uc = ((unsigned char)data[depth]) % TREE_BRANCHES; if (ptr->branches[uc] == NULL) { if ((ptr->branches[uc]=calloc(1,sizeof(struct tree_branch))) == NULL) { LOGERR("ERROR: Failed to allocate memory for new tree branch %02X at depth %d\n", uc, depth); return -3; } for (i=0;ibranches[uc]->branches[i] = NULL; } ptr->branches[uc]->list = NULL; } ptr = ptr->branches[uc]; } if (ptr->list == NULL) { if ((ptr->list = create_list_head()) == NULL) { return -4; } } return insert_data_into_list(ptr->list, data); } /* Return values: * < 0 ERROR | == 0 FOUND | > 0 NOT FOUND */ inline int check_tree_for_data(struct tree_root *root, unsigned char *data) { int depth = 0; struct tree_branch *ptr; unsigned char uc; if (root == NULL || data == NULL) { LOGERR("ERROR: Empty argument.\n"); return -1; } uc = (unsigned char)data[FULL_DATABLOCK_SIZE-1]; ptr = root->branches[uc]; if (ptr == NULL) return 1; for (depth=0; depth < TREE_MAX_DEPTH-1; depth++) { uc = ((unsigned char)data[depth]) % TREE_BRANCHES; if (ptr->branches[uc] == NULL) return 1; ptr = ptr->branches[uc]; } return check_list_for_data(ptr->list, data); } void print_list_plain(struct list_head *head, FILE *fd) { struct list_node *ptr = NULL; char *out = NULL; if (head == NULL) return; if (fd == NULL) { fd = stdout; } ptr = head->first_node; while (ptr != NULL) { out = convert_from_binary(ptr->data, FULL_DATABLOCK_SIZE, out); fputs(out, fd); fputc('\n', fd); ptr = ptr->next; } } void print_tree_branch(struct tree_branch *branch, FILE *fd) { uint16_t i; if (branch == NULL) return; if (fd == NULL) fd = stdout; if (branch->list != NULL) print_list_plain(branch->list, fd); for (i=0; ibranches[i], fd); } } void print_tree(struct tree_root *root, FILE *fd) { uint16_t i; if (root == NULL) { return; } for (i=0; i<256;i++) { print_tree_branch(root->branches[i], fd); } } struct tree_root *read_file_into_tree(char *filename) { size_t line_nr=0; struct tree_root *root = NULL; FILE *fd = NULL; char *line = NULL; unsigned char *data = NULL; if (filename == NULL || strlen(filename) == 0) { return NULL; } if ((root=create_tree_root()) == NULL) { return NULL; } if ((line=calloc(LINELENGTH,sizeof(char))) == NULL) { free(root); LOGERR("ERROR: Failed to allocate %d bytes for line buffer\n", LINELENGTH); return NULL; } if ((data=calloc(LINELENGTH/2,sizeof(char))) == NULL) { free(root); free(line); LOGERR("ERROR: Failed to allocate %d bytes for data buffer\n", LINELENGTH/2); return NULL; } if ((fd=fopen(filename, "r")) == NULL) { free(root); free(line); free(data); LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); return NULL; } while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; if (convert_line(line)<0) { LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); free(data); free(line); destroy_tree(root); fclose(fd); return NULL; } if (convert_to_binary(line, data) == NULL) { free(data); free(line); destroy_tree(root); fclose(fd); LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); return NULL; } if (insert_into_tree(root, data) != 0) { free(data); free(line); destroy_tree(root); fclose(fd); LOGERR("ERROR: Failed to insert line %lu in binary form into tree.\n", line_nr); return NULL; } } free(data); free(line); fclose(fd); return root; } int filter_file_with_tree(char *filename, FILE *output, struct tree_root *root) { size_t line_nr=0; FILE *fd = NULL; char *line = NULL; unsigned char *data = NULL; int check_result = 0; int rc = 0; if (filename == NULL || root == NULL|| strlen(filename) == 0) { return -1; } if ((line=calloc(LINELENGTH,sizeof(char))) == NULL) { LOGERR("ERROR: Failed to allocate %d bytes for line buffer\n", LINELENGTH); return -2; } if ((data=calloc(LINELENGTH/2,sizeof(char))) == NULL) { free(line); LOGERR("ERROR: Failed to allocate %d bytes for data buffer\n", LINELENGTH/2); return -3; } if ((fd=fopen(filename, "r")) == NULL) { free(line); free(data); LOGERR("ERROR: Failed to open file '%s': %s\n", filename, strerror(errno)); return -4; } while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; if (convert_line(line)<0) { LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); rc++; continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); rc++; continue; } check_result = check_tree_for_data(root, data); if (check_result > 0) { /* Not found => print */ fputs(line, output); fputc('\n', output); } else if (check_result < 0) { LOGERR("ERROR: check function called with invalid arguments at line %lu\n", line_nr); rc++; continue; } } free(data); free(line); fclose(fd); return rc; } #ifdef DEBUGBUILD int test_list() { char *content[20] = { "2bc4b40c334a098593dc80d87213862d64ca3651", "cc20639230abd1fb3fab4a6416b9587ab8cc62b4", "dbb2d4efda208d7004dc9da3dc73e8b428caa4a8", "7ce204f222956171dfcd271d6742f337b3be8711", "098323d451251c7794829790d40104fccb7cc40f", "f0d00ea6c1f155a95bd857faad3502ef1634bc4a", "8ced992112e8bd26ead5c5891cc6641bf058a689", "2049ffda6ff1cc5daf80bf11e22c2a4731ab7638", "f40d8c300351ef8623b365652b0c3e49208e2a08", "af5d07d66a9f502c6541b01387daa0a07376c217", "2bcdcc73d8f7c8d905b6c9c12ac76ebaed7b6b1c", "d6162568f67beddf9322f9f7ac8da2a7913a1cc6", "e432a240228a839fa63d600174733c63572f73ab", "0b2a5c0f8ef6ba73599e2d5feefd72741cdf8c98", "f60619a4ab9a43149b314fc5cbd74edbc7159cce", "08416eabffe9fb7a24629ad88b6a519180cd81c0", "00031125caea9639a51997e9aa59f5bde49e0417", "704163e6519e928d2fc261a515576f556973391c", "9485eaf154c185617c2b04ab8d359011ec0c528a", "f02ac509cbe3e7634b51030c9058164a36f80815" }; struct list_head *lh = NULL; int i = 0, rc = 0; unsigned char *data = NULL; LOGERR("=== BEGIN LIST TEST ===\n\n"); data=calloc(LINELENGTH/2, sizeof(unsigned char)); lh = create_list_head(); if (lh == NULL) { LOGERR("FAILURE: Failed to intialize list head\n"); rc++; return rc; } for (i=0; i<20; i++) { convert_to_binary(content[i],data); if (insert_data_into_list(lh, data) != 0) { LOGERR("FAILURE: Failed to insert '%s' in binary form into list.\n", content[i]); rc++; } } LOGERR("=> Print full list\n"); print_list_plain(lh, NULL); destroy_list(lh); LOGERR("\n=== FINISHED LIST TEST ===\n\n"); return rc; } int test_tree() { char *content[20] = { "2bc4b40c334a098593dc80d87213862d64ca3651", "cc20639230abd1fb3fab4a6416b9587ab8cc62b4", "dbb2d4efda208d7004dc9da3dc73e8b428caa4a8", "7ce204f222956171dfcd271d6742f337b3be8711", "098323d451251c7794829790d40104fccb7cc40f", "f0d00ea6c1f155a95bd857faad3502ef1634bc4a", "8ced992112e8bd26ead5c5891cc6641bf058a689", "2049ffda6ff1cc5daf80bf11e22c2a4731ab7638", "f40d8c300351ef8623b365652b0c3e49208e2a08", "af5d07d66a9f502c6541b01387daa0a07376c217", "2bcdcc73d8f7c8d905b6c9c12ac76ebaed7b6b1c", "d6162568f67beddf9322f9f7ac8da2a7913a1cc6", "e432a240228a839fa63d600174733c63572f73ab", "0b2a5c0f8ef6ba73599e2d5feefd72741cdf8c98", "f60619a4ab9a43149b314fc5cbd74edbc7159cce", "08416eabffe9fb7a24629ad88b6a519180cd81c0", "00031125caea9639a51997e9aa59f5bde49e0417", "704163e6519e928d2fc261a515576f556973391c", "9485eaf154c185617c2b04ab8d359011ec0c528a", "f02ac509cbe3e7634b51030c9058164a36f80815" }; struct tree_root *root = NULL; int i = 0, rc = 0; unsigned char *data = NULL; LOGERR("=== BEGIN TREE TEST ===\n\n"); data=calloc(LINELENGTH/2, sizeof(unsigned char)); root = create_tree_root(); if (root == NULL) { LOGERR("FAILURE: Failed to intialize tree root\n"); rc++; return rc; } for (i=0; i<20; i++) { convert_to_binary(content[i],data); if (insert_into_tree(root, data) != 0) { LOGERR("FAILURE: Failed to insert '%s' in binary form into list.\n", content[i]); rc++; } } print_tree(root, NULL); destroy_tree(root); LOGERR("\n=== FINISHED TREE TEST ===\n\n"); return rc; } #endif int main(int argc, char **argv) { FILE *output = NULL; struct tree_root *root = NULL; int filter_rc = 0; #ifdef DEBUGBUILD if (test_list() != 0) { LOGERR("ABORT OTHER TESTS.\n"); return EXIT_FAILURE; } if (test_tree() != 0) { LOGERR("ABORT OTHER TESTS.\n"); return EXIT_FAILURE; } return EXIT_SUCCESS; #endif if (argc != 4) { fprintf(stderr, "Usage: %s raw_data filter_file output_file\n", argv[0]); fprintf(stderr, "\toutput_file - if set to 'stdout' the stream is used\n"); fprintf(stderr, "\t - appends to file, not overwrite\n"); return EXIT_FAILURE; } fprintf(stderr, "raw data %s\n", argv[1]); fprintf(stderr, "filter %s\n", argv[2]); fprintf(stderr, "output %s\n", argv[3]); if (strncmp(argv[3], "stdout", 7) == 0) { output = stdout; } else { if ((output = fopen(argv[3], "a")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s\n", argv[3], strerror(errno)); return EXIT_FAILURE; } } fprintf(stderr, "Read file '%s' into memory\n", argv[2]); root = read_file_into_tree(argv[2]); fprintf(stderr, "Begin filtering file '%s'\n", argv[1]); filter_rc = filter_file_with_tree(argv[1], output, root); fflush(output); fclose(output); fprintf(stderr, "Filtering finished, cleaning up.\n"); destroy_tree(root); if (filter_rc != 0) { LOGERR("problems during application of filter, check stderr for error messages\n"); } return filter_rc; }