diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/mem_internal_check.c | 675 |
1 files changed, 675 insertions, 0 deletions
diff --git a/src/mem_internal_check.c b/src/mem_internal_check.c new file mode 100644 index 0000000..a519f19 --- /dev/null +++ b/src/mem_internal_check.c @@ -0,0 +1,675 @@ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2025 Thorsten Töpper + * + * Compare two files with a single SHA1 hash per line + * useful when comparing filesystem content: + * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt + * awk ... + * + * vim:ts=4:sw=4:expandtab + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdbool.h> +#include <stdint.h> +#include <errno.h> +#include <ctype.h> +#include <inttypes.h> + +#include "output.h" +#include "hex_conversion.h" + + +#ifndef LINELENGTH +#define LINELENGTH 1024 +#endif + +/* 8 bytes pointer to a smaller char array? As there are strict conditions in + * regards to the data which will be processed work with fix size arrays and + * do math hard coded at compile time. */ +#define NON_CONVERTED_SIZE 40 +#define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) + + +struct data_field { + bool active; + unsigned char data[FULL_DATABLOCK_SIZE]; +}; + +struct list_node { + struct list_node *next; + struct data_field *field; +}; + +struct list_head { + struct list_node *first_node; + struct list_node *last_node; + size_t size; +}; + +struct list_collection { + struct list_head *heads[256]; +}; + + +struct data_array { + struct data_field **field; + size_t entry_points[256]; /* keep constant */ + size_t opt_entry_points[256]; /* increase if referred target is deactivated */ + size_t array_length; +}; + + +void apply_filter_to_array(struct data_array *array, unsigned char *data); +int compare_data_fields(const void *a, const void *b); +struct data_array *convert_collection_to_array(struct list_collection *collection); +struct list_collection *create_collection(); +struct list_head *create_list_head(); +struct list_node *create_list_node(unsigned char *data); +void destroy_collection(struct list_collection *c); +void destroy_data_array(struct data_array *da); +void destroy_list(struct list_head *head); +void fprint_array(FILE *fd, struct data_array *da); +struct list_collection *import_file_into_collection(char *filename); +bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data); +bool run_filter_file_on_array(struct data_array *da, char *filename); +bool search_data_in_array(struct data_array *da, unsigned char *data); +bool verify_data_array(struct data_array *da); +bool verify_list_collection(struct list_collection *lc); + +#ifdef DEBUGBUILD +void print_collection_state(struct list_collection *coll); +#endif + + +/* === IMPLEMENTATION === */ + +inline struct list_collection *create_collection() { + int i; + struct list_collection *c; + + if ((c=calloc(1, sizeof(struct list_collection))) == NULL) { + return NULL; + } + for (i=0; i<256; i++) { + c->heads[i] = create_list_head(); + if (c->heads[i] == NULL) { break; } + } + DBGTRC("DEBUG: %d lists initialized%s\n", i, + (i<256)?", aborting as not all 256 where" : ""); + return c; +} + + +inline struct list_head *create_list_head() { + struct list_head *head = calloc(1, sizeof(struct list_head)); + if (head == NULL) { + /* If the following masked fprintf calls fail there was a chance */ + LOGERR("ERROR: Failed to allocate memory for list_head: "); + fputs(strerror(errno), stderr); + fputc('\n', stderr); + return NULL; + } + head->first_node = NULL; + head->last_node = NULL; + head->size = 0; + return head; +} + +/* bsearch and similar */ +int compare_data_fields(const void *a, const void *b) { + return memcmp(((struct data_field*)a)->data, + ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); +} + + +inline bool search_data_in_array(struct data_array *array, unsigned char *data) { + struct data_field *df, tmp_df; + memcpy(tmp_df.data, data, FULL_DATABLOCK_SIZE); + /* Currently not working */ + df = bsearch(&tmp_df, array->field, + array->array_length, sizeof(struct data_field), + compare_data_fields); + if (df != NULL) { + return true; + } + return false; +} + + +inline void destroy_collection(struct list_collection *c) { + int i; + if (c == NULL) return; + for (i=0; i<256; i++) { + destroy_list(c->heads[i]); + } + free(c); +} + + +inline void destroy_list(struct list_head *head) { + struct list_node *ptr=NULL; + if (head == NULL) return; + while (head->first_node != NULL) { + ptr = head->first_node; + head->first_node = head->first_node->next; + if (ptr->field != NULL) + free(ptr->field); + free(ptr); + } + free(head); + head = NULL; +} + +inline void destroy_data_array(struct data_array *da) { + size_t i=0; + if (da == NULL) return; + for (;i<da->array_length; i++) { + if (da->field[i] != NULL) { + free(da->field[i]); + } + } + free(da->field); + free(da); +} + +inline struct list_node *create_list_node(unsigned char *data) { + struct list_node *node; + if (data == NULL) return NULL; + if ((node=calloc(1,sizeof(struct list_node))) == NULL) { + LOGERR("ERROR: Failed to allocate a few bytes for new list_node\n"); + return NULL; + } + if ((node->field=calloc(1,sizeof(struct data_field))) == NULL) { + LOGERR("ERROR: Failed to allocate a few bytes for new data_field\n"); + free(node); + return NULL; + } + node->next = NULL; + node->field->active = true; + memcpy(node->field->data, data, FULL_DATABLOCK_SIZE); + return node; +} + + +/* If the input data's already sorted best performance, worst case (O^n)/256 if it's reverse sorted. */ +inline bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data) { + struct list_node *node_ptr = NULL, *tmp_node = NULL; + struct list_head *head_ptr = NULL; + int cmp = 0; +#ifdef DEBUGBUILD + static size_t fcall_count = 0; +#endif + + if (collection == NULL || data == NULL) { + DBGTRC("ERROR: collection set%s / data set%s\n", + ((collection==NULL) ? " NULL" : ""), + ((data==NULL) ? " NULL" : "")); + return false; + } + +#ifdef DEBUGBUILD + DBGTRC("DEBUG: fcall_count = %lu / data[0]=0x%02X / decimal: data[0]=%3u\n", + ++fcall_count, data[0], (unsigned int)data[0]); +#endif + + head_ptr = collection->heads[data[0]]; + if (head_ptr->first_node == NULL) { + DBGTRC("DEBUG: First node for list %d\n", data[0]); + if ((tmp_node = create_list_node(data)) == NULL) { + return false; + } + head_ptr->first_node = tmp_node; + head_ptr->last_node = tmp_node; + head_ptr->size = 1; + return true; + } + + cmp = memcmp(data, head_ptr->last_node->field->data, FULL_DATABLOCK_SIZE); + if (cmp == 0) { + /* duplicate, skip */ + return true; + } + + /* Append to list */ + if (cmp > 0) { + if ((tmp_node = create_list_node(data)) == NULL) { + return false; + } + head_ptr->last_node->next = tmp_node; + head_ptr->last_node = tmp_node; + head_ptr->size++; + return true; + } + + node_ptr = head_ptr->first_node; + cmp = memcmp(data, node_ptr->field->data, FULL_DATABLOCK_SIZE); + if (cmp == 0) { return true; } + /* First to enter? Prepend to list*/ + if (cmp < 0) { + if ((tmp_node = create_list_node(data)) == NULL) { + return false; + } + tmp_node->next = head_ptr->first_node; + head_ptr->first_node = tmp_node; + head_ptr->size++; + return true; + } + + /* Sloooow part... */ + while (node_ptr->next != NULL) { + /* check next in list */ + cmp = memcmp(data, node_ptr->next->field->data, FULL_DATABLOCK_SIZE); + if (cmp == 0) { return true; } + /* insert by prepend */ + if (cmp > 0) { + if ((tmp_node = create_list_node(data)) == NULL) { + return false; + } + tmp_node->next = node_ptr->next; + node_ptr->next = tmp_node; + head_ptr->size++; + return true; + } + /* continue */ + node_ptr = node_ptr->next; + } + LOGERR("ERROR: Could not insert dataset into list, missed possible situation...\n"); + return false; +} + + +inline struct data_array *convert_collection_to_array(struct list_collection *collection) { + int i; + size_t s; + struct data_array *da = NULL; + struct list_node *ptr, *tmp; + + if (collection == NULL) { + LOGERR("ERROR: No collection given.\n"); + return NULL; + } + + s = 0; + for (i=0; i<256; i++) { +#ifdef DEBUGBUILD + if (collection->heads[i]->size > 0) { + DBGTRC("DEBUG: collections->heads[%d]->size = %lu\n", + i, collection->heads[i]->size); + } +#endif + s += collection->heads[i]->size; + DBGTRC("DEBUG: collection->heads[%d]->size = %lu\n", i, collection->heads[i]->size); + } + DBGTRC("DEBUG: Total size %lu\n", s); + + if (s == 0) { + LOGERR("ERROR: Empty collection."); + return NULL; + } + + if ((da = calloc(1, sizeof(struct data_array))) == NULL) { + LOGERR("ERROR: Failed to allocate %lu bytes...\n", sizeof(struct data_array)); + return NULL; + } + + da->array_length = s; + if ((da->field=calloc(da->array_length, sizeof(struct data_field*))) == NULL) { + free(da); + LOGERR("ERROR: Failed to allocate memory for metadata\n"); + return NULL; + } + + s = 0; + for (i=0; i<256; i++) { + DBGTRC("DEBUG: i=%d / s=%lu\n", i, s); + ptr = collection->heads[i]->first_node; + da->entry_points[i] = s; + da->opt_entry_points[i] = s; +#ifdef DEBUGBUILD + if (ptr != NULL) { + DBGTRC("DEBUG: list %d in collection, claims size %lu\n", i, + collection->heads[i]->size); + } +#endif + while (ptr != NULL) { + DBGTRC("DEBUG: Setting field %lu to %s\n", s, + (ptr->field == NULL) ? "NULL" : "valid value"); + da->field[s] = ptr->field; + s += 1; /* field[s++] failed... compiler optimization? */ + tmp = ptr->next; + ptr->next = NULL; + free(ptr); + ptr = tmp; + } + collection->heads[i]->first_node = NULL; + collection->heads[i]->last_node = NULL; + collection->heads[i]->size = 0; + } + + destroy_collection(collection); + + return da; +} + + +/* This is acceptable if the largest can be hold in memory, but not the other way */ +inline void apply_filter_to_array(struct data_array *array, unsigned char *data) { + size_t pos=0, limit = 0; + int cmp; + + + if (array==NULL || data==NULL) { + return; + } + + /* Manual search with entry_points */ + if (data[0] == 255) { + limit = array->array_length; + } else { + limit = array->entry_points[data[0]+1]; + } + + pos=array->opt_entry_points[data[0]]; + DBGTRC("DEBUG: opt_entry_point=%-8lu / limit=%-8lu\n", pos, limit); + DBGTRC("DEBUG: data[0]=0x%02X / decimal: data[0]=%3u\n", + data[0], (unsigned int)data[0]); + /* TODO: Evaluate whether this loop should be handlued with openmp */ + for (; pos<limit; pos++) { + DBGTRC("DEBUG: pos=%lu\n", pos); + if (array->field[pos]->active) { + /* perf optimization, single byte faster than memcmp on 20 bytes */ + if (data[1] < array->field[pos]->data[1]) { + continue; + } + cmp = memcmp(data, array->field[pos]->data, FULL_DATABLOCK_SIZE); + DBGTRC("cmp: %d\n", cmp); + if (cmp == 0) { + array->field[pos]->active = false; + if (pos == array->opt_entry_points[data[0]]) { + array->opt_entry_points[data[0]]++; + } + return; + } +#if 0 + else if (data[1] > array->field[pos]->data[1]) { + return; + } +#else + /* Currently I don't trust this memcmp result 100%. testdata created in shell + * for i in $(seq 1 20000); do echo "foo_$i" | sha1sum | awk ' { print $1; } ' ; done + * picking a random subset from there to be used as filters lead to sometimes leaving + * the loop too early with this else... Comparing only the second byte may is also too + * vague with really large datasets */ + else if (cmp < 0) { + /* array content is sorted, so abort once the possible match has been passed */ + return; + } +#endif + } + } +} + + +struct list_collection *import_file_into_collection(char *filename) { + FILE *fd = NULL; + char line[LINELENGTH]; + unsigned char data[FULL_DATABLOCK_SIZE]; + size_t line_nr = 0; + struct list_collection *collection = NULL; + + if (filename == NULL || filename[0] == '\0') { + LOGERR("ERROR: no filename given.\n"); + return NULL; + } + + if ((collection = create_collection()) == NULL) { + LOGERR("ERROR: Failed to allocate a few bytes for collection.\n"); + return NULL; + } + + if ((fd = fopen(filename, "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + filename, strerror(errno), errno); + destroy_collection(collection); + return NULL; + } + + while (fgets(line, LINELENGTH, fd) != NULL) { + line_nr++; + DBGTRC("DEBUG: line %lu = %s", line_nr, line); + if (convert_line(line)<0) { + LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); + destroy_collection(collection); + fclose(fd); + return NULL; + } + + if (convert_to_binary(line, data) == NULL) { + destroy_collection(collection); + fclose(fd); + LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); + return NULL; + } + + if (!insert_into_sorted_collection(collection, data)) { + destroy_collection(collection); + fclose(fd); + LOGERR("ERROR: Failed to insert line %lu in binary form into collection.\n", line_nr); + return NULL; + } + } + + DBGTRC("DEBUG: lines read %lu\n", line_nr); + fclose(fd); + + return collection; +} + + +bool run_filter_file_on_array(struct data_array *da, char *filename) { + FILE *fd = NULL; + size_t line_nr = 0; + char line[LINELENGTH]; + unsigned char data[FULL_DATABLOCK_SIZE]; + + if (da == NULL || filename == NULL || filename[0] == '\0') { + LOGERR("ERROR: compromised arguments at call.\n"); + return false; + } + if ((fd=fopen(filename, "r")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + filename, strerror(errno), errno); + return false; + } + + /* Be nice and skip and not abort */ + while (fgets(line, LINELENGTH, fd) != NULL) { + line_nr++; + DBGTRC("READ LINE %lu: %s", line_nr, line); + if (convert_line(line)<0) { + LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); + continue; + } + if (convert_to_binary(line, data) == NULL) { + LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); + continue; + } + apply_filter_to_array(da, (unsigned char*)data); + } + + fclose(fd); + + return true; +} + + +void fprint_array(FILE *fd, struct data_array *da) { + FILE *fdout = stdout; + char *plaintext = NULL; + size_t pos; + if (da == NULL) { return; } + if (fd != NULL) { fdout = fd; } + DBGTRC("DEBUG: da->array_length = %lu\n", da->array_length); + for (pos=0; pos < da->array_length; pos++) { + DBGTRC("DEBUG: pos %lu --- %s\n", pos, (da->field[pos] == NULL)? "NULL" : "element exists"); + if (da->field[pos]->active) { + plaintext = convert_from_binary(da->field[pos]->data, FULL_DATABLOCK_SIZE, plaintext); + fputs(plaintext, fdout); + fputc('\n', fdout); + } + } + if (plaintext != NULL) { free(plaintext); }; +} + +/* To be used on a new array */ +bool verify_data_array(struct data_array *da) { + bool res = true; + size_t p = 0; + int repeated_error = 0; + + if (da == NULL) { + LOGERR("No array given...\n"); + return false; + } + + /* Check entry points */ + for (p=0; p<256; p++) { + if (da->entry_points[p] != da->opt_entry_points[p]) { + LOGERR("MISMATCH entry_points[%lu] <-> opt_entry_points[%lu]\n", p, p); + res = false; + } + } + + for (p=0; p<da->array_length; p++) { + if (da->field[p] == NULL) { + LOGERR("ERROR: field %lu of %lu is NULL.\n", p, da->array_length); + res = false; + repeated_error++; + if (repeated_error > 5) + return res; + } + } + + return res; +} + + +bool verify_list_collection(struct list_collection *lc) { + bool res = true; + int i; + size_t count; + struct list_node *ptr; + + for (i=0; i<256; i++) { + count = 0; + ptr = lc->heads[i]->first_node; + + while (ptr != NULL) { + count++; + ptr = ptr->next; + } + + LOGERR("DEBUG: list %3d/0x%02X: size %lu, counted %lu => does%s match.\n", + i, i, lc->heads[i]->size, count, + (lc->heads[i]->size != count)?" NOT":""); + if (lc->heads[i]->size != count) { + res = false; + } + } + + return res; +} + + +#ifdef DEBUGBUILD +void print_collection_state(struct list_collection *coll) { + int i = 0; + struct list_head *h = NULL; + struct list_node *ptr = NULL; + + if (coll == NULL) { + LOGERR("ERROR: No collection given\n"); + return; + } + + for (i=0; i<256; i++) { + h = coll->heads[i]; + ptr = h->first_node; + if (h->size == 0 || ptr == NULL) { + if (!(h->size == 0 && ptr == NULL)) { + DBGTRC("DEBUG: ERROR item %3d not matching size %lu, first_node %p\n", i, h->size, h->first_node); + } + continue; + } + DBGTRC("DEBUG: List %3d, size %lu, first_node %p\n", i, h->size, h->first_node); + } + +} +#endif + +int main(int argc, char **argv) { + FILE *output = NULL; + int i = 0; + size_t s = 0; + struct list_collection *collection; + struct data_array *array; + + if (argc < 3) { + fprintf(stderr, "Usage: %s output raw_data filter_files...\n", argv[0]); + return EXIT_FAILURE; + } + + s = strlen(argv[1]); + if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { + output = stdout; + } else { + if ((output=fopen(argv[1], "w")) == NULL) { + LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", + argv[1], strerror(errno), errno); + return EXIT_FAILURE; + } + } + + LOGERR("IMPORT FILE %s\n", argv[2]); + collection = import_file_into_collection(argv[2]); + if (collection == NULL) { + return EXIT_FAILURE; + } +/* TODO: Hide this behind some option + if ( ! verify_list_collection(collection) ) { + return EXIT_FAILURE; + } +*/ + array = convert_collection_to_array(collection); + if (array == NULL) { + return EXIT_FAILURE; + } + collection = NULL; + +/* TODO: Hide this behind an option + if ( ! verify_data_array(array) ) { + return EXIT_FAILURE; + } +*/ + + for (i=3; i<argc; i++) { + LOGERR("APPLY FILTER FILE %s\n", argv[i]); + if (!run_filter_file_on_array(array, argv[i])) { + LOGERR("ERROR: Aborting.\n"); + return EXIT_FAILURE; + } + } + DBGTRC("DEBUG: Filter applied, final result:\n"); + fprint_array(output, array); + fflush(output); + fclose(output); + + destroy_data_array(array); + + return EXIT_SUCCESS; +} + |
