1 files changed, 675 insertions, 0 deletions
diff --git a/src/mem_internal_check.c b/src/mem_internal_check.c
new file mode 100644
index 0000000..a519f19
--- /dev/null
+++ b/src/mem_internal_check.c
@@ -0,0 +1,675 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * Compare two files with a single SHA1 hash per line
+ * useful when comparing filesystem content:
+ * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt
+ * awk ...
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+#include <ctype.h>
+#include <inttypes.h>
+
+#include "output.h"
+#include "hex_conversion.h"
+
+
+#ifndef LINELENGTH
+#define LINELENGTH 1024
+#endif
+
+/* 8 bytes pointer to a smaller char array? As there are strict conditions in
+ * regards to the data which will be processed work with fix size arrays and
+ * do math hard coded at compile time. */
+#define NON_CONVERTED_SIZE      40
+#define FULL_DATABLOCK_SIZE     (NON_CONVERTED_SIZE/2)
+
+
+struct data_field {
+    bool active;
+    unsigned char data[FULL_DATABLOCK_SIZE];
+};
+
+struct list_node {
+    struct list_node *next;
+    struct data_field *field;
+};
+
+struct list_head {
+    struct list_node *first_node;
+    struct list_node *last_node;
+    size_t size;
+};
+
+struct list_collection {
+    struct list_head *heads[256];
+};
+
+
+struct data_array {
+    struct data_field **field;
+    size_t entry_points[256]; /* keep constant */
+    size_t opt_entry_points[256]; /* increase if referred target is deactivated */
+    size_t array_length;
+};
+
+
+void apply_filter_to_array(struct data_array *array, unsigned char *data);
+int compare_data_fields(const void *a, const void *b);
+struct data_array *convert_collection_to_array(struct list_collection *collection);
+struct list_collection *create_collection();
+struct list_head *create_list_head();
+struct list_node *create_list_node(unsigned char *data);
+void destroy_collection(struct list_collection *c);
+void destroy_data_array(struct data_array *da);
+void destroy_list(struct list_head *head);
+void fprint_array(FILE *fd, struct data_array *da);
+struct list_collection *import_file_into_collection(char *filename);
+bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data);
+bool run_filter_file_on_array(struct data_array *da, char *filename);
+bool search_data_in_array(struct data_array *da, unsigned char *data);
+bool verify_data_array(struct data_array *da);
+bool verify_list_collection(struct list_collection *lc);
+
+#ifdef DEBUGBUILD
+void print_collection_state(struct list_collection *coll);
+#endif
+
+
+/* === IMPLEMENTATION === */
+
+inline struct list_collection *create_collection() {
+    int i;
+    struct list_collection *c;
+
+    if ((c=calloc(1, sizeof(struct list_collection))) == NULL) {
+        return NULL;
+    }
+    for (i=0; i<256; i++) {
+        c->heads[i] = create_list_head();
+        if (c->heads[i] == NULL) { break; }
+    }
+    DBGTRC("DEBUG: %d lists initialized%s\n", i,
+            (i<256)?", aborting as not all 256 where" : "");
+    return c;
+}
+
+
+inline struct list_head *create_list_head() {
+    struct list_head *head = calloc(1, sizeof(struct list_head));
+    if (head == NULL) {
+        /* If the following masked fprintf calls fail there was a chance */
+        LOGERR("ERROR: Failed to allocate memory for list_head: ");
+        fputs(strerror(errno), stderr);
+        fputc('\n', stderr);
+        return NULL;
+    }
+    head->first_node = NULL;
+    head->last_node = NULL;
+    head->size = 0;
+    return head;
+}
+
+/* bsearch and similar  */
+int compare_data_fields(const void *a, const void *b) {
+    return memcmp(((struct data_field*)a)->data,
+            ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE);
+}
+
+
+inline bool search_data_in_array(struct data_array *array, unsigned char *data) {
+    struct data_field *df, tmp_df;
+    memcpy(tmp_df.data, data, FULL_DATABLOCK_SIZE);
+    /* Currently not working */
+    df = bsearch(&tmp_df, array->field,
+            array->array_length, sizeof(struct data_field),
+            compare_data_fields);
+    if (df != NULL) {
+        return true;
+    }
+    return false;
+}
+
+
+inline void destroy_collection(struct list_collection *c) {
+    int i;
+    if (c == NULL) return;
+    for (i=0; i<256; i++) {
+        destroy_list(c->heads[i]);
+    }
+    free(c);
+}
+
+
+inline void destroy_list(struct list_head *head) {
+    struct list_node *ptr=NULL;
+    if (head == NULL) return;
+    while (head->first_node !=  NULL) {
+        ptr = head->first_node;
+        head->first_node = head->first_node->next;
+        if (ptr->field != NULL)
+            free(ptr->field);
+        free(ptr);
+    }
+    free(head);
+    head = NULL;
+}
+
+inline void destroy_data_array(struct data_array *da) {
+    size_t i=0;
+    if (da == NULL) return;
+    for (;i<da->array_length; i++) {
+        if (da->field[i] != NULL) {
+            free(da->field[i]);
+        }
+    }
+    free(da->field);
+    free(da);
+}
+
+inline struct list_node *create_list_node(unsigned char *data) {
+    struct list_node *node;
+    if (data == NULL) return NULL;
+    if ((node=calloc(1,sizeof(struct list_node))) == NULL) {
+        LOGERR("ERROR: Failed to allocate a few bytes for new list_node\n");
+        return NULL;
+    }
+    if ((node->field=calloc(1,sizeof(struct data_field))) == NULL) {
+        LOGERR("ERROR: Failed to allocate a few bytes for new data_field\n");
+        free(node);
+        return NULL;
+    }
+    node->next = NULL;
+    node->field->active = true;
+    memcpy(node->field->data, data, FULL_DATABLOCK_SIZE);
+    return node;
+}
+
+
+/* If the input data's already sorted best performance, worst case (O^n)/256 if it's reverse sorted. */
+inline bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data) {
+    struct list_node *node_ptr = NULL, *tmp_node = NULL;
+    struct list_head *head_ptr = NULL;
+    int cmp = 0;
+#ifdef DEBUGBUILD
+    static size_t fcall_count = 0;
+#endif
+
+    if (collection == NULL || data == NULL) {
+        DBGTRC("ERROR: collection set%s / data set%s\n",
+                ((collection==NULL) ? " NULL" : ""),
+                ((data==NULL)       ? " NULL" : ""));
+        return false;
+    }
+
+#ifdef DEBUGBUILD
+    DBGTRC("DEBUG: fcall_count = %lu / data[0]=0x%02X / decimal: data[0]=%3u\n",
+            ++fcall_count, data[0], (unsigned int)data[0]);
+#endif
+
+    head_ptr = collection->heads[data[0]];
+    if (head_ptr->first_node == NULL) {
+        DBGTRC("DEBUG: First node for list %d\n", data[0]);
+        if ((tmp_node = create_list_node(data)) == NULL) {
+            return false;
+        }
+        head_ptr->first_node = tmp_node;
+        head_ptr->last_node = tmp_node;
+        head_ptr->size = 1;
+        return true;
+    }
+
+    cmp = memcmp(data, head_ptr->last_node->field->data, FULL_DATABLOCK_SIZE);
+    if (cmp == 0) {
+        /* duplicate, skip */
+        return true;
+    }
+
+    /* Append to list */
+    if (cmp > 0) {
+        if ((tmp_node = create_list_node(data)) == NULL) {
+            return false;
+        }
+        head_ptr->last_node->next = tmp_node;
+        head_ptr->last_node = tmp_node;
+        head_ptr->size++;
+        return true;
+    }
+
+    node_ptr = head_ptr->first_node;
+    cmp = memcmp(data, node_ptr->field->data, FULL_DATABLOCK_SIZE);
+    if (cmp == 0) { return true; }
+    /* First to enter? Prepend to list*/
+    if (cmp < 0) {
+        if ((tmp_node = create_list_node(data)) == NULL) {
+            return false;
+        }
+        tmp_node->next = head_ptr->first_node;
+        head_ptr->first_node = tmp_node;
+        head_ptr->size++;
+        return true;
+    }
+
+    /* Sloooow part... */
+    while (node_ptr->next != NULL) {
+        /* check next in list */
+        cmp = memcmp(data, node_ptr->next->field->data, FULL_DATABLOCK_SIZE);
+        if (cmp == 0) { return true; }
+        /* insert by prepend */
+        if (cmp > 0) {
+            if ((tmp_node = create_list_node(data)) == NULL) {
+                return false;
+            }
+            tmp_node->next = node_ptr->next;
+            node_ptr->next = tmp_node;
+            head_ptr->size++;
+            return true;
+        }
+        /* continue */
+        node_ptr = node_ptr->next;
+    }
+    LOGERR("ERROR: Could not insert dataset into list, missed possible situation...\n");
+    return false;
+}
+
+
+inline struct data_array *convert_collection_to_array(struct list_collection *collection) {
+    int i;
+    size_t s;
+    struct data_array *da = NULL;
+    struct list_node *ptr, *tmp;
+
+    if (collection == NULL) {
+        LOGERR("ERROR: No collection given.\n");
+        return NULL;
+    }
+
+    s = 0;
+    for (i=0; i<256; i++) {
+#ifdef DEBUGBUILD
+        if (collection->heads[i]->size > 0) {
+            DBGTRC("DEBUG: collections->heads[%d]->size = %lu\n",
+                i, collection->heads[i]->size);
+        }
+#endif
+        s += collection->heads[i]->size;
+        DBGTRC("DEBUG: collection->heads[%d]->size = %lu\n", i, collection->heads[i]->size);
+    }
+    DBGTRC("DEBUG: Total size %lu\n", s);
+
+    if (s == 0) {
+        LOGERR("ERROR: Empty collection.");
+        return NULL;
+    }
+
+    if ((da = calloc(1, sizeof(struct data_array))) == NULL) {
+        LOGERR("ERROR: Failed to allocate %lu bytes...\n", sizeof(struct data_array));
+        return NULL;
+    }
+
+    da->array_length = s;
+    if ((da->field=calloc(da->array_length, sizeof(struct data_field*))) == NULL) {
+        free(da);
+        LOGERR("ERROR: Failed to allocate memory for metadata\n");
+        return NULL;
+    }
+
+    s = 0;
+    for (i=0; i<256; i++) {
+        DBGTRC("DEBUG: i=%d / s=%lu\n", i, s);
+        ptr = collection->heads[i]->first_node;
+        da->entry_points[i] = s;
+        da->opt_entry_points[i] = s;
+#ifdef DEBUGBUILD
+        if (ptr != NULL) {
+            DBGTRC("DEBUG: list %d in collection, claims size %lu\n", i,
+                    collection->heads[i]->size);
+        }
+#endif
+        while (ptr != NULL) {
+            DBGTRC("DEBUG: Setting field %lu to %s\n", s,
+                    (ptr->field == NULL) ? "NULL" : "valid value");
+            da->field[s] = ptr->field;
+            s += 1; /* field[s++] failed... compiler optimization? */
+            tmp = ptr->next;
+            ptr->next = NULL;
+            free(ptr);
+            ptr = tmp;
+        }
+        collection->heads[i]->first_node = NULL;
+        collection->heads[i]->last_node = NULL;
+        collection->heads[i]->size = 0;
+    }
+
+    destroy_collection(collection);
+
+    return da;
+}
+
+
+/* This is acceptable if the largest can be hold in memory, but not the other way */
+inline void apply_filter_to_array(struct data_array *array, unsigned char *data) {
+    size_t pos=0, limit = 0;
+    int cmp;
+
+
+    if (array==NULL || data==NULL) {
+        return;
+    }
+
+    /* Manual search with entry_points */
+    if (data[0] == 255) {
+        limit = array->array_length;
+    } else {
+        limit = array->entry_points[data[0]+1];
+    }
+
+    pos=array->opt_entry_points[data[0]];
+    DBGTRC("DEBUG: opt_entry_point=%-8lu / limit=%-8lu\n", pos, limit);
+    DBGTRC("DEBUG: data[0]=0x%02X / decimal: data[0]=%3u\n",
+            data[0], (unsigned int)data[0]);
+    /* TODO: Evaluate whether this loop should be handlued with openmp */
+    for (; pos<limit; pos++) {
+        DBGTRC("DEBUG: pos=%lu\n", pos);
+        if (array->field[pos]->active) {
+            /* perf optimization, single byte faster than memcmp on 20 bytes */
+            if (data[1] < array->field[pos]->data[1]) {
+                continue;
+            }
+            cmp = memcmp(data, array->field[pos]->data, FULL_DATABLOCK_SIZE);
+            DBGTRC("cmp: %d\n", cmp);
+            if (cmp == 0) {
+                array->field[pos]->active = false;
+                if (pos == array->opt_entry_points[data[0]]) {
+                    array->opt_entry_points[data[0]]++;
+                }
+                return;
+            }
+#if 0
+            else if (data[1] > array->field[pos]->data[1]) {
+                return;
+            }
+#else
+            /* Currently I don't trust this memcmp result 100%. testdata created in shell
+             * for i in $(seq 1 20000); do echo "foo_$i" | sha1sum | awk ' { print $1; } ' ; done
+             * picking a random subset from there to be used as filters lead to sometimes leaving
+             * the loop too early with this else... Comparing only the second byte may is also too
+             * vague with really large datasets */
+            else if (cmp < 0) {
+                /* array content is sorted, so abort once the possible match has been passed */
+                return;
+            }
+#endif
+        }
+    }
+}
+
+
+struct list_collection *import_file_into_collection(char *filename) {
+    FILE *fd = NULL;
+    char line[LINELENGTH];
+    unsigned char data[FULL_DATABLOCK_SIZE];
+    size_t line_nr = 0;
+    struct list_collection *collection = NULL;
+
+    if (filename == NULL || filename[0] == '\0') {
+        LOGERR("ERROR: no filename given.\n");
+        return NULL;
+    }
+
+    if ((collection = create_collection()) == NULL) {
+        LOGERR("ERROR: Failed to allocate a few bytes for collection.\n");
+        return NULL;
+    }
+
+    if ((fd = fopen(filename, "r")) == NULL) {
+        LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+                filename, strerror(errno), errno);
+        destroy_collection(collection);
+        return NULL;
+    }
+
+    while (fgets(line, LINELENGTH, fd) != NULL) {
+        line_nr++;
+        DBGTRC("DEBUG: line %lu = %s", line_nr, line);
+        if (convert_line(line)<0) {
+            LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr);
+            destroy_collection(collection);
+            fclose(fd);
+            return NULL;
+        }
+
+        if (convert_to_binary(line, data) == NULL) {
+            destroy_collection(collection);
+            fclose(fd);
+            LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr);
+            return NULL;
+        }
+
+        if (!insert_into_sorted_collection(collection, data)) {
+            destroy_collection(collection);
+            fclose(fd);
+            LOGERR("ERROR: Failed to insert line %lu in binary form into collection.\n", line_nr);
+            return NULL;
+        }
+    }
+
+    DBGTRC("DEBUG: lines read %lu\n", line_nr);
+    fclose(fd);
+
+    return collection;
+}
+
+
+bool run_filter_file_on_array(struct data_array *da, char *filename) {
+    FILE *fd = NULL;
+    size_t line_nr = 0;
+    char line[LINELENGTH];
+    unsigned char data[FULL_DATABLOCK_SIZE];
+
+    if (da == NULL || filename == NULL || filename[0] == '\0') {
+        LOGERR("ERROR: compromised arguments at call.\n");
+        return false;
+    }
+    if ((fd=fopen(filename, "r")) == NULL) {
+        LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+                filename, strerror(errno), errno);
+        return false;
+    }
+
+    /* Be nice and skip and not abort */
+    while (fgets(line, LINELENGTH, fd) != NULL) {
+        line_nr++;
+        DBGTRC("READ LINE %lu: %s", line_nr, line);
+        if (convert_line(line)<0) {
+            LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr);
+            continue;
+        }
+        if (convert_to_binary(line, data) == NULL) {
+            LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr);
+            continue;
+        }
+        apply_filter_to_array(da, (unsigned char*)data);
+    }
+
+    fclose(fd);
+
+    return true;
+}
+
+
+void fprint_array(FILE *fd, struct data_array *da) {
+    FILE *fdout = stdout;
+    char *plaintext = NULL;
+    size_t pos;
+    if (da == NULL) { return; }
+    if (fd != NULL) { fdout = fd; }
+    DBGTRC("DEBUG: da->array_length = %lu\n", da->array_length);
+    for (pos=0; pos < da->array_length; pos++) {
+        DBGTRC("DEBUG: pos %lu --- %s\n", pos, (da->field[pos] == NULL)? "NULL" : "element exists");
+        if (da->field[pos]->active) {
+            plaintext = convert_from_binary(da->field[pos]->data, FULL_DATABLOCK_SIZE, plaintext);
+            fputs(plaintext, fdout);
+            fputc('\n', fdout);
+        }
+    }
+    if (plaintext != NULL) { free(plaintext); };
+}
+
+/* To be used on a new array */
+bool verify_data_array(struct data_array *da) {
+    bool res = true;
+    size_t p = 0;
+    int repeated_error = 0;
+    
+    if (da == NULL) {
+        LOGERR("No array given...\n");
+        return false;
+    }
+
+    /* Check entry points */
+    for (p=0; p<256; p++) {
+        if (da->entry_points[p] != da->opt_entry_points[p]) {
+            LOGERR("MISMATCH entry_points[%lu] <-> opt_entry_points[%lu]\n", p, p);
+            res = false;
+        }
+    }
+
+    for (p=0; p<da->array_length; p++) {
+        if (da->field[p] == NULL) {
+            LOGERR("ERROR: field %lu of %lu is NULL.\n", p, da->array_length);
+            res = false;
+            repeated_error++;
+            if (repeated_error > 5)
+                return res;
+        }
+    }
+
+    return res;
+}
+
+
+bool verify_list_collection(struct list_collection *lc) {
+    bool res = true;
+    int i;
+    size_t count;
+    struct list_node *ptr;
+
+    for (i=0; i<256; i++) {
+        count = 0;
+        ptr = lc->heads[i]->first_node;
+
+        while (ptr != NULL) {
+            count++;
+            ptr = ptr->next;
+        }
+
+        LOGERR("DEBUG: list %3d/0x%02X: size %lu, counted %lu => does%s match.\n",
+                i, i, lc->heads[i]->size, count,
+                (lc->heads[i]->size != count)?" NOT":"");
+        if (lc->heads[i]->size != count) {
+            res = false;
+        }
+    }
+
+    return res;
+}
+
+
+#ifdef DEBUGBUILD
+void print_collection_state(struct list_collection *coll) {
+    int i = 0;
+    struct list_head *h = NULL;
+    struct list_node *ptr = NULL;
+
+    if (coll == NULL) {
+        LOGERR("ERROR: No collection given\n");
+        return;
+    }
+
+    for (i=0; i<256; i++) {
+        h = coll->heads[i];
+        ptr = h->first_node;
+        if (h->size == 0 || ptr == NULL) {
+            if (!(h->size == 0 && ptr == NULL)) {
+                DBGTRC("DEBUG: ERROR item %3d not matching size %lu, first_node %p\n", i, h->size, h->first_node);
+            }
+            continue;
+        }
+        DBGTRC("DEBUG: List %3d, size %lu, first_node %p\n", i, h->size, h->first_node);
+    }
+
+}
+#endif
+
+int main(int argc, char **argv) {
+    FILE *output = NULL;
+    int i = 0;
+    size_t s = 0;
+    struct list_collection *collection;
+    struct data_array *array;
+
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s output raw_data filter_files...\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    s = strlen(argv[1]);
+    if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) {
+        output = stdout;
+    } else {
+        if ((output=fopen(argv[1], "w")) == NULL) {
+            LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+                    argv[1], strerror(errno), errno);
+            return EXIT_FAILURE;
+        }
+    }
+
+    LOGERR("IMPORT FILE %s\n", argv[2]);
+    collection = import_file_into_collection(argv[2]);
+    if (collection == NULL) {
+        return EXIT_FAILURE;
+    }
+/* TODO: Hide this behind some option
+    if ( ! verify_list_collection(collection) ) {
+        return EXIT_FAILURE;
+    }
+*/
+    array = convert_collection_to_array(collection);
+    if (array == NULL) {
+        return EXIT_FAILURE;
+    }
+    collection = NULL;
+
+/* TODO: Hide this behind an option
+    if ( ! verify_data_array(array) ) {
+        return EXIT_FAILURE;
+    }
+*/
+
+    for (i=3; i<argc; i++) {
+        LOGERR("APPLY FILTER FILE %s\n", argv[i]);
+        if (!run_filter_file_on_array(array, argv[i])) {
+            LOGERR("ERROR: Aborting.\n");
+            return EXIT_FAILURE;
+        }
+    }
+    DBGTRC("DEBUG: Filter applied, final result:\n");
+    fprint_array(output, array);
+    fflush(output);
+    fclose(output);
+
+    destroy_data_array(array);
+
+    return EXIT_SUCCESS;
+}
+