aboutsummaryrefslogtreecommitdiff
path: root/src/mem_internal_check.c
diff options
context:
space:
mode:
authorThorsten Töpper <atsutane@freethoughts.de>2025-08-19 23:48:17 +0200
committerThorsten Töpper <atsutane@freethoughts.de>2025-08-19 23:48:17 +0200
commit1dd1d4291254ef87e0101a9e6a7da8618c64de1d (patch)
tree95671e881c68043eec78de70cbf6ac66ba089d89 /src/mem_internal_check.c
parent79cf7bedba215bc5b0b8e89c020ebb06e6038f8e (diff)
downloadsmall-utils-1dd1d4291254ef87e0101a9e6a7da8618c64de1d.tar.gz
small-utils-1dd1d4291254ef87e0101a9e6a7da8618c64de1d.tar.bz2
mem_internal_check: alternative to tree_based_check
Diffstat (limited to 'src/mem_internal_check.c')
-rw-r--r--src/mem_internal_check.c675
1 files changed, 675 insertions, 0 deletions
diff --git a/src/mem_internal_check.c b/src/mem_internal_check.c
new file mode 100644
index 0000000..a519f19
--- /dev/null
+++ b/src/mem_internal_check.c
@@ -0,0 +1,675 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2025 Thorsten Töpper
+ *
+ * Compare two files with a single SHA1 hash per line
+ * useful when comparing filesystem content:
+ * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt
+ * awk ...
+ *
+ * vim:ts=4:sw=4:expandtab
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+#include <ctype.h>
+#include <inttypes.h>
+
+#include "output.h"
+#include "hex_conversion.h"
+
+
+#ifndef LINELENGTH
+#define LINELENGTH 1024
+#endif
+
+/* 8 bytes pointer to a smaller char array? As there are strict conditions in
+ * regards to the data which will be processed work with fix size arrays and
+ * do math hard coded at compile time. */
+#define NON_CONVERTED_SIZE 40
+#define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2)
+
+
+struct data_field {
+ bool active;
+ unsigned char data[FULL_DATABLOCK_SIZE];
+};
+
+struct list_node {
+ struct list_node *next;
+ struct data_field *field;
+};
+
+struct list_head {
+ struct list_node *first_node;
+ struct list_node *last_node;
+ size_t size;
+};
+
+struct list_collection {
+ struct list_head *heads[256];
+};
+
+
+struct data_array {
+ struct data_field **field;
+ size_t entry_points[256]; /* keep constant */
+ size_t opt_entry_points[256]; /* increase if referred target is deactivated */
+ size_t array_length;
+};
+
+
+void apply_filter_to_array(struct data_array *array, unsigned char *data);
+int compare_data_fields(const void *a, const void *b);
+struct data_array *convert_collection_to_array(struct list_collection *collection);
+struct list_collection *create_collection();
+struct list_head *create_list_head();
+struct list_node *create_list_node(unsigned char *data);
+void destroy_collection(struct list_collection *c);
+void destroy_data_array(struct data_array *da);
+void destroy_list(struct list_head *head);
+void fprint_array(FILE *fd, struct data_array *da);
+struct list_collection *import_file_into_collection(char *filename);
+bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data);
+bool run_filter_file_on_array(struct data_array *da, char *filename);
+bool search_data_in_array(struct data_array *da, unsigned char *data);
+bool verify_data_array(struct data_array *da);
+bool verify_list_collection(struct list_collection *lc);
+
+#ifdef DEBUGBUILD
+void print_collection_state(struct list_collection *coll);
+#endif
+
+
+/* === IMPLEMENTATION === */
+
+inline struct list_collection *create_collection() {
+ int i;
+ struct list_collection *c;
+
+ if ((c=calloc(1, sizeof(struct list_collection))) == NULL) {
+ return NULL;
+ }
+ for (i=0; i<256; i++) {
+ c->heads[i] = create_list_head();
+ if (c->heads[i] == NULL) { break; }
+ }
+ DBGTRC("DEBUG: %d lists initialized%s\n", i,
+ (i<256)?", aborting as not all 256 where" : "");
+ return c;
+}
+
+
+inline struct list_head *create_list_head() {
+ struct list_head *head = calloc(1, sizeof(struct list_head));
+ if (head == NULL) {
+ /* If the following masked fprintf calls fail there was a chance */
+ LOGERR("ERROR: Failed to allocate memory for list_head: ");
+ fputs(strerror(errno), stderr);
+ fputc('\n', stderr);
+ return NULL;
+ }
+ head->first_node = NULL;
+ head->last_node = NULL;
+ head->size = 0;
+ return head;
+}
+
+/* bsearch and similar */
+int compare_data_fields(const void *a, const void *b) {
+ return memcmp(((struct data_field*)a)->data,
+ ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE);
+}
+
+
+inline bool search_data_in_array(struct data_array *array, unsigned char *data) {
+ struct data_field *df, tmp_df;
+ memcpy(tmp_df.data, data, FULL_DATABLOCK_SIZE);
+ /* Currently not working */
+ df = bsearch(&tmp_df, array->field,
+ array->array_length, sizeof(struct data_field),
+ compare_data_fields);
+ if (df != NULL) {
+ return true;
+ }
+ return false;
+}
+
+
+inline void destroy_collection(struct list_collection *c) {
+ int i;
+ if (c == NULL) return;
+ for (i=0; i<256; i++) {
+ destroy_list(c->heads[i]);
+ }
+ free(c);
+}
+
+
+inline void destroy_list(struct list_head *head) {
+ struct list_node *ptr=NULL;
+ if (head == NULL) return;
+ while (head->first_node != NULL) {
+ ptr = head->first_node;
+ head->first_node = head->first_node->next;
+ if (ptr->field != NULL)
+ free(ptr->field);
+ free(ptr);
+ }
+ free(head);
+ head = NULL;
+}
+
+inline void destroy_data_array(struct data_array *da) {
+ size_t i=0;
+ if (da == NULL) return;
+ for (;i<da->array_length; i++) {
+ if (da->field[i] != NULL) {
+ free(da->field[i]);
+ }
+ }
+ free(da->field);
+ free(da);
+}
+
+inline struct list_node *create_list_node(unsigned char *data) {
+ struct list_node *node;
+ if (data == NULL) return NULL;
+ if ((node=calloc(1,sizeof(struct list_node))) == NULL) {
+ LOGERR("ERROR: Failed to allocate a few bytes for new list_node\n");
+ return NULL;
+ }
+ if ((node->field=calloc(1,sizeof(struct data_field))) == NULL) {
+ LOGERR("ERROR: Failed to allocate a few bytes for new data_field\n");
+ free(node);
+ return NULL;
+ }
+ node->next = NULL;
+ node->field->active = true;
+ memcpy(node->field->data, data, FULL_DATABLOCK_SIZE);
+ return node;
+}
+
+
+/* If the input data's already sorted best performance, worst case (O^n)/256 if it's reverse sorted. */
+inline bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data) {
+ struct list_node *node_ptr = NULL, *tmp_node = NULL;
+ struct list_head *head_ptr = NULL;
+ int cmp = 0;
+#ifdef DEBUGBUILD
+ static size_t fcall_count = 0;
+#endif
+
+ if (collection == NULL || data == NULL) {
+ DBGTRC("ERROR: collection set%s / data set%s\n",
+ ((collection==NULL) ? " NULL" : ""),
+ ((data==NULL) ? " NULL" : ""));
+ return false;
+ }
+
+#ifdef DEBUGBUILD
+ DBGTRC("DEBUG: fcall_count = %lu / data[0]=0x%02X / decimal: data[0]=%3u\n",
+ ++fcall_count, data[0], (unsigned int)data[0]);
+#endif
+
+ head_ptr = collection->heads[data[0]];
+ if (head_ptr->first_node == NULL) {
+ DBGTRC("DEBUG: First node for list %d\n", data[0]);
+ if ((tmp_node = create_list_node(data)) == NULL) {
+ return false;
+ }
+ head_ptr->first_node = tmp_node;
+ head_ptr->last_node = tmp_node;
+ head_ptr->size = 1;
+ return true;
+ }
+
+ cmp = memcmp(data, head_ptr->last_node->field->data, FULL_DATABLOCK_SIZE);
+ if (cmp == 0) {
+ /* duplicate, skip */
+ return true;
+ }
+
+ /* Append to list */
+ if (cmp > 0) {
+ if ((tmp_node = create_list_node(data)) == NULL) {
+ return false;
+ }
+ head_ptr->last_node->next = tmp_node;
+ head_ptr->last_node = tmp_node;
+ head_ptr->size++;
+ return true;
+ }
+
+ node_ptr = head_ptr->first_node;
+ cmp = memcmp(data, node_ptr->field->data, FULL_DATABLOCK_SIZE);
+ if (cmp == 0) { return true; }
+ /* First to enter? Prepend to list*/
+ if (cmp < 0) {
+ if ((tmp_node = create_list_node(data)) == NULL) {
+ return false;
+ }
+ tmp_node->next = head_ptr->first_node;
+ head_ptr->first_node = tmp_node;
+ head_ptr->size++;
+ return true;
+ }
+
+ /* Sloooow part... */
+ while (node_ptr->next != NULL) {
+ /* check next in list */
+ cmp = memcmp(data, node_ptr->next->field->data, FULL_DATABLOCK_SIZE);
+ if (cmp == 0) { return true; }
+ /* insert by prepend */
+ if (cmp > 0) {
+ if ((tmp_node = create_list_node(data)) == NULL) {
+ return false;
+ }
+ tmp_node->next = node_ptr->next;
+ node_ptr->next = tmp_node;
+ head_ptr->size++;
+ return true;
+ }
+ /* continue */
+ node_ptr = node_ptr->next;
+ }
+ LOGERR("ERROR: Could not insert dataset into list, missed possible situation...\n");
+ return false;
+}
+
+
+inline struct data_array *convert_collection_to_array(struct list_collection *collection) {
+ int i;
+ size_t s;
+ struct data_array *da = NULL;
+ struct list_node *ptr, *tmp;
+
+ if (collection == NULL) {
+ LOGERR("ERROR: No collection given.\n");
+ return NULL;
+ }
+
+ s = 0;
+ for (i=0; i<256; i++) {
+#ifdef DEBUGBUILD
+ if (collection->heads[i]->size > 0) {
+ DBGTRC("DEBUG: collections->heads[%d]->size = %lu\n",
+ i, collection->heads[i]->size);
+ }
+#endif
+ s += collection->heads[i]->size;
+ DBGTRC("DEBUG: collection->heads[%d]->size = %lu\n", i, collection->heads[i]->size);
+ }
+ DBGTRC("DEBUG: Total size %lu\n", s);
+
+ if (s == 0) {
+ LOGERR("ERROR: Empty collection.");
+ return NULL;
+ }
+
+ if ((da = calloc(1, sizeof(struct data_array))) == NULL) {
+ LOGERR("ERROR: Failed to allocate %lu bytes...\n", sizeof(struct data_array));
+ return NULL;
+ }
+
+ da->array_length = s;
+ if ((da->field=calloc(da->array_length, sizeof(struct data_field*))) == NULL) {
+ free(da);
+ LOGERR("ERROR: Failed to allocate memory for metadata\n");
+ return NULL;
+ }
+
+ s = 0;
+ for (i=0; i<256; i++) {
+ DBGTRC("DEBUG: i=%d / s=%lu\n", i, s);
+ ptr = collection->heads[i]->first_node;
+ da->entry_points[i] = s;
+ da->opt_entry_points[i] = s;
+#ifdef DEBUGBUILD
+ if (ptr != NULL) {
+ DBGTRC("DEBUG: list %d in collection, claims size %lu\n", i,
+ collection->heads[i]->size);
+ }
+#endif
+ while (ptr != NULL) {
+ DBGTRC("DEBUG: Setting field %lu to %s\n", s,
+ (ptr->field == NULL) ? "NULL" : "valid value");
+ da->field[s] = ptr->field;
+ s += 1; /* field[s++] failed... compiler optimization? */
+ tmp = ptr->next;
+ ptr->next = NULL;
+ free(ptr);
+ ptr = tmp;
+ }
+ collection->heads[i]->first_node = NULL;
+ collection->heads[i]->last_node = NULL;
+ collection->heads[i]->size = 0;
+ }
+
+ destroy_collection(collection);
+
+ return da;
+}
+
+
+/* This is acceptable if the largest can be hold in memory, but not the other way */
+inline void apply_filter_to_array(struct data_array *array, unsigned char *data) {
+ size_t pos=0, limit = 0;
+ int cmp;
+
+
+ if (array==NULL || data==NULL) {
+ return;
+ }
+
+ /* Manual search with entry_points */
+ if (data[0] == 255) {
+ limit = array->array_length;
+ } else {
+ limit = array->entry_points[data[0]+1];
+ }
+
+ pos=array->opt_entry_points[data[0]];
+ DBGTRC("DEBUG: opt_entry_point=%-8lu / limit=%-8lu\n", pos, limit);
+ DBGTRC("DEBUG: data[0]=0x%02X / decimal: data[0]=%3u\n",
+ data[0], (unsigned int)data[0]);
+ /* TODO: Evaluate whether this loop should be handlued with openmp */
+ for (; pos<limit; pos++) {
+ DBGTRC("DEBUG: pos=%lu\n", pos);
+ if (array->field[pos]->active) {
+ /* perf optimization, single byte faster than memcmp on 20 bytes */
+ if (data[1] < array->field[pos]->data[1]) {
+ continue;
+ }
+ cmp = memcmp(data, array->field[pos]->data, FULL_DATABLOCK_SIZE);
+ DBGTRC("cmp: %d\n", cmp);
+ if (cmp == 0) {
+ array->field[pos]->active = false;
+ if (pos == array->opt_entry_points[data[0]]) {
+ array->opt_entry_points[data[0]]++;
+ }
+ return;
+ }
+#if 0
+ else if (data[1] > array->field[pos]->data[1]) {
+ return;
+ }
+#else
+ /* Currently I don't trust this memcmp result 100%. testdata created in shell
+ * for i in $(seq 1 20000); do echo "foo_$i" | sha1sum | awk ' { print $1; } ' ; done
+ * picking a random subset from there to be used as filters lead to sometimes leaving
+ * the loop too early with this else... Comparing only the second byte may is also too
+ * vague with really large datasets */
+ else if (cmp < 0) {
+ /* array content is sorted, so abort once the possible match has been passed */
+ return;
+ }
+#endif
+ }
+ }
+}
+
+
+struct list_collection *import_file_into_collection(char *filename) {
+ FILE *fd = NULL;
+ char line[LINELENGTH];
+ unsigned char data[FULL_DATABLOCK_SIZE];
+ size_t line_nr = 0;
+ struct list_collection *collection = NULL;
+
+ if (filename == NULL || filename[0] == '\0') {
+ LOGERR("ERROR: no filename given.\n");
+ return NULL;
+ }
+
+ if ((collection = create_collection()) == NULL) {
+ LOGERR("ERROR: Failed to allocate a few bytes for collection.\n");
+ return NULL;
+ }
+
+ if ((fd = fopen(filename, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ filename, strerror(errno), errno);
+ destroy_collection(collection);
+ return NULL;
+ }
+
+ while (fgets(line, LINELENGTH, fd) != NULL) {
+ line_nr++;
+ DBGTRC("DEBUG: line %lu = %s", line_nr, line);
+ if (convert_line(line)<0) {
+ LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr);
+ destroy_collection(collection);
+ fclose(fd);
+ return NULL;
+ }
+
+ if (convert_to_binary(line, data) == NULL) {
+ destroy_collection(collection);
+ fclose(fd);
+ LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr);
+ return NULL;
+ }
+
+ if (!insert_into_sorted_collection(collection, data)) {
+ destroy_collection(collection);
+ fclose(fd);
+ LOGERR("ERROR: Failed to insert line %lu in binary form into collection.\n", line_nr);
+ return NULL;
+ }
+ }
+
+ DBGTRC("DEBUG: lines read %lu\n", line_nr);
+ fclose(fd);
+
+ return collection;
+}
+
+
+bool run_filter_file_on_array(struct data_array *da, char *filename) {
+ FILE *fd = NULL;
+ size_t line_nr = 0;
+ char line[LINELENGTH];
+ unsigned char data[FULL_DATABLOCK_SIZE];
+
+ if (da == NULL || filename == NULL || filename[0] == '\0') {
+ LOGERR("ERROR: compromised arguments at call.\n");
+ return false;
+ }
+ if ((fd=fopen(filename, "r")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ filename, strerror(errno), errno);
+ return false;
+ }
+
+ /* Be nice and skip and not abort */
+ while (fgets(line, LINELENGTH, fd) != NULL) {
+ line_nr++;
+ DBGTRC("READ LINE %lu: %s", line_nr, line);
+ if (convert_line(line)<0) {
+ LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr);
+ continue;
+ }
+ if (convert_to_binary(line, data) == NULL) {
+ LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr);
+ continue;
+ }
+ apply_filter_to_array(da, (unsigned char*)data);
+ }
+
+ fclose(fd);
+
+ return true;
+}
+
+
+void fprint_array(FILE *fd, struct data_array *da) {
+ FILE *fdout = stdout;
+ char *plaintext = NULL;
+ size_t pos;
+ if (da == NULL) { return; }
+ if (fd != NULL) { fdout = fd; }
+ DBGTRC("DEBUG: da->array_length = %lu\n", da->array_length);
+ for (pos=0; pos < da->array_length; pos++) {
+ DBGTRC("DEBUG: pos %lu --- %s\n", pos, (da->field[pos] == NULL)? "NULL" : "element exists");
+ if (da->field[pos]->active) {
+ plaintext = convert_from_binary(da->field[pos]->data, FULL_DATABLOCK_SIZE, plaintext);
+ fputs(plaintext, fdout);
+ fputc('\n', fdout);
+ }
+ }
+ if (plaintext != NULL) { free(plaintext); };
+}
+
+/* To be used on a new array */
+bool verify_data_array(struct data_array *da) {
+ bool res = true;
+ size_t p = 0;
+ int repeated_error = 0;
+
+ if (da == NULL) {
+ LOGERR("No array given...\n");
+ return false;
+ }
+
+ /* Check entry points */
+ for (p=0; p<256; p++) {
+ if (da->entry_points[p] != da->opt_entry_points[p]) {
+ LOGERR("MISMATCH entry_points[%lu] <-> opt_entry_points[%lu]\n", p, p);
+ res = false;
+ }
+ }
+
+ for (p=0; p<da->array_length; p++) {
+ if (da->field[p] == NULL) {
+ LOGERR("ERROR: field %lu of %lu is NULL.\n", p, da->array_length);
+ res = false;
+ repeated_error++;
+ if (repeated_error > 5)
+ return res;
+ }
+ }
+
+ return res;
+}
+
+
+bool verify_list_collection(struct list_collection *lc) {
+ bool res = true;
+ int i;
+ size_t count;
+ struct list_node *ptr;
+
+ for (i=0; i<256; i++) {
+ count = 0;
+ ptr = lc->heads[i]->first_node;
+
+ while (ptr != NULL) {
+ count++;
+ ptr = ptr->next;
+ }
+
+ LOGERR("DEBUG: list %3d/0x%02X: size %lu, counted %lu => does%s match.\n",
+ i, i, lc->heads[i]->size, count,
+ (lc->heads[i]->size != count)?" NOT":"");
+ if (lc->heads[i]->size != count) {
+ res = false;
+ }
+ }
+
+ return res;
+}
+
+
+#ifdef DEBUGBUILD
+void print_collection_state(struct list_collection *coll) {
+ int i = 0;
+ struct list_head *h = NULL;
+ struct list_node *ptr = NULL;
+
+ if (coll == NULL) {
+ LOGERR("ERROR: No collection given\n");
+ return;
+ }
+
+ for (i=0; i<256; i++) {
+ h = coll->heads[i];
+ ptr = h->first_node;
+ if (h->size == 0 || ptr == NULL) {
+ if (!(h->size == 0 && ptr == NULL)) {
+ DBGTRC("DEBUG: ERROR item %3d not matching size %lu, first_node %p\n", i, h->size, h->first_node);
+ }
+ continue;
+ }
+ DBGTRC("DEBUG: List %3d, size %lu, first_node %p\n", i, h->size, h->first_node);
+ }
+
+}
+#endif
+
+int main(int argc, char **argv) {
+ FILE *output = NULL;
+ int i = 0;
+ size_t s = 0;
+ struct list_collection *collection;
+ struct data_array *array;
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: %s output raw_data filter_files...\n", argv[0]);
+ return EXIT_FAILURE;
+ }
+
+ s = strlen(argv[1]);
+ if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) {
+ output = stdout;
+ } else {
+ if ((output=fopen(argv[1], "w")) == NULL) {
+ LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n",
+ argv[1], strerror(errno), errno);
+ return EXIT_FAILURE;
+ }
+ }
+
+ LOGERR("IMPORT FILE %s\n", argv[2]);
+ collection = import_file_into_collection(argv[2]);
+ if (collection == NULL) {
+ return EXIT_FAILURE;
+ }
+/* TODO: Hide this behind some option
+ if ( ! verify_list_collection(collection) ) {
+ return EXIT_FAILURE;
+ }
+*/
+ array = convert_collection_to_array(collection);
+ if (array == NULL) {
+ return EXIT_FAILURE;
+ }
+ collection = NULL;
+
+/* TODO: Hide this behind an option
+ if ( ! verify_data_array(array) ) {
+ return EXIT_FAILURE;
+ }
+*/
+
+ for (i=3; i<argc; i++) {
+ LOGERR("APPLY FILTER FILE %s\n", argv[i]);
+ if (!run_filter_file_on_array(array, argv[i])) {
+ LOGERR("ERROR: Aborting.\n");
+ return EXIT_FAILURE;
+ }
+ }
+ DBGTRC("DEBUG: Filter applied, final result:\n");
+ fprint_array(output, array);
+ fflush(output);
+ fclose(output);
+
+ destroy_data_array(array);
+
+ return EXIT_SUCCESS;
+}
+