/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * Compare two files with a single SHA1 hash per line * useful when comparing filesystem content: * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt * awk ... * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include #include "output.h" #include "hex_conversion.h" #ifndef LINELENGTH #define LINELENGTH 1024 #endif /* 8 bytes pointer to a smaller char array? As there are strict conditions in * regards to the data which will be processed work with fix size arrays and * do math hard coded at compile time. */ #define NON_CONVERTED_SIZE 40 #define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) struct data_field { unsigned char data[FULL_DATABLOCK_SIZE]; /* keep this as first element for bsearch */ bool active; }; struct list_node { struct list_node *next; struct data_field *field; }; struct list_head { struct list_node *first_node; struct list_node *last_node; size_t size; }; struct list_collection { struct list_head *heads[256]; }; struct data_array { struct data_field **field; size_t entry_points[256]; /* keep constant */ size_t opt_entry_points[256]; /* increase if referred target is deactivated */ size_t array_length; }; void apply_filter_to_array(struct data_array *array, unsigned char *data); int compare_data_fields(const void *a, const void *b); struct data_array *convert_collection_to_array(struct list_collection *collection); struct list_collection *create_collection(); struct list_head *create_list_head(); struct list_node *create_list_node(unsigned char *data); void destroy_collection(struct list_collection *c); void destroy_data_array(struct data_array *da); void destroy_list(struct list_head *head); void fprint_array(FILE *fd, struct data_array *da); struct list_collection *import_file_into_collection(char *filename); bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data); bool run_filter_file_on_array(struct data_array *da, char *filename); bool search_data_in_array(struct data_array *da, unsigned char *data); bool verify_data_array(struct data_array *da); bool verify_list_collection(struct list_collection *lc); #ifdef DEBUGBUILD void print_collection_state(struct list_collection *coll); #endif /* === IMPLEMENTATION === */ inline struct list_collection *create_collection() { int i; struct list_collection *c; if ((c=calloc(1, sizeof(struct list_collection))) == NULL) { return NULL; } for (i=0; i<256; i++) { c->heads[i] = create_list_head(); if (c->heads[i] == NULL) { break; } } DBGTRC("DEBUG: %d lists initialized%s\n", i, (i<256)?", aborting as not all 256 where" : ""); return c; } inline struct list_head *create_list_head() { struct list_head *head = calloc(1, sizeof(struct list_head)); if (head == NULL) { /* If the following masked fprintf calls fail there was a chance */ LOGERR("ERROR: Failed to allocate memory for list_head: "); fputs(strerror(errno), stderr); fputc('\n', stderr); return NULL; } head->first_node = NULL; head->last_node = NULL; head->size = 0; return head; } /* bsearch and similar */ int compare_data_fields(const void *a, const void *b) { return memcmp(((struct data_field*)a)->data, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } inline bool search_data_in_array(struct data_array *array, unsigned char *data) { struct data_field *df, tmp_df; memcpy(tmp_df.data, data, FULL_DATABLOCK_SIZE); /* Currently not working */ df = bsearch(&tmp_df, array->field, array->array_length, sizeof(struct data_field), compare_data_fields); if (df != NULL) { return true; } return false; } inline void destroy_collection(struct list_collection *c) { int i; if (c == NULL) return; for (i=0; i<256; i++) { destroy_list(c->heads[i]); } free(c); } inline void destroy_list(struct list_head *head) { struct list_node *ptr=NULL; if (head == NULL) return; while (head->first_node != NULL) { ptr = head->first_node; head->first_node = head->first_node->next; if (ptr->field != NULL) free(ptr->field); free(ptr); } free(head); head = NULL; } inline void destroy_data_array(struct data_array *da) { size_t i=0; if (da == NULL) return; for (;iarray_length; i++) { if (da->field[i] != NULL) { free(da->field[i]); } } free(da->field); free(da); } inline struct list_node *create_list_node(unsigned char *data) { struct list_node *node; if (data == NULL) return NULL; if ((node=calloc(1,sizeof(struct list_node))) == NULL) { LOGERR("ERROR: Failed to allocate a few bytes for new list_node\n"); return NULL; } if ((node->field=calloc(1,sizeof(struct data_field))) == NULL) { LOGERR("ERROR: Failed to allocate a few bytes for new data_field\n"); free(node); return NULL; } node->next = NULL; node->field->active = true; memcpy(node->field->data, data, FULL_DATABLOCK_SIZE); return node; } /* If the input data's already sorted best performance, worst case (O^n)/256 if it's reverse sorted. */ inline bool insert_into_sorted_collection(struct list_collection *collection, unsigned char *data) { struct list_node *node_ptr = NULL, *tmp_node = NULL; struct list_head *head_ptr = NULL; int cmp = 0; #ifdef DEBUGBUILD static size_t fcall_count = 0; #endif if (collection == NULL || data == NULL) { DBGTRC("ERROR: collection set%s / data set%s\n", ((collection==NULL) ? " NULL" : ""), ((data==NULL) ? " NULL" : "")); return false; } #ifdef DEBUGBUILD DBGTRC("DEBUG: fcall_count = %lu / data[0]=0x%02X / decimal: data[0]=%3u\n", ++fcall_count, data[0], (unsigned int)data[0]); #endif head_ptr = collection->heads[data[0]]; if (head_ptr->first_node == NULL) { DBGTRC("DEBUG: First node for list %d\n", data[0]); if ((tmp_node = create_list_node(data)) == NULL) { return false; } head_ptr->first_node = tmp_node; head_ptr->last_node = tmp_node; head_ptr->size = 1; return true; } cmp = memcmp(data, head_ptr->last_node->field->data, FULL_DATABLOCK_SIZE); if (cmp == 0) { /* duplicate, skip */ return true; } /* Append to list */ if (cmp > 0) { if ((tmp_node = create_list_node(data)) == NULL) { return false; } head_ptr->last_node->next = tmp_node; head_ptr->last_node = tmp_node; head_ptr->size++; return true; } node_ptr = head_ptr->first_node; cmp = memcmp(data, node_ptr->field->data, FULL_DATABLOCK_SIZE); if (cmp == 0) { return true; } /* First to enter? Prepend to list*/ if (cmp < 0) { if ((tmp_node = create_list_node(data)) == NULL) { return false; } tmp_node->next = head_ptr->first_node; head_ptr->first_node = tmp_node; head_ptr->size++; return true; } /* Sloooow part... */ while (node_ptr->next != NULL) { /* check next in list */ cmp = memcmp(data, node_ptr->next->field->data, FULL_DATABLOCK_SIZE); if (cmp == 0) { return true; } /* insert by prepend */ if (cmp > 0) { if ((tmp_node = create_list_node(data)) == NULL) { return false; } tmp_node->next = node_ptr->next; node_ptr->next = tmp_node; head_ptr->size++; return true; } /* continue */ node_ptr = node_ptr->next; } LOGERR("ERROR: Could not insert dataset into list, missed possible situation...\n"); return false; } inline struct data_array *convert_collection_to_array(struct list_collection *collection) { int i; size_t s; struct data_array *da = NULL; struct list_node *ptr, *tmp; if (collection == NULL) { LOGERR("ERROR: No collection given.\n"); return NULL; } s = 0; for (i=0; i<256; i++) { #ifdef DEBUGBUILD if (collection->heads[i]->size > 0) { DBGTRC("DEBUG: collections->heads[%d]->size = %lu\n", i, collection->heads[i]->size); } #endif s += collection->heads[i]->size; DBGTRC("DEBUG: collection->heads[%d]->size = %lu\n", i, collection->heads[i]->size); } DBGTRC("DEBUG: Total size %lu\n", s); if (s == 0) { LOGERR("ERROR: Empty collection."); return NULL; } if ((da = calloc(1, sizeof(struct data_array))) == NULL) { LOGERR("ERROR: Failed to allocate %lu bytes...\n", sizeof(struct data_array)); return NULL; } da->array_length = s; if ((da->field=calloc(da->array_length, sizeof(struct data_field*))) == NULL) { free(da); LOGERR("ERROR: Failed to allocate memory for metadata\n"); return NULL; } s = 0; for (i=0; i<256; i++) { DBGTRC("DEBUG: i=%d / s=%lu\n", i, s); ptr = collection->heads[i]->first_node; da->entry_points[i] = s; da->opt_entry_points[i] = s; #ifdef DEBUGBUILD if (ptr != NULL) { DBGTRC("DEBUG: list %d in collection, claims size %lu\n", i, collection->heads[i]->size); } #endif while (ptr != NULL) { DBGTRC("DEBUG: Setting field %lu to %s\n", s, (ptr->field == NULL) ? "NULL" : "valid value"); da->field[s] = ptr->field; s += 1; /* field[s++] failed... compiler optimization? */ tmp = ptr->next; ptr->next = NULL; free(ptr); ptr = tmp; } collection->heads[i]->first_node = NULL; collection->heads[i]->last_node = NULL; collection->heads[i]->size = 0; } destroy_collection(collection); return da; } /* This is acceptable if the largest can be hold in memory, but not the other way */ inline void apply_filter_to_array(struct data_array *array, unsigned char *data) { size_t pos=0, limit = 0; int cmp; if (array==NULL || data==NULL) { return; } /* Manual search with entry_points */ if (data[0] == 255) { limit = array->array_length; } else { limit = array->entry_points[data[0]+1]; } pos=array->opt_entry_points[data[0]]; DBGTRC("DEBUG: opt_entry_point=%-8lu / limit=%-8lu\n", pos, limit); DBGTRC("DEBUG: data[0]=0x%02X / decimal: data[0]=%3u\n", data[0], (unsigned int)data[0]); /* TODO: Evaluate whether this loop should be handlued with openmp */ for (; posfield[pos]->active) { /* perf optimization, single byte faster than memcmp on 20 bytes */ if (data[1] < array->field[pos]->data[1]) { continue; } cmp = memcmp(data, array->field[pos]->data, FULL_DATABLOCK_SIZE); DBGTRC("cmp: %d\n", cmp); if (cmp == 0) { array->field[pos]->active = false; if (pos == array->opt_entry_points[data[0]]) { array->opt_entry_points[data[0]]++; } return; } #if 1 else if (data[1] > array->field[pos]->data[1]) { return; } #else /* Currently I don't trust this memcmp result 100%. testdata created in shell * for i in $(seq 1 20000); do echo "foo_$i" | sha1sum | awk ' { print $1; } ' ; done * picking a random subset from there to be used as filters lead to sometimes leaving * the loop too early with this else... Comparing only the second byte may is also too * vague with really large datasets */ else if (cmp < 0) { /* array content is sorted, so abort once the possible match has been passed */ return; } #endif } } } struct list_collection *import_file_into_collection(char *filename) { FILE *fd = NULL; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; size_t line_nr = 0; struct list_collection *collection = NULL; if (filename == NULL || filename[0] == '\0') { LOGERR("ERROR: no filename given.\n"); return NULL; } if ((collection = create_collection()) == NULL) { LOGERR("ERROR: Failed to allocate a few bytes for collection.\n"); return NULL; } if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); destroy_collection(collection); return NULL; } while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; DBGTRC("DEBUG: line %lu = %s", line_nr, line); if (convert_line(line)<0) { LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); destroy_collection(collection); fclose(fd); return NULL; } if (convert_to_binary(line, data) == NULL) { destroy_collection(collection); fclose(fd); LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); return NULL; } if (!insert_into_sorted_collection(collection, data)) { destroy_collection(collection); fclose(fd); LOGERR("ERROR: Failed to insert line %lu in binary form into collection.\n", line_nr); return NULL; } } DBGTRC("DEBUG: lines read %lu\n", line_nr); fclose(fd); return collection; } bool run_filter_file_on_array(struct data_array *da, char *filename) { FILE *fd = NULL; size_t line_nr = 0; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; if (da == NULL || filename == NULL || filename[0] == '\0') { LOGERR("ERROR: compromised arguments at call.\n"); return false; } if ((fd=fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return false; } /* Be nice and skip and not abort */ while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; DBGTRC("READ LINE %lu: %s", line_nr, line); if (convert_line(line)<0) { LOGERR("ERROR: Failed to prepare line %lu for conversion.\n", line_nr); continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); continue; } apply_filter_to_array(da, (unsigned char*)data); } fclose(fd); return true; } void fprint_array(FILE *fd, struct data_array *da) { FILE *fdout = stdout; char *plaintext = NULL; size_t pos; if (da == NULL) { return; } if (fd != NULL) { fdout = fd; } DBGTRC("DEBUG: da->array_length = %lu\n", da->array_length); for (pos=0; pos < da->array_length; pos++) { DBGTRC("DEBUG: pos %lu --- %s\n", pos, (da->field[pos] == NULL)? "NULL" : "element exists"); if (da->field[pos]->active) { plaintext = convert_from_binary(da->field[pos]->data, FULL_DATABLOCK_SIZE, plaintext); fputs(plaintext, fdout); fputc('\n', fdout); } } if (plaintext != NULL) { free(plaintext); }; } /* To be used on a new array */ bool verify_data_array(struct data_array *da) { bool res = true; size_t p = 0; int repeated_error = 0; if (da == NULL) { LOGERR("No array given...\n"); return false; } /* Check entry points */ for (p=0; p<256; p++) { if (da->entry_points[p] != da->opt_entry_points[p]) { LOGERR("MISMATCH entry_points[%lu] <-> opt_entry_points[%lu]\n", p, p); res = false; } } for (p=0; parray_length; p++) { if (da->field[p] == NULL) { LOGERR("ERROR: field %lu of %lu is NULL.\n", p, da->array_length); res = false; repeated_error++; if (repeated_error > 5) return res; } } return res; } bool verify_list_collection(struct list_collection *lc) { bool res = true; int i; size_t count; struct list_node *ptr; for (i=0; i<256; i++) { count = 0; ptr = lc->heads[i]->first_node; while (ptr != NULL) { count++; ptr = ptr->next; } LOGERR("DEBUG: list %3d/0x%02X: size %lu, counted %lu => does%s match.\n", i, i, lc->heads[i]->size, count, (lc->heads[i]->size != count)?" NOT":""); if (lc->heads[i]->size != count) { res = false; } } return res; } #ifdef DEBUGBUILD void print_collection_state(struct list_collection *coll) { int i = 0; struct list_head *h = NULL; struct list_node *ptr = NULL; if (coll == NULL) { LOGERR("ERROR: No collection given\n"); return; } for (i=0; i<256; i++) { h = coll->heads[i]; ptr = h->first_node; if (h->size == 0 || ptr == NULL) { if (!(h->size == 0 && ptr == NULL)) { DBGTRC("DEBUG: ERROR item %3d not matching size %lu, first_node %p\n", i, h->size, h->first_node); } continue; } DBGTRC("DEBUG: List %3d, size %lu, first_node %p\n", i, h->size, h->first_node); } } #endif int main(int argc, char **argv) { FILE *output = NULL; int i = 0; size_t s = 0; struct list_collection *collection; struct data_array *array; if (argc < 3) { fprintf(stderr, "Usage: %s output raw_data filter_files...\n", argv[0]); return EXIT_FAILURE; } s = strlen(argv[1]); if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { output = stdout; } else { if ((output=fopen(argv[1], "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", argv[1], strerror(errno), errno); return EXIT_FAILURE; } } LOGERR("IMPORT FILE %s\n", argv[2]); collection = import_file_into_collection(argv[2]); if (collection == NULL) { return EXIT_FAILURE; } /* TODO: Hide this behind some option if ( ! verify_list_collection(collection) ) { return EXIT_FAILURE; } */ array = convert_collection_to_array(collection); if (array == NULL) { return EXIT_FAILURE; } collection = NULL; /* TODO: Hide this behind an option if ( ! verify_data_array(array) ) { return EXIT_FAILURE; } */ for (i=3; i