/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * Compare two files with a single SHA1 hash per line * useful when comparing filesystem content: * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt * awk ... * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include #include "output.h" #include "hex_conversion.h" #include "time_utils.h" #ifndef LINELENGTH #define LINELENGTH 1024 #endif /* 8 bytes pointer to a smaller char array? As there are strict conditions in * regards to the data which will be processed work with fix size arrays and * do math hard coded at compile time. */ #define NON_CONVERTED_SIZE 40 #define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) struct data_field { unsigned char data[FULL_DATABLOCK_SIZE]; /* keep this as first element for bsearch */ #ifdef RAW_DATA_IN_MEMORY /* In order to work with streams, the filters and not the raw data is loaded, therefore * the field is not required. Keeping it as brain trigger in case this changes in the * future. */ bool active; #endif }; struct data_array { struct data_field *fields; size_t length; size_t entry_points[256]; /* performance optimization reduces search area*/ }; void apply_filter_to_array(struct data_field *array, unsigned char *data); int compare_data_fields(const void *a, const void *b); int compare_string_data_field(const void *a, const void *b); void fprint_array(FILE *fd, struct data_array *array); struct data_array *import_file_into_array(char *filename); bool is_valid_raw_data(char *s, bool cut); bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output); bool search_data_in_array(struct data_array *da, unsigned char *data); void sort_array(struct data_array *da); /* === IMPLEMENTATION === */ /* bsearch, qsort etc. */ int compare_data_fields(const void *a, const void *b) { return memcmp(((struct data_field*)a)->data, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } int compare_string_data_field(const void *a, const void *b) { return memcmp((unsigned char*)a, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } inline bool search_data_in_array(struct data_array *array, unsigned char *data) { struct data_field *df; size_t entry = 0, rest_length = array->length; if (array == NULL || data == NULL) { LOGERR("ERROR: Missing argument%s%s\n", (array==NULL) ? " array" : "", (data ==NULL) ? " data" : ""); return false; } entry = array->entry_points[data[0]]; if (data[0] == 255 || array->entry_points[data[0]+1] == 0) { rest_length = array->length - entry; } else { rest_length = array->entry_points[data[0]+1] - entry; } DBGTRC("DEBUG: data[0] = %d (dec) / entry = %lu / rest_length = %lu\n", data[0], entry, rest_length); df = bsearch(data, &(array->fields[entry]), rest_length, sizeof(struct data_field), compare_string_data_field); /* df = bsearch(data, array->fields, array->length, sizeof(struct data_field), compare_string_data_field); */ if (df != NULL) { return true; } return false; } inline void sort_array(struct data_array *da) { size_t i = 0, p = 0; struct data_field *f; if (da == NULL) return; qsort(da->fields, da->length, sizeof(struct data_field), compare_data_fields); /* set entry points */ f = da->fields; for (i=1,p=0; ilength; i++, p++) { if (f[p].data[0] != f[i].data[0]) { da->entry_points[f[i].data[0]] = i; if (f[i].data[0] == 255) { break; } } } } inline bool is_valid_raw_data(char *s, bool cut) { size_t length = 0; if (s == NULL) return false; length = strlen(s); if (length < NON_CONVERTED_SIZE) return false; if (s[NON_CONVERTED_SIZE] == '\n') { length = NON_CONVERTED_SIZE; } else { while (isspace(s[length-1])) { length--; } if (length != NON_CONVERTED_SIZE) return false; } if ( ! ishex_string(s,NON_CONVERTED_SIZE)) return false; if (cut) s[length] = '\0'; return true; } struct data_array *import_file_into_array(char *filename) { FILE *fd = NULL; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; size_t line_nr = 0, fpos = 0; struct data_array *array = NULL; if (filename == NULL || filename[0] == '\0') { LOGERR("ERROR: no filename given.\n"); return NULL; } if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return NULL; } /* Count the number of valid lines */ while (fgets(line, LINELENGTH, fd) != NULL) { if (is_valid_raw_data(line, false)) { line_nr++; } } fclose(fd); /* only complete close and open worked reliably, let's just hope nobody modified the file * TODO: compare stat() mtime? */ if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return NULL; } if ((array=calloc(1,sizeof(struct data_array))) == NULL) { fclose(fd); LOGERR("ERROR: Failed to allocate <32 bytes...\n"); return NULL; } array->length = line_nr; if ((array->fields=calloc(array->length,sizeof(struct data_field))) == NULL) { fclose(fd); LOGERR("ERROR: Failed to allocate %lu bytes...\n", (array->length * sizeof(struct data_field))); return NULL; } DBGTRC("DEBUG: array->fields[] size is %lu bytesi\n", (array->length * sizeof(struct data_field))); while ((fgets(line, LINELENGTH, fd) != NULL) && fpos < array->length) { if ( ! is_valid_raw_data(line, true)) { continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data. '%s'\n", line_nr, line); free(array->fields); free(array); fclose(fd); return NULL; } memcpy(array->fields[fpos].data, data, FULL_DATABLOCK_SIZE); #ifdef RAW_DATA_IN_MEMORY array->fields[fpos].active = true; #endif fpos++; } DBGTRC("DEBUG: valid lines read %lu\n", line_nr); fclose(fd); return array; } bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output) { FILE *fd = NULL, *fdout = output; size_t line_nr = 0; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; if (da == NULL || filename == NULL || filename[0] == '\0') { LOGERR("ERROR: compromised arguments at call.\n"); return false; } if (output == NULL) fdout = stdout; if ((fd=fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return false; } /* Be nice and skip and not abort */ while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; DBGTRC("READ LINE %lu: %s", line_nr, line); if ( ! is_valid_raw_data(line, true) ) { continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); continue; } if ( ! search_data_in_array(da, data) ) { convert_line(line); /* for us only a toupper() loop */ fputs(line, fdout); fputc('\n', fdout); } } fclose(fd); return true; } void fprint_array(FILE *fd, struct data_array *da) { FILE *fdout = stdout; char plaintext[NON_CONVERTED_SIZE+1]; size_t pos; if (da == NULL) { return; } if (fd != NULL) { fdout = fd; } plaintext[NON_CONVERTED_SIZE] = '\0'; for (pos=0; pos < da->length; pos++) { convert_from_binary(da->fields[pos].data, FULL_DATABLOCK_SIZE, plaintext); fputs(plaintext, fdout); fputc('\n', fdout); } } int main(int argc, char **argv) { FILE *output = NULL; int i = 3; size_t s = 0; struct data_array *array; struct timespec t1, t2, tdiff; if (argc < 3) { fprintf(stderr, "Usage: %s output filter_file data_file...\n\n", argv[0]); fprintf(stderr, "Loads filters into memory, does NOT remove duplicates\n"); return EXIT_FAILURE; } s = strlen(argv[1]); if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { output = stdout; } else { if ((output=fopen(argv[1], "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", argv[1], strerror(errno), errno); return EXIT_FAILURE; } } LOGERR("IMPORT FILE %s\n", argv[2]); array = import_file_into_array(argv[2]); if (array == NULL) { return EXIT_FAILURE; } LOGERR("run qsort on in-memory data\n"); TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2, sort_array(array); ); difftime_timespec(t1, t2, &tdiff); LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec ); for (i=3; ifields); free(array); return EXIT_SUCCESS; }