/* SPDX-License-Identifier: Apache-2.0 */ /* Copyright 2025 Thorsten Töpper * * Compare two files with a single SHA1 hash per line * useful when comparing filesystem content: * find fs_to_check -type f -exec sha1sum "{}" \; > output_1.txt * awk ... * * vim:ts=4:sw=4:expandtab */ #include #include #include #include #include #include #include #include #include #include #include #include #include "trace_macros.h" #include "hex_conversion.h" #include "time_utils.h" #ifndef LINELENGTH #define LINELENGTH 1024 #endif /* 8 bytes pointer to a smaller char array? As there are strict conditions in * regards to the data which will be processed work with fix size arrays and * do math hard coded at compile time. */ #define NON_CONVERTED_SIZE 40 #define FULL_DATABLOCK_SIZE (NON_CONVERTED_SIZE/2) struct data_field { unsigned char data[FULL_DATABLOCK_SIZE]; /* keep this as first element for bsearch */ #ifdef RAW_DATA_IN_MEMORY /* In order to work with streams, the filters and not the raw data is loaded, therefore * the field is not required. Keeping it as brain trigger in case this changes in the * future. */ bool active; #endif }; struct data_array { struct data_field *fields; size_t length; size_t entry_points[256]; /* performance optimization reduces search area*/ }; void apply_filter_to_array(struct data_field *array, unsigned char *data); int compare_data_fields(const void *a, const void *b); int compare_string_data_field(const void *a, const void *b); bool dump_array(char *target_filename, struct data_array *da); void fprint_array(FILE *fd, struct data_array *array); struct data_array *import_file_into_array(char *filename); bool is_valid_raw_data(char *s, bool cut); struct data_array *load_dumped_array(char *dump_filename, char *plain_filename); bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output); bool search_data_in_array(struct data_array *da, unsigned char *data); void set_entry_points(struct data_array *da); void sort_array(struct data_array *da); /* === IMPLEMENTATION === */ /* bsearch, qsort etc. */ int compare_data_fields(const void *a, const void *b) { return memcmp(((struct data_field*)a)->data, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } int compare_string_data_field(const void *a, const void *b) { return memcmp((unsigned char*)a, ((struct data_field*)b)->data, FULL_DATABLOCK_SIZE); } inline bool search_data_in_array(struct data_array *array, unsigned char *data) { struct data_field *df; size_t entry = 0, rest_length = array->length; if (array == NULL || data == NULL) { LOGERR("ERROR: Missing argument%s%s\n", (array==NULL) ? " array" : "", (data ==NULL) ? " data" : ""); return false; } entry = array->entry_points[data[0]]; if (data[0] == 255 || array->entry_points[data[0]+1] == 0) { rest_length = array->length - entry; } else { rest_length = array->entry_points[data[0]+1] - entry; } DBGTRC("DEBUG: data[0] = %d (dec) / entry = %lu / rest_length = %lu\n", data[0], entry, rest_length); df = bsearch(data, &(array->fields[entry]), rest_length, sizeof(struct data_field), compare_string_data_field); /* df = bsearch(data, array->fields, array->length, sizeof(struct data_field), compare_string_data_field); */ if (df != NULL) { return true; } return false; } inline void sort_array(struct data_array *da) { if (da == NULL) return; qsort(da->fields, da->length, sizeof(struct data_field), compare_data_fields); set_entry_points(da); } inline void set_entry_points(struct data_array *da) { size_t i = 0, p = 0; struct data_field *f = da->fields; for (i=1,p=0; ilength; i++, p++) { if (f[p].data[0] != f[i].data[0]) { da->entry_points[f[i].data[0]] = i; if (f[i].data[0] == 255) { break; } } } } inline bool is_valid_raw_data(char *s, bool cut) { size_t length = 0; if (s == NULL) return false; length = strlen(s); if (length < NON_CONVERTED_SIZE) return false; if (s[NON_CONVERTED_SIZE] == '\n') { length = NON_CONVERTED_SIZE; } else { while (isspace(s[length-1])) { length--; } if (length != NON_CONVERTED_SIZE) return false; } if ( ! ishex_string(s,NON_CONVERTED_SIZE)) return false; if (cut) s[length] = '\0'; return true; } /* Performance improvement dump the array into an binary file. * When loading it compare the mtime on the FS. As long as the binary * is in a more recent state import the data from there. Check in the * calling function whether the dump was reloaded, if so skip another * dump. * The dump is mapped via mmap into the memory and NOT directly loaded * with an fread loop or similar. So the program relies on the FS cache * when set_entry_points is called. */ inline bool dump_array(char *target_filename, struct data_array *da) { FILE *fd = NULL; size_t written = 0; if (target_filename == NULL || da == NULL || da->fields == NULL) { LOGERR("ERROR: target_filename %s / data_array %s / data_field %s\n", ((target_filename==NULL) ? "NULL" : target_filename), ((da==NULL) ? "NULL" : "given"), ((da->fields==NULL) ? "NULL" : "given")); return false; } if ((fd=fopen(target_filename, "w")) == NULL) { LOGERR("ERROR: Failed to open file %s to write binary: %s (errno %d)\n", target_filename, strerror(errno), errno); return false; } /* for performance no check. Instead afterwards a check whether the * file is as large as expected */ written = fwrite(da->fields, sizeof(struct data_field), da->length, fd); DBGTRC("DEBUG: Written %lu elements\n", written); fflush(fd); fclose(fd); if (written != da->length) { LOGERR("ERROR: Failed to write %lu elements, wrote %lu\n", da->length, written); if (unlink(target_filename) != 0) { LOGERR("ERROR: failed to remove file %s: %s (errno %d)\n", target_filename, strerror(errno), errno); } return false; } return true; } struct data_array *load_dumped_array(char *dump_filename, char *plain_filename) { int fdin=-1; size_t fsize = 0; struct data_array *da = NULL; struct stat stat_dump, stat_plain; if (dump_filename == NULL || plain_filename == NULL) { LOGERR("ERROR: dump_filename %s / plain_filename %s\n", ((dump_filename==NULL)?"NULL":dump_filename), ((plain_filename==NULL)?"NULL":plain_filename)); return NULL; } if (stat(dump_filename, &stat_dump) != 0) { LOGERR("ERROR: failed to get stat() data on %s: %s (errno %d)\n", dump_filename, strerror(errno), errno); return NULL; } if (stat(plain_filename, &stat_plain) != 0) { LOGERR("ERROR: failed to get stat() data on %s: %s (errno %d)\n", plain_filename, strerror(errno), errno); return NULL; } if (stat_plain.st_mtim.tv_sec >= stat_dump.st_mtim.tv_sec) { return NULL; } if ((da=calloc(1,sizeof(struct data_array))) == NULL) { LOGERR("ERROR: Failed to allocate a few bytes.\n"); return NULL; } if (stat_dump.st_size <= 0) { free(da); return NULL; } fsize = (size_t)stat_dump.st_size; da->length = fsize/sizeof(struct data_field); if ((fdin=open(dump_filename, O_RDONLY )) < 0) { LOGERR("ERROR: Failed to open file %s to read binary: %s (errno %d)\n", dump_filename, strerror(errno), errno); free(da); return NULL; } da->fields = mmap(0, fsize, PROT_READ, MAP_PRIVATE, fdin, 0); if (da->fields == MAP_FAILED) { LOGERR("ERROR: Failed to map file %s into memory: %s (errno %d)\n", dump_filename, strerror(errno), errno); close(fdin); free(da); return NULL; } set_entry_points(da); close(fdin); return da; } struct data_array *import_file_into_array(char *filename) { FILE *fd = NULL; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; size_t line_nr = 0, fpos = 0; struct data_array *array = NULL; if (filename == NULL || filename[0] == '\0') { LOGERR("ERROR: no filename given.\n"); return NULL; } if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return NULL; } /* Count the number of valid lines */ while (fgets(line, LINELENGTH, fd) != NULL) { if (is_valid_raw_data(line, false)) { line_nr++; } } fclose(fd); DBGTRC("DEBUG: valid lines %lu\n", line_nr); /* only complete close and open worked reliably, let's just hope nobody modified the file * TODO: compare stat() mtime? */ if ((fd = fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return NULL; } if ((array=calloc(1,sizeof(struct data_array))) == NULL) { fclose(fd); LOGERR("ERROR: Failed to allocate <32 bytes...\n"); return NULL; } array->length = line_nr; if ((array->fields=calloc(array->length,sizeof(struct data_field))) == NULL) { fclose(fd); LOGERR("ERROR: Failed to allocate %lu bytes...\n", (array->length * sizeof(struct data_field))); return NULL; } DBGTRC("DEBUG: array->fields[] size is %lu bytesi\n", (array->length * sizeof(struct data_field))); while ((fgets(line, LINELENGTH, fd) != NULL) && fpos < array->length) { if ( ! is_valid_raw_data(line, true)) { continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data. '%s'\n", line_nr, line); free(array->fields); free(array); fclose(fd); return NULL; } memcpy(array->fields[fpos].data, data, FULL_DATABLOCK_SIZE); #ifdef RAW_DATA_IN_MEMORY array->fields[fpos].active = true; #endif fpos++; } DBGTRC("DEBUG: valid lines read %lu\n", line_nr); fclose(fd); return array; } bool run_file_on_filter_array(struct data_array *da, char *filename, FILE *output) { FILE *fd = NULL, *fdout = output; size_t line_nr = 0; char line[LINELENGTH]; unsigned char data[FULL_DATABLOCK_SIZE]; if (da == NULL || filename == NULL || filename[0] == '\0') { LOGERR("ERROR: compromised arguments at call.\n"); return false; } if (output == NULL) fdout = stdout; if ((fd=fopen(filename, "r")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", filename, strerror(errno), errno); return false; } /* Be nice and skip and not abort */ while (fgets(line, LINELENGTH, fd) != NULL) { line_nr++; DBGTRC("READ LINE %lu: %s", line_nr, line); if ( ! is_valid_raw_data(line, true) ) { continue; } if (convert_to_binary(line, data) == NULL) { LOGERR("ERROR: Failed to convert line %lu into binary data.\n", line_nr); continue; } if ( ! search_data_in_array(da, data) ) { convert_line(line); /* for us only a toupper() loop */ fputs(line, fdout); fputc('\n', fdout); } } fclose(fd); return true; } void fprint_array(FILE *fd, struct data_array *da) { FILE *fdout = stdout; char plaintext[NON_CONVERTED_SIZE+1]; size_t pos; if (da == NULL) { return; } if (fd != NULL) { fdout = fd; } plaintext[NON_CONVERTED_SIZE] = '\0'; for (pos=0; pos < da->length; pos++) { convert_from_binary(da->fields[pos].data, FULL_DATABLOCK_SIZE, plaintext); fputs(plaintext, fdout); fputc('\n', fdout); } } int main(int argc, char **argv) { FILE *output = NULL; int first_data = 2, opt = 0, filter_index = 1, output_index = 0; size_t s = 0; bool work_with_dump = false, map_from_dump = false; struct data_array *array; struct timespec t1, t2, tdiff; struct stat stat_dump, stat_plain; char dump_fname[4096] = ""; if (argc < 3) { fprintf(stderr, "Usage: %s output filter_file data_file...\n\n", argv[0]); fprintf(stderr, "or %s -b output filter_file data_file...\n\n", argv[0]); fprintf(stderr, "Loads filters into memory, does NOT remove duplicates\n"); fprintf(stderr, "The -b argument loads and/or dumps to filter_file.dump\n" "in case the file is older than the dump.\n"); return EXIT_FAILURE; } memset(dump_fname, '\0', 4096); while ((opt = getopt(argc, argv, "b")) != -1) { switch (opt) { case 'b': work_with_dump = true; map_from_dump = true; DBGTRC("DEBUG: enabled mapping and dumping\n"); break; default: LOGERR("ERROR: Unknown option %c.\n", opt); exit(EXIT_FAILURE); }; } output_index += optind; filter_index += optind; first_data += optind; DBGTRC("DEBUG: filter index %d / i %d / optind %d / output_index %d\n", filter_index, first_data, optind, output_index); DBGTRC("DEBUG: output_file %s\n", argv[output_index]); DBGTRC("DEBUG: filter_file %s\n", argv[filter_index]); DBGTRC("DEBUG: first data %s\n", argv[first_data]); /* Memory dump and map check */ if (work_with_dump) { memcpy(dump_fname, argv[filter_index], strlen(argv[filter_index])); s = strlen(dump_fname); if (s > 0 && s < 4089) { dump_fname[s] = '.'; dump_fname[s+1] = 'd'; dump_fname[s+2] = 'u'; dump_fname[s+3] = 'm'; dump_fname[s+4] = 'p'; dump_fname[s+5] = '\0'; } else { work_with_dump = false; map_from_dump = false; } if (stat(dump_fname, &stat_dump) != 0) { /* First time there's no dump */ DBGTRC("DEBUG: deactivate mapping due to stat failed on dump '%s'\n", dump_fname); map_from_dump = false; } if (stat(argv[filter_index], &stat_plain) != 0) { LOGERR("ERROR: failed to get stat() data on %s: %s (errno %d)\n", argv[filter_index], strerror(errno), errno); exit(EXIT_FAILURE); } if (stat_plain.st_mtim.tv_sec >= stat_dump.st_mtim.tv_sec) { map_from_dump = false; } } s = strlen(argv[output_index]); if (s == 6 && (strncmp("stdout", argv[1], 6) == 0)) { output = stdout; work_with_dump = false; } else { if ((output=fopen(argv[output_index], "w")) == NULL) { LOGERR("ERROR: Failed to open file '%s': %s (errno %d)\n", argv[output_index], strerror(errno), errno); return EXIT_FAILURE; } } if (map_from_dump) { LOGERR("MAP from file %s\n", dump_fname); array = load_dumped_array(dump_fname, argv[filter_index]); if (array == NULL) { return EXIT_FAILURE; } } else { LOGERR("IMPORT FILE %s\n", argv[filter_index]); array = import_file_into_array(argv[filter_index]); if (array == NULL) { return EXIT_FAILURE; } LOGERR("run qsort on in-memory data\n"); TU_MEASURE_TIME( CLOCK_PROCESS_CPUTIME_ID, &t1, &t2, sort_array(array); ); difftime_timespec(t1, t2, &tdiff); LOGERR("Sorted in %lu s and %lu ns process CPU time\n", tdiff.tv_sec, tdiff.tv_nsec ); } if (work_with_dump && (stat_plain.st_mtim.tv_sec >= stat_dump.st_mtim.tv_sec)) { LOGERR("DUMP filter to %s\n", dump_fname); dump_array(dump_fname, array); } for (; first_datafields); } else { munmap(array->fields, (array->length*sizeof(struct data_field))); } free(array); return EXIT_SUCCESS; }